Skip to content

Commit

Permalink
Initial check-in of app-ads.txt parsing libraries.
Browse files Browse the repository at this point in the history
  • Loading branch information
cmlight committed Mar 5, 2019
1 parent 807c8e6 commit 9d48d68
Show file tree
Hide file tree
Showing 7 changed files with 734 additions and 1 deletion.
70 changes: 69 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1 +1,69 @@
# app-ads.txt-parser
# app-ads.txt URL parsing reference implementation

This repository contains a reference implementation of app-ads.txt parsing utilities, along with test cases for verifying compatibility. The packages are written in the Go programming language, but the code and unit tests can easily be ported to another language if needed.

Currently included are two packages:

* `appstoreparse`: an app store listing page HTML parser for extracting HTML `<meta>` tags containing app metadata
* `urlcanonical`: a URL canonicalization package for converting app developer URLs to the corresponding app-ads.txt paths to crawl

The repository also includes a crawl sample app written in Go, useful for testing the compatibility of an individual app store URL from the command line.

## Set up environment

If not already installed, download and install the Go distribution from https://golang.org/

## Running sample app

To run the crawl sample app against an internal sample web server simulating an app listing page HTML file (`sample_app_store.html`), specify the desired port using the `--sample_file_server_port` flag. The app will crawl the app store URL, output the parse results, and then immediately terminate.

```
cd examples/appadstxtcrawl
go run sample_app.go --app_store_url=http://localhost:8081/sample_app_store.html --sample_file_server_port=8081
```

Sample output:

```
Parsed metadata:
Developer URL: https://sample.path.to/page
Bundle ID: com.example.myapp
Store ID: SKU12345
Derived app-ads.txt URLs:
Registerable Domain URL: https://path.to/app-ads.txt
Subdomain URL: https://sample.path.to/app-ads.txt
```

The app can be run against any other app store URL. For example, the following command crawls an app listing in the Google Play Store.

```
cd examples/appadstxtcrawl
go run sample_app.go --app_store_url='https://play.google.com/store/apps/details?id=com.google.android.apps.maps'
```

Sample output:

```
Parsed metadata:
Developer URL: http://maps.google.com/about/
Bundle ID: com.google.android.apps.maps
Store ID: com.google.android.apps.maps
Derived app-ads.txt URLs:
Registerable Domain URL: http://google.com/app-ads.txt
Subdomain URL: http://maps.google.com/app-ads.txt
```

Note: if the app store URL does not provide the required HTML meta tags, the
app will display an empty result such as the following:

```
Parsed metadata:
Developer URL:
Bundle ID:
Store ID:
No developer URL found to parse.
```
87 changes: 87 additions & 0 deletions appstoreparse/parse_app_store_listing.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
package appstoreparse

/*
* Copyright 2018 IAB Tech Lab & OpenRTB Group
* Copyright 2018 Google LLC
*
* Author: Curtis Light, Google
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

import (
"fmt"
"strings"

"golang.org/x/net/html"
)

// AppStoreMetadata carries the three "appstore:" <meta> tag values that
// ParseAppStorePageHTML extracts from an app listing page. A field keeps its
// zero value (the empty string) when the corresponding tag is not found.
type AppStoreMetadata struct {
	// DeveloperURL is the content of the appstore:developer_url <meta> tag.
	DeveloperURL string
	// BundleID is the content of the appstore:bundle_id <meta> tag.
	BundleID string
	// StoreID is the content of the appstore:store_id <meta> tag.
	StoreID string
}

// ParseAppStorePageHTML parses htmlContent as an HTML document and returns
// the values of the "appstore:" <meta> tags it finds.
//
// Only <meta> elements that are immediate children of a <head> element are
// considered; the <body> subtree is never visited. When the same tag name
// occurs more than once, the last occurrence wins. A non-nil error is
// returned (with a nil result) only when the HTML parser itself fails.
func ParseAppStorePageHTML(htmlContent string) (*AppStoreMetadata, error) {
	root, err := html.Parse(strings.NewReader(htmlContent))
	if err != nil {
		return nil, fmt.Errorf("Error running HTML parser: %v", err)
	}

	metadata := &AppStoreMetadata{}

	// walk visits node and its descendants depth-first. insideHead reports
	// whether node's direct parent is a <head> element.
	var walk func(node *html.Node, insideHead bool)
	walk = func(node *html.Node, insideHead bool) {
		isElement := node.Type == html.ElementNode

		if isElement {
			switch node.Data {
			case "body":
				// Nothing below <body> is eligible, so prune the subtree.
				return
			case "meta":
				if insideHead {
					var name, content string
					for _, attr := range node.Attr {
						switch attr.Key {
						case "name":
							name = attr.Val
						case "content":
							content = attr.Val
						}
					}
					switch name {
					case "appstore:developer_url":
						metadata.DeveloperURL = content
					case "appstore:bundle_id":
						metadata.BundleID = content
					case "appstore:store_id":
						metadata.StoreID = content
					}
				}
			}
		}

		// Children are "inside head" only when this node is itself <head>.
		childrenInsideHead := isElement && node.Data == "head"
		for child := node.FirstChild; child != nil; child = child.NextSibling {
			walk(child, childrenInsideHead)
		}
	}

	walk(root, false)
	return metadata, nil
}
155 changes: 155 additions & 0 deletions appstoreparse/parse_app_store_listing_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
package appstoreparse

/*
* Copyright 2018 IAB Tech Lab & OpenRTB Group
* Copyright 2018 Google LLC
*
* Author: Curtis Light, Google
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

import (
"reflect"
"testing"
)

// validBasicHTML is a complete, valid HTML5 document whose <head> contains
// all three compliant "appstore:" <meta> tags.
const validBasicHTML = `
<!DOCTYPE html>
<html lang="en">
<head>
<title>
Example app listing
</title>
<meta charset="UTF-8">
<meta name="appstore:developer_url" content="https://www.path.to/page">
<meta name="appstore:bundle_id" content="com.example.myapp">
<meta name="appstore:store_id" content="SKU12345">
</head>
<body>
<h1>
Example app
</h1>
</body>
</html>`

// validBasicHTMLUppercase is the same document as validBasicHTML but with
// tag and attribute names written in uppercase; it confirms that parsing
// still works correctly in that case.
const validBasicHTMLUppercase = `
<!DOCTYPE HTML>
<HTML LANG="en">
<HEAD>
<TITLE>
Example app listing
</TITLE>
<META CHARSET="UTF-8">
<META NAME="appstore:developer_url" CONTENT="https://www.path.to/page">
<META NAME="appstore:bundle_id" CONTENT="com.example.myapp">
<META NAME="appstore:store_id" CONTENT="SKU12345">
</HEAD>
<BODY>
<H1>
Example app
</H1>
</BODY>
</HTML>`

// validTruncatedHTML is a valid HTML5 document with compliant meta tags that
// ends at the closing </head> tag, with no <body> and no closing </html>.
const validTruncatedHTML = `
<!DOCTYPE html>
<html lang="en">
<head>
<title>
Example app listing
</title>
<meta charset="UTF-8">
<meta name="appstore:developer_url" content="https://www.path.to/page">
<meta name="appstore:bundle_id" content="com.example.myapp">
<meta name="appstore:store_id" content="SKU12345">
</head>`

// metaInBodyNonCompliant places the "appstore:" <meta> tags inside <body>
// instead of <head>. The parser ignores them, because the tags must reside
// within the <head> element.
const metaInBodyNonCompliant = `
<!DOCTYPE html>
<html lang="en">
<head>
<title>
Example app listing
</title>
<meta charset="UTF-8">
</head>
<body>
<meta name="appstore:developer_url" content="https://www.path.to/page">
<meta name="appstore:bundle_id" content="com.example.myapp">
<meta name="appstore:store_id" content="SKU12345">
<h1>
Example app
</h1>
</body>
</html>`

// testScenarios is the table driving TestParseAppStorePageHTML. Each entry
// pairs an HTML document with the metadata the parser is expected to return.
var testScenarios = []struct {
	input        string
	wantMetadata AppStoreMetadata
}{
	{
		input: validBasicHTML,
		wantMetadata: AppStoreMetadata{
			DeveloperURL: "https://www.path.to/page",
			BundleID:     "com.example.myapp",
			StoreID:      "SKU12345",
		},
	},
	{
		input: validBasicHTMLUppercase,
		wantMetadata: AppStoreMetadata{
			DeveloperURL: "https://www.path.to/page",
			BundleID:     "com.example.myapp",
			StoreID:      "SKU12345",
		},
	},
	{
		input: validTruncatedHTML,
		wantMetadata: AppStoreMetadata{
			DeveloperURL: "https://www.path.to/page",
			BundleID:     "com.example.myapp",
			StoreID:      "SKU12345",
		},
	},
	{
		// Tags located in <body> must not be picked up.
		input:        metaInBodyNonCompliant,
		wantMetadata: AppStoreMetadata{},
	},
	{
		// Empty HTML document.
		input:        "",
		wantMetadata: AppStoreMetadata{},
	},
}

// TestParseAppStorePageHTML runs ParseAppStorePageHTML over every entry in
// testScenarios and checks that the parsed metadata equals the expected
// value. Each scenario runs as its own subtest, named after its input.
func TestParseAppStorePageHTML(t *testing.T) {
	for _, scenario := range testScenarios {
		t.Run(scenario.input, func(t *testing.T) {
			haveMetadata, err := ParseAppStorePageHTML(scenario.input)

			// Check the error before touching the result: the parser returns
			// a nil pointer alongside an error, so dereferencing first would
			// panic instead of reporting a test failure.
			if err != nil {
				t.Fatalf("received unexpected error: %v", err)
			}
			if haveMetadata == nil {
				t.Fatal("received nil metadata without an error")
			}

			if !reflect.DeepEqual(*haveMetadata, scenario.wantMetadata) {
				t.Errorf("have [%+v] want [%+v]", *haveMetadata, scenario.wantMetadata)
			}
		})
	}
}
Loading

0 comments on commit 9d48d68

Please sign in to comment.