Skip to content

Commit

Permalink
add: check if assets are truly files when extracting XML & JSON, if n…
Browse files Browse the repository at this point in the history
…ot then return as outlinks
  • Loading branch information
CorentinB committed Jan 29, 2025
1 parent 8b55ea8 commit bbef271
Show file tree
Hide file tree
Showing 10 changed files with 236 additions and 88 deletions.
35 changes: 21 additions & 14 deletions internal/pkg/postprocessor/assets.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@ import (
"github.com/internetarchive/Zeno/pkg/models"
)

func extractAssets(item *models.Item) (assets []*models.URL, err error) {
// extractAssets extracts assets from the item's body and returns them.
// It also potentially returns outlinks if the body contains URLs that are not assets.
func extractAssets(item *models.Item) (assets, outlinks []*models.URL, err error) {
var (
contentType = item.GetURL().GetResponse().Header.Get("Content-Type")
logger = log.NewFieldedLogger(&log.Fields{
Expand All @@ -25,57 +27,62 @@ func extractAssets(item *models.Item) (assets []*models.URL, err error) {
INAAssets, err := ina.ExtractMedias(item.GetURL())
if err != nil {
logger.Error("unable to extract medias from INA", "err", err.Error(), "item", item.GetShortID())
return assets, err
return assets, outlinks, err
}

HTMLAssets, err := extractor.HTMLAssets(item)
if err != nil {
logger.Error("unable to extract assets", "err", err.Error(), "item", item.GetShortID())
return assets, err
return assets, outlinks, err
}

assets = append(INAAssets, HTMLAssets...)
case truthsocial.NeedExtraction(item.GetURL()):
assets, err = truthsocial.ExtractAssets(item)
assets, outlinks, err = truthsocial.ExtractAssets(item)
if err != nil {
logger.Error("unable to extract assets from TruthSocial", "err", err.Error(), "item", item.GetShortID())
return assets, err
return assets, outlinks, err
}
case extractor.IsM3U8(item.GetURL()):
assets, err = extractor.M3U8(item.GetURL())
if err != nil {
logger.Error("unable to extract assets", "err", err.Error(), "item", item.GetShortID())
return assets, err
return assets, outlinks, err
}
case extractor.IsJSON(item.GetURL()):
assets, err = extractor.JSON(item.GetURL())
assets, outlinks, err = extractor.JSON(item.GetURL())
if err != nil {
logger.Error("unable to extract assets", "err", err.Error(), "item", item.GetShortID())
return assets, err
return assets, outlinks, err
}
case extractor.IsXML(item.GetURL()):
assets, err = extractor.XML(item.GetURL())
assets, outlinks, err = extractor.XML(item.GetURL())
if err != nil {
logger.Error("unable to extract assets", "err", err.Error(), "item", item.GetShortID())
return assets, err
return assets, outlinks, err
}
case extractor.IsHTML(item.GetURL()):
assets, err = extractor.HTMLAssets(item)
if err != nil {
logger.Error("unable to extract assets", "err", err.Error(), "item", item.GetShortID())
return assets, err
return assets, outlinks, err
}
default:
logger.Debug("no extractor used for page", "content-type", contentType, "item", item.GetShortID())
return assets, nil
return assets, outlinks, nil
}

// Set the hops level to the item's level
// For assets, set the hops level to the item's level
for _, asset := range assets {
asset.SetHops(item.GetURL().GetHops())
}

return assets, nil
// For outlinks, set the hops level to the item's level + 1
for _, outlink := range outlinks {
outlink.SetHops(item.GetURL().GetHops() + 1)
}

return assets, outlinks, nil
}

func shouldExtractAssets(item *models.Item) bool {
Expand Down
31 changes: 19 additions & 12 deletions internal/pkg/postprocessor/extractor/json.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,36 +2,43 @@ package extractor

import (
"encoding/json"
"net/url"

"github.com/ImVexed/fasturl"
"github.com/internetarchive/Zeno/pkg/models"
)

// IsJSON reports whether the Content-Type header of URL's response
// mentions "json" (checked through isContentType).
func IsJSON(URL *models.URL) bool {
	contentType := URL.GetResponse().Header.Get("Content-Type")

	return isContentType(contentType, "json")
}

func JSON(URL *models.URL) (assets []*models.URL, err error) {
func JSON(URL *models.URL) (assets, outlinks []*models.URL, err error) {
defer URL.RewindBody()

bodyBytes := make([]byte, URL.GetBody().Len())
_, err = URL.GetBody().Read(bodyBytes)
if err != nil {
return nil, err
return nil, nil, err
}

rawAssets, err := GetURLsFromJSON(bodyBytes)
rawURLs, err := GetURLsFromJSON(bodyBytes)
if err != nil {
return nil, err
return nil, nil, err
}

for _, rawAsset := range rawAssets {
assets = append(assets, &models.URL{
Raw: rawAsset,
})
// We only consider as assets the URLs in which we can find a file extension
for _, rawURL := range rawURLs {
if hasFileExtension(rawURL) {
assets = append(assets, &models.URL{
Raw: rawURL,
})
} else {
outlinks = append(outlinks, &models.URL{
Raw: rawURL,
})
}
}

return assets, err
return assets, outlinks, nil
}

func GetURLsFromJSON(body []byte) ([]string, error) {
Expand Down Expand Up @@ -65,6 +72,6 @@ func findURLs(data interface{}, links *[]string) {
}

func isValidURL(str string) bool {
u, err := url.Parse(str)
return err == nil && u.Scheme != "" && u.Host != ""
u, err := fasturl.ParseURL(str)
return err == nil && u.Host != ""
}
14 changes: 7 additions & 7 deletions internal/pkg/postprocessor/extractor/json_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,24 +74,24 @@ func TestJSON(t *testing.T) {
t.Errorf("ProcessBody() error = %v", err)
}

gotURLs, err := JSON(URL)
assets, _, err := JSON(URL)

if (err != nil) != tt.wantErr {
t.Errorf("JSON() error = %v, wantErr %v", err, tt.wantErr)
return
}

// Sort both slices before comparison
sortURLs(gotURLs)
sortURLs(assets)
sortURLs(tt.wantURLs)

if len(gotURLs) != len(tt.wantURLs) {
t.Fatalf("Expected %d URLs, got %d", len(tt.wantURLs), len(gotURLs))
if len(assets) != len(tt.wantURLs) {
t.Fatalf("Expected %d URLs, got %d", len(tt.wantURLs), len(assets))
}

for i := range gotURLs {
if gotURLs[i].Raw != tt.wantURLs[i].Raw {
t.Errorf("Expected URL %s, got %s", tt.wantURLs[i].Raw, gotURLs[i].Raw)
for i := range assets {
if assets[i].Raw != tt.wantURLs[i].Raw {
t.Errorf("Expected URL %s, got %s", tt.wantURLs[i].Raw, assets[i].Raw)
}
}
})
Expand Down
52 changes: 26 additions & 26 deletions internal/pkg/postprocessor/extractor/utils.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
package extractor

import (
"net/url"
"regexp"
"sort"
"strings"
Expand All @@ -17,41 +16,42 @@ var (
AssetsRegex = `(?i)\b(?:src|href)=["']([^"']+\.(?:css|js|png|jpg|jpeg|gif|svg|webp|woff|woff2|ttf|eot))["']`
)

// isContentType reports whether targetContentType occurs anywhere inside
// header. Both values are lowercased first, so the match is case-insensitive.
func isContentType(header, targetContentType string) bool {
	return strings.Contains(strings.ToLower(header), strings.ToLower(targetContentType))
}

// compareURLs reports whether a and b hold the same URLs with the same
// multiplicities, regardless of order. URLs are compared by their String() form.
func compareURLs(a, b []*url.URL) bool {
	if len(a) != len(b) {
		return false
	}

	// Create a map to store the count of each URL in slice a
	counts := make(map[string]int)
	for _, u := range a {
		counts[u.String()]++
	}

	// Decrement the count for each URL in slice b
	for _, u := range b {
		counts[u.String()]--
	}

	// Check if any count is non-zero, indicating a mismatch
	for _, count := range counts {
		if count != 0 {
			return false
		}
	}

	return true
}

// hasFileExtension checks if a URL has a file extension in it, i.e. whether
// the last segment of its path contains a '.' followed by at least one
// character. Fragment and query are ignored, and so is the authority part
// ("scheme://host" or "//host"): a URL without any path, such as
// https://example.com, has no file extension even though its host contains dots.
//
// It might yield false positives, like https://example.com/super.idea,
// but it's good enough for our purposes.
func hasFileExtension(s string) bool {
	// Remove fragment portion (#...)
	if i := strings.IndexByte(s, '#'); i != -1 {
		s = s[:i]
	}

	// Remove query portion (?...)
	if i := strings.IndexByte(s, '?'); i != -1 {
		s = s[:i]
	}

	// Locate the start of the host, if the string carries one. A "://" only
	// counts as a scheme separator when no '/' precedes it.
	hostStart := -1
	if strings.HasPrefix(s, "//") {
		hostStart = len("//")
	} else if i := strings.Index(s, "://"); i > 0 && !strings.Contains(s[:i], "/") {
		hostStart = i + len("://")
	}

	// Skip the host so that its dots are never mistaken for an extension.
	if hostStart != -1 {
		pathStart := strings.IndexByte(s[hostStart:], '/')
		if pathStart == -1 {
			// Host only, no path at all -> nothing that could be a file
			return false
		}
		s = s[hostStart+pathStart:]
	}

	// Keep only the substring after the last slash
	if slashPos := strings.LastIndexByte(s, '/'); slashPos != -1 {
		s = s[slashPos+1:]
	}

	// Find the last '.' in the file name
	dotPos := strings.LastIndexByte(s, '.')
	if dotPos == -1 || dotPos == len(s)-1 {
		// No '.' or '.' is the last character -> no valid extension
		return false
	}

	return true
}

// isContentType tells whether header mentions targetContentType.
// The comparison is a case-insensitive substring search.
func isContentType(header, targetContentType string) bool {
	haystack := strings.ToLower(header)
	needle := strings.ToLower(targetContentType)

	return strings.Contains(haystack, needle)
}

// sortURLs sorts a slice of *models.URL
func sortURLs(urls []*models.URL) {
sort.Slice(urls, func(i, j int) bool {
Expand Down
111 changes: 111 additions & 0 deletions internal/pkg/postprocessor/extractor/utils_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
package extractor

import "testing"

// TestHasFileExtension runs hasFileExtension against a table of URLs and plain
// strings and checks the reported result for each of them.
func TestHasFileExtension(t *testing.T) {
	type testCase struct {
		name  string
		input string
		want  bool
	}

	cases := []testCase{
		{name: "Simple JPG extension", input: "http://example.com/image.jpg", want: true},
		{name: "Query param after extension", input: "https://example.org/dog.png?foo=bar", want: true},
		{name: "Fragment after extension", input: "https://test.com/cat.gif#section1", want: true},
		{name: "No extension at all", input: "http://example.com/foo", want: false},
		// The extension is not truly at the end
		{name: "Trailing slash after potential extension", input: "http://example.com/foo.txt/", want: false},
		// The .txt is not the last segment
		{name: "Extension deeper in path", input: "http://example.com/data.txt/archive", want: false},
		{name: "Multiple dots, multiple segments", input: "http://example.net/backups/data.tar.gz?version=2", want: true},
		// A leading dot followed by characters is reported as an extension
		{name: "Hidden file style, no extension (e.g. .htaccess)", input: "https://example.com/.htaccess", want: true},
		// There's no extension after the final dot
		{name: "Dot at the end only (no extension)", input: "http://example.org/name.", want: false},
		{name: "Just a plain filename with extension, no slashes", input: "file.zip", want: true},
		{name: "Filename with multiple dots in the last segment", input: "https://example.io/some.dir/my.file.name.txt", want: true},
		{name: "Parameters but no dot in final segment", input: "https://example.com/paramCheck?this=that", want: false},
		{name: "Multiple slashes near the end", input: "http://example.com/dir/subdir/.hidden/", want: false},
		{name: "Dot in subdirectory name only", input: "http://example.com/dir.withdot/filename", want: false},
		{name: "Extension is the last item plus fragment", input: "http://example.com/test.db#backup", want: true},
		{name: "No slash, no dot, random string", input: "thisIsJustAString", want: false},
		{name: "Multiple dots in final segment with a trailing query", input: "http://example.com/foo.bar.baz.qux?stuff=1", want: true},
		{name: "Extension disguised with a slash in the query", input: "http://example.com/data.zip?path=/etc/passwd", want: true},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			if got := hasFileExtension(tc.input); got != tc.want {
				t.Errorf("hasFileExtension(%q) = %v; want %v", tc.input, got, tc.want)
			}
		})
	}
}
Loading

0 comments on commit bbef271

Please sign in to comment.