add: check if assets are truly files when extracting XML & JSON, if not then return as outlinks
internetarchive · Jan 29, 2025 · bbef271 · bbef271
1 parent 8b55ea8
commit bbef271
Show file tree

Hide file tree

Showing 10 changed files with 236 additions and 88 deletions.
diff --git a/internal/pkg/postprocessor/assets.go b/internal/pkg/postprocessor/assets.go
@@ -9,7 +9,9 @@ import (
 	"github.com/internetarchive/Zeno/pkg/models"
 )
 
-func extractAssets(item *models.Item) (assets []*models.URL, err error) {
+// extractAssets extracts assets from the item's body and returns them.
+// It also potentially returns outlinks if the body contains URLs that are not assets.
+func extractAssets(item *models.Item) (assets, outlinks []*models.URL, err error) {
 	var (
 		contentType = item.GetURL().GetResponse().Header.Get("Content-Type")
 		logger      = log.NewFieldedLogger(&log.Fields{
@@ -25,57 +27,62 @@ func extractAssets(item *models.Item) (assets []*models.URL, err error) {
 		INAAssets, err := ina.ExtractMedias(item.GetURL())
 		if err != nil {
 			logger.Error("unable to extract medias from INA", "err", err.Error(), "item", item.GetShortID())
-			return assets, err
+			return assets, outlinks, err
 		}
 
 		HTMLAssets, err := extractor.HTMLAssets(item)
 		if err != nil {
 			logger.Error("unable to extract assets", "err", err.Error(), "item", item.GetShortID())
-			return assets, err
+			return assets, outlinks, err
 		}
 
 		assets = append(INAAssets, HTMLAssets...)
 	case truthsocial.NeedExtraction(item.GetURL()):
-		assets, err = truthsocial.ExtractAssets(item)
+		assets, outlinks, err = truthsocial.ExtractAssets(item)
 		if err != nil {
 			logger.Error("unable to extract assets from TruthSocial", "err", err.Error(), "item", item.GetShortID())
-			return assets, err
+			return assets, outlinks, err
 		}
 	case extractor.IsM3U8(item.GetURL()):
 		assets, err = extractor.M3U8(item.GetURL())
 		if err != nil {
 			logger.Error("unable to extract assets", "err", err.Error(), "item", item.GetShortID())
-			return assets, err
+			return assets, outlinks, err
 		}
 	case extractor.IsJSON(item.GetURL()):
-		assets, err = extractor.JSON(item.GetURL())
+		assets, outlinks, err = extractor.JSON(item.GetURL())
 		if err != nil {
 			logger.Error("unable to extract assets", "err", err.Error(), "item", item.GetShortID())
-			return assets, err
+			return assets, outlinks, err
 		}
 	case extractor.IsXML(item.GetURL()):
-		assets, err = extractor.XML(item.GetURL())
+		assets, outlinks, err = extractor.XML(item.GetURL())
 		if err != nil {
 			logger.Error("unable to extract assets", "err", err.Error(), "item", item.GetShortID())
-			return assets, err
+			return assets, outlinks, err
 		}
 	case extractor.IsHTML(item.GetURL()):
 		assets, err = extractor.HTMLAssets(item)
 		if err != nil {
 			logger.Error("unable to extract assets", "err", err.Error(), "item", item.GetShortID())
-			return assets, err
+			return assets, outlinks, err
 		}
 	default:
 		logger.Debug("no extractor used for page", "content-type", contentType, "item", item.GetShortID())
-		return assets, nil
+		return assets, outlinks, nil
 	}
 
-	// Set the hops level to the item's level
+	// For assets, set the hops level to the item's level
 	for _, asset := range assets {
 		asset.SetHops(item.GetURL().GetHops())
 	}
 
-	return assets, nil
+	// For outlinks, set the hops level to the item's level + 1
+	for _, outlink := range outlinks {
+		outlink.SetHops(item.GetURL().GetHops() + 1)
+	}
+
+	return assets, outlinks, nil
 }
 
 func shouldExtractAssets(item *models.Item) bool {

diff --git a/internal/pkg/postprocessor/extractor/json.go b/internal/pkg/postprocessor/extractor/json.go
@@ -2,36 +2,43 @@ package extractor
 
 import (
 	"encoding/json"
-	"net/url"
 
+	"github.com/ImVexed/fasturl"
 	"github.com/internetarchive/Zeno/pkg/models"
 )
 
 func IsJSON(URL *models.URL) bool {
 	return isContentType(URL.GetResponse().Header.Get("Content-Type"), "json")
 }
 
-func JSON(URL *models.URL) (assets []*models.URL, err error) {
+func JSON(URL *models.URL) (assets, outlinks []*models.URL, err error) {
 	defer URL.RewindBody()
 
 	bodyBytes := make([]byte, URL.GetBody().Len())
 	_, err = URL.GetBody().Read(bodyBytes)
 	if err != nil {
-		return nil, err
+		return nil, nil, err
 	}
 
-	rawAssets, err := GetURLsFromJSON(bodyBytes)
+	rawURLs, err := GetURLsFromJSON(bodyBytes)
 	if err != nil {
-		return nil, err
+		return nil, nil, err
 	}
 
-	for _, rawAsset := range rawAssets {
-		assets = append(assets, &models.URL{
-			Raw: rawAsset,
-		})
+	// We only consider as assets the URLs in which we can find a file extension
+	for _, rawURL := range rawURLs {
+		if hasFileExtension(rawURL) {
+			assets = append(assets, &models.URL{
+				Raw: rawURL,
+			})
+		} else {
+			outlinks = append(outlinks, &models.URL{
+				Raw: rawURL,
+			})
+		}
 	}
 
-	return assets, err
+	return assets, outlinks, nil
 }
 
 func GetURLsFromJSON(body []byte) ([]string, error) {
@@ -65,6 +72,6 @@ func findURLs(data interface{}, links *[]string) {
 }
 
 func isValidURL(str string) bool {
-	u, err := url.Parse(str)
-	return err == nil && u.Scheme != "" && u.Host != ""
+	u, err := fasturl.ParseURL(str)
+	return err == nil && u.Host != ""
 }
diff --git a/internal/pkg/postprocessor/extractor/json_test.go b/internal/pkg/postprocessor/extractor/json_test.go
@@ -74,24 +74,24 @@ func TestJSON(t *testing.T) {
 				t.Errorf("ProcessBody() error = %v", err)
 			}
 
-			gotURLs, err := JSON(URL)
+			assets, _, err := JSON(URL)
 
 			if (err != nil) != tt.wantErr {
 				t.Errorf("JSON() error = %v, wantErr %v", err, tt.wantErr)
 				return
 			}
 
 			// Sort both slices before comparison
-			sortURLs(gotURLs)
+			sortURLs(assets)
 			sortURLs(tt.wantURLs)
 
-			if len(gotURLs) != len(tt.wantURLs) {
-				t.Fatalf("Expected %d URLs, got %d", len(tt.wantURLs), len(gotURLs))
+			if len(assets) != len(tt.wantURLs) {
+				t.Fatalf("Expected %d URLs, got %d", len(tt.wantURLs), len(assets))
 			}
 
-			for i := range gotURLs {
-				if gotURLs[i].Raw != tt.wantURLs[i].Raw {
-					t.Errorf("Expected URL %s, got %s", tt.wantURLs[i].Raw, gotURLs[i].Raw)
+			for i := range assets {
+				if assets[i].Raw != tt.wantURLs[i].Raw {
+					t.Errorf("Expected URL %s, got %s", tt.wantURLs[i].Raw, assets[i].Raw)
 				}
 			}
 		})

diff --git a/internal/pkg/postprocessor/extractor/utils.go b/internal/pkg/postprocessor/extractor/utils.go
@@ -1,7 +1,6 @@
 package extractor
 
 import (
-	"net/url"
 	"regexp"
 	"sort"
 	"strings"
@@ -17,41 +16,42 @@ var (
 	AssetsRegex      = `(?i)\b(?:src|href)=["']([^"']+\.(?:css|js|png|jpg|jpeg|gif|svg|webp|woff|woff2|ttf|eot))["']`
 )
 
-func isContentType(header, targetContentType string) bool {
-	// Lowercase the header and target content type for case-insensitive comparison
-	header = strings.ToLower(header)
-	targetContentType = strings.ToLower(targetContentType)
-
-	return strings.Contains(header, targetContentType)
-}
-
-// compareURLs compares two slices of *url.URL
-func compareURLs(a, b []*url.URL) bool {
-	if len(a) != len(b) {
-		return false
+// hasFileExtension checks if a URL has a file extension in it.
+// It might yield false positives, like https://example.com/super.idea,
+// but it's good enough for our purposes.
+func hasFileExtension(s string) bool {
+	// Remove fragment portion (#...)
+	if i := strings.IndexByte(s, '#'); i != -1 {
+		s = s[:i]
 	}
-
-	// Create a map to store the count of each URL in slice a
-	counts := make(map[string]int)
-	for _, url := range a {
-		counts[url.String()]++
+	// Remove query portion (?...)
+	if i := strings.IndexByte(s, '?'); i != -1 {
+		s = s[:i]
 	}
 
-	// Decrement the count for each URL in slice b
-	for _, url := range b {
-		counts[url.String()]--
+	// Keep only the substring after the last slash
+	if slashPos := strings.LastIndexByte(s, '/'); slashPos != -1 {
+		s = s[slashPos+1:]
 	}
 
-	// Check if any count is non-zero, indicating a mismatch
-	for _, count := range counts {
-		if count != 0 {
-			return false
-		}
+	// Find the last '.' in the file name
+	dotPos := strings.LastIndexByte(s, '.')
+	if dotPos == -1 || dotPos == len(s)-1 {
+		// No '.' or '.' is the last character -> no valid extension
+		return false
 	}
 
 	return true
 }
 
+func isContentType(header, targetContentType string) bool {
+	// Lowercase the header and target content type for case-insensitive comparison
+	header = strings.ToLower(header)
+	targetContentType = strings.ToLower(targetContentType)
+
+	return strings.Contains(header, targetContentType)
+}
+
 // sortURLs sorts a slice of *url.URL
 func sortURLs(urls []*models.URL) {
 	sort.Slice(urls, func(i, j int) bool {

diff --git a/internal/pkg/postprocessor/extractor/utils_test.go b/internal/pkg/postprocessor/extractor/utils_test.go
@@ -0,0 +1,111 @@
+package extractor
+
+import "testing"
+
+func TestHasFileExtension(t *testing.T) {
+	tests := []struct {
+		name  string
+		input string
+		want  bool
+	}{
+		{
+			name:  "Simple JPG extension",
+			input: "http://example.com/image.jpg",
+			want:  true,
+		},
+		{
+			name:  "Query param after extension",
+			input: "https://example.org/dog.png?foo=bar",
+			want:  true,
+		},
+		{
+			name:  "Fragment after extension",
+			input: "https://test.com/cat.gif#section1",
+			want:  true,
+		},
+		{
+			name:  "No extension at all",
+			input: "http://example.com/foo",
+			want:  false,
+		},
+		{
+			name:  "Trailing slash after potential extension",
+			input: "http://example.com/foo.txt/",
+			want:  false, // The extension is not truly at the end
+		},
+		{
+			name:  "Extension deeper in path",
+			input: "http://example.com/data.txt/archive",
+			want:  false, // The .txt is not the last segment
+		},
+		{
+			name:  "Multiple dots, multiple segments",
+			input: "http://example.net/backups/data.tar.gz?version=2",
+			want:  true,
+		},
+		{
+			name:  "Hidden file style, no extension (e.g. .htaccess)",
+			input: "https://example.com/.htaccess",
+			want:  true,
+		},
+		{
+			name:  "Dot at the end only (no extension)",
+			input: "http://example.org/name.",
+			want:  false, // There's no extension after the final dot
+		},
+		{
+			name:  "Just a plain filename with extension, no slashes",
+			input: "file.zip",
+			want:  true,
+		},
+		{
+			name:  "Filename with multiple dots in the last segment",
+			input: "https://example.io/some.dir/my.file.name.txt",
+			want:  true,
+		},
+		{
+			name:  "Parameters but no dot in final segment",
+			input: "https://example.com/paramCheck?this=that",
+			want:  false,
+		},
+		{
+			name:  "Multiple slashes near the end",
+			input: "http://example.com/dir/subdir/.hidden/",
+			want:  false,
+		},
+		{
+			name:  "Dot in subdirectory name only",
+			input: "http://example.com/dir.withdot/filename",
+			want:  false,
+		},
+		{
+			name:  "Extension is the last item plus fragment",
+			input: "http://example.com/test.db#backup",
+			want:  true,
+		},
+		{
+			name:  "No slash, no dot, random string",
+			input: "thisIsJustAString",
+			want:  false,
+		},
+		{
+			name:  "Multiple dots in final segment with a trailing query",
+			input: "http://example.com/foo.bar.baz.qux?stuff=1",
+			want:  true,
+		},
+		{
+			name:  "Extension disguised with a slash in the query",
+			input: "http://example.com/data.zip?path=/etc/passwd",
+			want:  true,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got := hasFileExtension(tt.input)
+			if got != tt.want {
+				t.Errorf("hasFileExtension(%q) = %v; want %v", tt.input, got, tt.want)
+			}
+		})
+	}
+}