diff --git a/internal/pkg/postprocessor/assets.go b/internal/pkg/postprocessor/assets.go index cdba3058..6b28ad66 100644 --- a/internal/pkg/postprocessor/assets.go +++ b/internal/pkg/postprocessor/assets.go @@ -9,7 +9,9 @@ import ( "github.com/internetarchive/Zeno/pkg/models" ) -func extractAssets(item *models.Item) (assets []*models.URL, err error) { +// extractAssets extracts assets from the item's body and returns them. +// It also potentially returns outlinks if the body contains URLs that are not assets. +func extractAssets(item *models.Item) (assets, outlinks []*models.URL, err error) { var ( contentType = item.GetURL().GetResponse().Header.Get("Content-Type") logger = log.NewFieldedLogger(&log.Fields{ @@ -25,57 +27,62 @@ func extractAssets(item *models.Item) (assets []*models.URL, err error) { INAAssets, err := ina.ExtractMedias(item.GetURL()) if err != nil { logger.Error("unable to extract medias from INA", "err", err.Error(), "item", item.GetShortID()) - return assets, err + return assets, outlinks, err } HTMLAssets, err := extractor.HTMLAssets(item) if err != nil { logger.Error("unable to extract assets", "err", err.Error(), "item", item.GetShortID()) - return assets, err + return assets, outlinks, err } assets = append(INAAssets, HTMLAssets...) 
case truthsocial.NeedExtraction(item.GetURL()): - assets, err = truthsocial.ExtractAssets(item) + assets, outlinks, err = truthsocial.ExtractAssets(item) if err != nil { logger.Error("unable to extract assets from TruthSocial", "err", err.Error(), "item", item.GetShortID()) - return assets, err + return assets, outlinks, err } case extractor.IsM3U8(item.GetURL()): assets, err = extractor.M3U8(item.GetURL()) if err != nil { logger.Error("unable to extract assets", "err", err.Error(), "item", item.GetShortID()) - return assets, err + return assets, outlinks, err } case extractor.IsJSON(item.GetURL()): - assets, err = extractor.JSON(item.GetURL()) + assets, outlinks, err = extractor.JSON(item.GetURL()) if err != nil { logger.Error("unable to extract assets", "err", err.Error(), "item", item.GetShortID()) - return assets, err + return assets, outlinks, err } case extractor.IsXML(item.GetURL()): - assets, err = extractor.XML(item.GetURL()) + assets, outlinks, err = extractor.XML(item.GetURL()) if err != nil { logger.Error("unable to extract assets", "err", err.Error(), "item", item.GetShortID()) - return assets, err + return assets, outlinks, err } case extractor.IsHTML(item.GetURL()): assets, err = extractor.HTMLAssets(item) if err != nil { logger.Error("unable to extract assets", "err", err.Error(), "item", item.GetShortID()) - return assets, err + return assets, outlinks, err } default: logger.Debug("no extractor used for page", "content-type", contentType, "item", item.GetShortID()) - return assets, nil + return assets, outlinks, nil } - // Set the hops level to the item's level + // For assets, set the hops level to the item's level for _, asset := range assets { asset.SetHops(item.GetURL().GetHops()) } - return assets, nil + // For outlinks, set the hops level to the item's level + 1 + for _, outlink := range outlinks { + outlink.SetHops(item.GetURL().GetHops() + 1) + } + + return assets, outlinks, nil } func shouldExtractAssets(item *models.Item) bool { diff 
--git a/internal/pkg/postprocessor/extractor/json.go b/internal/pkg/postprocessor/extractor/json.go index ad90278c..941702b9 100644 --- a/internal/pkg/postprocessor/extractor/json.go +++ b/internal/pkg/postprocessor/extractor/json.go @@ -2,8 +2,8 @@ package extractor import ( "encoding/json" - "net/url" + "github.com/ImVexed/fasturl" "github.com/internetarchive/Zeno/pkg/models" ) @@ -11,27 +11,34 @@ func IsJSON(URL *models.URL) bool { return isContentType(URL.GetResponse().Header.Get("Content-Type"), "json") } -func JSON(URL *models.URL) (assets []*models.URL, err error) { +func JSON(URL *models.URL) (assets, outlinks []*models.URL, err error) { defer URL.RewindBody() bodyBytes := make([]byte, URL.GetBody().Len()) _, err = URL.GetBody().Read(bodyBytes) if err != nil { - return nil, err + return nil, nil, err } - rawAssets, err := GetURLsFromJSON(bodyBytes) + rawURLs, err := GetURLsFromJSON(bodyBytes) if err != nil { - return nil, err + return nil, nil, err } - for _, rawAsset := range rawAssets { - assets = append(assets, &models.URL{ - Raw: rawAsset, - }) + // We only consider as assets the URLs in which we can find a file extension + for _, rawURL := range rawURLs { + if hasFileExtension(rawURL) { + assets = append(assets, &models.URL{ + Raw: rawURL, + }) + } else { + outlinks = append(outlinks, &models.URL{ + Raw: rawURL, + }) + } } - return assets, err + return assets, outlinks, nil } func GetURLsFromJSON(body []byte) ([]string, error) { @@ -65,6 +72,6 @@ func findURLs(data interface{}, links *[]string) { } func isValidURL(str string) bool { - u, err := url.Parse(str) - return err == nil && u.Scheme != "" && u.Host != "" + u, err := fasturl.ParseURL(str) + return err == nil && u.Host != "" } diff --git a/internal/pkg/postprocessor/extractor/json_test.go b/internal/pkg/postprocessor/extractor/json_test.go index 6330af85..6426432a 100644 --- a/internal/pkg/postprocessor/extractor/json_test.go +++ b/internal/pkg/postprocessor/extractor/json_test.go @@ -74,7 +74,7 
@@ func TestJSON(t *testing.T) { t.Errorf("ProcessBody() error = %v", err) } - gotURLs, err := JSON(URL) + assets, _, err := JSON(URL) if (err != nil) != tt.wantErr { t.Errorf("JSON() error = %v, wantErr %v", err, tt.wantErr) @@ -82,16 +82,16 @@ func TestJSON(t *testing.T) { } // Sort both slices before comparison - sortURLs(gotURLs) + sortURLs(assets) sortURLs(tt.wantURLs) - if len(gotURLs) != len(tt.wantURLs) { - t.Fatalf("Expected %d URLs, got %d", len(tt.wantURLs), len(gotURLs)) + if len(assets) != len(tt.wantURLs) { + t.Fatalf("Expected %d URLs, got %d", len(tt.wantURLs), len(assets)) } - for i := range gotURLs { - if gotURLs[i].Raw != tt.wantURLs[i].Raw { - t.Errorf("Expected URL %s, got %s", tt.wantURLs[i].Raw, gotURLs[i].Raw) + for i := range assets { + if assets[i].Raw != tt.wantURLs[i].Raw { + t.Errorf("Expected URL %s, got %s", tt.wantURLs[i].Raw, assets[i].Raw) } } }) diff --git a/internal/pkg/postprocessor/extractor/utils.go b/internal/pkg/postprocessor/extractor/utils.go index 18d76800..5c3bd70b 100644 --- a/internal/pkg/postprocessor/extractor/utils.go +++ b/internal/pkg/postprocessor/extractor/utils.go @@ -1,7 +1,6 @@ package extractor import ( - "net/url" "regexp" "sort" "strings" @@ -17,41 +16,42 @@ var ( AssetsRegex = `(?i)\b(?:src|href)=["']([^"']+\.(?:css|js|png|jpg|jpeg|gif|svg|webp|woff|woff2|ttf|eot))["']` ) -func isContentType(header, targetContentType string) bool { - // Lowercase the header and target content type for case-insensitive comparison - header = strings.ToLower(header) - targetContentType = strings.ToLower(targetContentType) - - return strings.Contains(header, targetContentType) -} - -// compareURLs compares two slices of *url.URL -func compareURLs(a, b []*url.URL) bool { - if len(a) != len(b) { - return false +// hasFileExtension checks if a URL has a file extension in it. +// It might yield false positives, like https://example.com/super.idea, +// but it's good enough for our purposes. 
+func hasFileExtension(s string) bool { + // Remove fragment portion (#...) + if i := strings.IndexByte(s, '#'); i != -1 { + s = s[:i] } - - // Create a map to store the count of each URL in slice a - counts := make(map[string]int) - for _, url := range a { - counts[url.String()]++ + // Remove query portion (?...) + if i := strings.IndexByte(s, '?'); i != -1 { + s = s[:i] } - // Decrement the count for each URL in slice b - for _, url := range b { - counts[url.String()]-- + // Keep only the substring after the last slash + if slashPos := strings.LastIndexByte(s, '/'); slashPos != -1 { + s = s[slashPos+1:] } - // Check if any count is non-zero, indicating a mismatch - for _, count := range counts { - if count != 0 { - return false - } + // Find the last '.' in the file name + dotPos := strings.LastIndexByte(s, '.') + if dotPos == -1 || dotPos == len(s)-1 { + // No '.' or '.' is the last character -> no valid extension + return false } return true } +func isContentType(header, targetContentType string) bool { + // Lowercase the header and target content type for case-insensitive comparison + header = strings.ToLower(header) + targetContentType = strings.ToLower(targetContentType) + + return strings.Contains(header, targetContentType) +} + // sortURLs sorts a slice of *url.URL func sortURLs(urls []*models.URL) { sort.Slice(urls, func(i, j int) bool { diff --git a/internal/pkg/postprocessor/extractor/utils_test.go b/internal/pkg/postprocessor/extractor/utils_test.go new file mode 100644 index 00000000..839537b7 --- /dev/null +++ b/internal/pkg/postprocessor/extractor/utils_test.go @@ -0,0 +1,111 @@ +package extractor + +import "testing" + +func TestHasFileExtension(t *testing.T) { + tests := []struct { + name string + input string + want bool + }{ + { + name: "Simple JPG extension", + input: "http://example.com/image.jpg", + want: true, + }, + { + name: "Query param after extension", + input: "https://example.org/dog.png?foo=bar", + want: true, + }, + { + name: 
"Fragment after extension", + input: "https://test.com/cat.gif#section1", + want: true, + }, + { + name: "No extension at all", + input: "http://example.com/foo", + want: false, + }, + { + name: "Trailing slash after potential extension", + input: "http://example.com/foo.txt/", + want: false, // The extension is not truly at the end + }, + { + name: "Extension deeper in path", + input: "http://example.com/data.txt/archive", + want: false, // The .txt is not the last segment + }, + { + name: "Multiple dots, multiple segments", + input: "http://example.net/backups/data.tar.gz?version=2", + want: true, + }, + { + name: "Hidden file style, no extension (e.g. .htaccess)", + input: "https://example.com/.htaccess", + want: true, + }, + { + name: "Dot at the end only (no extension)", + input: "http://example.org/name.", + want: false, // There's no extension after the final dot + }, + { + name: "Just a plain filename with extension, no slashes", + input: "file.zip", + want: true, + }, + { + name: "Filename with multiple dots in the last segment", + input: "https://example.io/some.dir/my.file.name.txt", + want: true, + }, + { + name: "Parameters but no dot in final segment", + input: "https://example.com/paramCheck?this=that", + want: false, + }, + { + name: "Multiple slashes near the end", + input: "http://example.com/dir/subdir/.hidden/", + want: false, + }, + { + name: "Dot in subdirectory name only", + input: "http://example.com/dir.withdot/filename", + want: false, + }, + { + name: "Extension is the last item plus fragment", + input: "http://example.com/test.db#backup", + want: true, + }, + { + name: "No slash, no dot, random string", + input: "thisIsJustAString", + want: false, + }, + { + name: "Multiple dots in final segment with a trailing query", + input: "http://example.com/foo.bar.baz.qux?stuff=1", + want: true, + }, + { + name: "Extension disguised with a slash in the query", + input: "http://example.com/data.zip?path=/etc/passwd", + want: true, + }, + } + + for 
_, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := hasFileExtension(tt.input) + if got != tt.want { + t.Errorf("hasFileExtension(%q) = %v; want %v", tt.input, got, tt.want) + } + }) + } +} diff --git a/internal/pkg/postprocessor/extractor/xml.go b/internal/pkg/postprocessor/extractor/xml.go index e1eac8c8..11f37d4a 100644 --- a/internal/pkg/postprocessor/extractor/xml.go +++ b/internal/pkg/postprocessor/extractor/xml.go @@ -28,23 +28,23 @@ func IsSitemapXML(URL *models.URL) bool { return isContentType(URL.GetResponse().Header.Get("Content-Type"), "xml") && bytes.Contains(xmlBody, sitemapMarker) } -func XML(URL *models.URL) (assets []*models.URL, err error) { +func XML(URL *models.URL) (assets, outlinks []*models.URL, err error) { defer URL.RewindBody() xmlBody, err := io.ReadAll(URL.GetBody()) if err != nil { - return nil, err + return nil, nil, err } if len(xmlBody) == 0 { - return nil, errors.New("empty XML body") + return nil, nil, errors.New("empty XML body") } decoder := xml.NewDecoder(bytes.NewReader(xmlBody)) decoder.Strict = false var tok xml.Token - var rawAssets []string + var rawURLs []string for { tok, err = decoder.RawToken() @@ -55,31 +55,38 @@ func XML(URL *models.URL) (assets []*models.URL, err error) { if err != nil { // return URLs we got so far when error occurs - return assets, err + return assets, outlinks, err } switch tok := tok.(type) { case xml.StartElement: for _, attr := range tok.Attr { if strings.HasPrefix(attr.Value, "http") { - rawAssets = append(rawAssets, attr.Value) + rawURLs = append(rawURLs, attr.Value) } } case xml.CharData: if bytes.HasPrefix(tok, []byte("http")) { - rawAssets = append(rawAssets, string(tok)) + rawURLs = append(rawURLs, string(tok)) } else { // Try to extract URLs from the text - rawAssets = append(rawAssets, utils.DedupeStrings(LinkRegexRelaxed.FindAllString(string(tok), -1))...) + rawURLs = append(rawURLs, utils.DedupeStrings(LinkRegexRelaxed.FindAllString(string(tok), -1))...) 
} } } - for _, rawAsset := range rawAssets { - assets = append(assets, &models.URL{ - Raw: rawAsset, - }) + // We only consider as assets the URLs in which we can find a file extension + for _, rawURL := range rawURLs { + if hasFileExtension(rawURL) { + assets = append(assets, &models.URL{ + Raw: rawURL, + }) + } else { + outlinks = append(outlinks, &models.URL{ + Raw: rawURL, + }) + } } - return assets, nil + return assets, outlinks, nil } diff --git a/internal/pkg/postprocessor/extractor/xml_test.go b/internal/pkg/postprocessor/extractor/xml_test.go index 92681fb8..4896a5ff 100644 --- a/internal/pkg/postprocessor/extractor/xml_test.go +++ b/internal/pkg/postprocessor/extractor/xml_test.go @@ -135,18 +135,21 @@ func TestXML(t *testing.T) { t.Errorf("ProcessBody() error = %v", err) } - assets, err := XML(URL) + assets, outlinks, err := XML(URL) + + URLs := append(assets, outlinks...) + if (err != nil) != tt.hasError { t.Fatalf("XML() error = %v, wantErr %v", err, tt.hasError) } - if len(assets) != len(tt.expected) { - t.Fatalf("Expected %d assets, got %d", len(tt.expected), len(assets)) + if len(URLs) != len(tt.expected) { + t.Fatalf("Expected %d assets, got %d", len(tt.expected), len(URLs)) } - for i, asset := range assets { - if asset.Raw != tt.expected[i] { - t.Errorf("Expected asset %s, got %s", tt.expected[i], asset.Raw) + for i, URL := range URLs { + if URL.Raw != tt.expected[i] { + t.Errorf("Expected asset %s, got %s", tt.expected[i], URL.Raw) } } }) diff --git a/internal/pkg/postprocessor/item.go b/internal/pkg/postprocessor/item.go index 7fac0d17..596f6c20 100644 --- a/internal/pkg/postprocessor/item.go +++ b/internal/pkg/postprocessor/item.go @@ -84,9 +84,14 @@ func postprocessItem(item *models.Item) []*models.Item { if item.GetURL().GetResponse() != nil && item.GetURL().GetResponse().StatusCode == 200 { logger.Debug("item is a success", "item_id", item.GetShortID()) + var outlinksFromAssets []*models.URL + // Extract assets from the page if 
shouldExtractAssets(item) { - assets, err := extractAssets(item) + var assets []*models.URL + var err error + + assets, outlinksFromAssets, err = extractAssets(item) if err != nil { logger.Error("unable to extract assets", "err", err.Error(), "item_id", item.GetShortID()) } else { @@ -113,6 +118,9 @@ func postprocessItem(item *models.Item) []*models.Item { if err != nil { logger.Error("unable to extract outlinks", "err", err.Error(), "item_id", item.GetShortID()) } else { + // Append the outlinks found from the assets + newOutlinks = append(newOutlinks, outlinksFromAssets...) + for i := range newOutlinks { if newOutlinks[i] == nil { logger.Warn("nil link", "item_id", item.GetShortID()) diff --git a/internal/pkg/postprocessor/outlinks.go b/internal/pkg/postprocessor/outlinks.go index 79de658c..82058c1d 100644 --- a/internal/pkg/postprocessor/outlinks.go +++ b/internal/pkg/postprocessor/outlinks.go @@ -47,11 +47,17 @@ func extractOutlinks(item *models.Item) (outlinks []*models.URL, err error) { return outlinks, err } case extractor.IsSitemapXML(item.GetURL()): - outlinks, err = extractor.XML(item.GetURL()) + var assets []*models.URL + + assets, outlinks, err = extractor.XML(item.GetURL()) if err != nil { logger.Error("unable to extract outlinks", "err", err.Error(), "item", item.GetShortID()) return outlinks, err } + + // Here we don't care about the difference between assets and outlinks, + // we just want to extract all the URLs from the sitemap + outlinks = append(outlinks, assets...) 
case extractor.IsHTML(item.GetURL()): outlinks, err := extractor.HTMLOutlinks(item) if err != nil { diff --git a/internal/pkg/postprocessor/sitespecific/truthsocial/truthsocial.go b/internal/pkg/postprocessor/sitespecific/truthsocial/truthsocial.go index 7e7eb5af..b8e4e9b4 100644 --- a/internal/pkg/postprocessor/sitespecific/truthsocial/truthsocial.go +++ b/internal/pkg/postprocessor/sitespecific/truthsocial/truthsocial.go @@ -13,39 +13,38 @@ var ( usernameRegex = regexp.MustCompile(`^https?:\/\/truthsocial\.com\/@([^/]+)`) statusesRegex = regexp.MustCompile(`^https?:\/\/truthsocial\.com\/api\/v1\/statuses\/\d+$`) accountLookupRegex = regexp.MustCompile(`^https?:\/\/truthsocial\.com\/api\/v1\/accounts\/lookup\?acct=[a-zA-Z0-9]+$`) - truthsocialRegex = regexp.MustCompile(`^https?:\/\/truthsocial\.com\/.*`) ) func NeedExtraction(URL *models.URL) bool { return IsStatusesURL(URL) || IsPostURL(URL) } -func ExtractAssets(item *models.Item) (assets []*models.URL, err error) { +func ExtractAssets(item *models.Item) (assets, outlinks []*models.URL, err error) { if IsStatusesURL(item.GetURL()) { truthsocialAssets, err := GenerateVideoURLsFromStatusesAPI(item.GetURL()) if err != nil { - return assets, err + return assets, outlinks, err } - JSONAssets, err := extractor.JSON(item.GetURL()) + var JSONAssets []*models.URL; JSONAssets, outlinks, err = extractor.JSON(item.GetURL()) if err != nil { - return assets, err + return assets, outlinks, err } assets = append(truthsocialAssets, JSONAssets...) } else if IsPostURL(item.GetURL()) { truthsocialAssets, err := GeneratePostAssetsURLs(item.GetURL()) if err != nil { - return assets, err + return assets, outlinks, err } HTMLAssets, err := extractor.HTMLAssets(item) if err != nil { - return assets, err + return assets, outlinks, err } assets = append(truthsocialAssets, HTMLAssets...) } - return assets, nil + return assets, outlinks, nil }