diff --git a/internal/pkg/config/config.go b/internal/pkg/config/config.go
index 3a54e600..dcec199d 100644
--- a/internal/pkg/config/config.go
+++ b/internal/pkg/config/config.go
@@ -12,7 +12,6 @@ import (
 	"regexp"
 	"strings"
 	"sync"
-	"time"
 
 	"github.com/google/uuid"
 	"github.com/internetarchive/Zeno/internal/pkg/postprocessor/domainscrawl"
@@ -326,18 +325,7 @@ func readLocalExclusionFile(file string) (regexes []string, err error) {
 }
 
 func readRemoteExclusionFile(URL string) (regexes []string, err error) {
-	httpClient := &http.Client{
-		Timeout: time.Second * 5,
-	}
-
-	req, err := http.NewRequest(http.MethodGet, URL, nil)
-	if err != nil {
-		return regexes, err
-	}
-
-	req.Header.Set("User-Agent", config.UserAgent)
-
-	resp, err := httpClient.Do(req)
+	resp, err := http.Get(URL)
 	if err != nil {
 		return regexes, err
 	}
diff --git a/internal/pkg/postprocessor/extractor/html.go b/internal/pkg/postprocessor/extractor/html.go
index aef10be2..c0a29979 100644
--- a/internal/pkg/postprocessor/extractor/html.go
+++ b/internal/pkg/postprocessor/extractor/html.go
@@ -1,7 +1,6 @@
 package extractor
 
 import (
-	"encoding/json"
 	"regexp"
 	"strconv"
 	"strings"
@@ -19,7 +18,7 @@ var (
 )
 
 func IsHTML(URL *models.URL) bool {
-	return isContentType(URL.GetResponse().Header.Get("Content-Type"), "html") || strings.Contains(URL.GetMIMEType().String(), "html")
+	return isContentType(URL.GetResponse().Header.Get("Content-Type"), "html")
 }
 
 func HTMLOutlinks(item *models.Item) (outlinks []*models.URL, err error) {
@@ -103,7 +102,7 @@ func HTMLAssets(item *models.Item) (assets []*models.URL, err error) {
 	document.Find("[data-item]").Each(func(index int, i *goquery.Selection) {
 		dataItem, exists := i.Attr("data-item")
 		if exists {
-			URLsFromJSON, err := GetURLsFromJSON(json.NewDecoder(strings.NewReader(dataItem)))
+			URLsFromJSON, err := GetURLsFromJSON([]byte(dataItem))
 			if err != nil {
 				logger.Debug("unable to extract URLs from JSON in data-item attribute", "err", err, "url", item.GetURL().String(), "item", item.GetShortID())
 			} else {
@@ -223,7 +222,7 @@ func HTMLAssets(item *models.Item) (assets []*models.URL, err error) {
 			scriptType, exists := i.Attr("type")
 			if exists {
 				if scriptType == "application/json" {
-					URLsFromJSON, err := GetURLsFromJSON(json.NewDecoder(strings.NewReader(i.Text())))
+					URLsFromJSON, err := GetURLsFromJSON([]byte(i.Text()))
 					if err != nil {
 						// TODO: maybe add back when https://github.com/internetarchive/Zeno/issues/147 is fixed
 						// c.Log.Debug("unable to extract URLs from JSON in script tag", "error", err, "url", URL)
@@ -282,7 +281,7 @@ func HTMLAssets(item *models.Item) (assets []*models.URL, err error) {
 			}
 
 			if len(jsonContent[1]) > payloadEndPosition {
-				URLsFromJSON, err := GetURLsFromJSON(json.NewDecoder(strings.NewReader(jsonContent[1][:payloadEndPosition+1])))
+				URLsFromJSON, err := GetURLsFromJSON([]byte(jsonContent[1][:payloadEndPosition+1]))
 				if err != nil {
 					// TODO: maybe add back when https://github.com/internetarchive/Zeno/issues/147 is fixed
 					// c.Log.Debug("unable to extract URLs from JSON in script tag", "error", err, "url", URL)
diff --git a/internal/pkg/postprocessor/extractor/json.go b/internal/pkg/postprocessor/extractor/json.go
index a112ab79..941702b9 100644
--- a/internal/pkg/postprocessor/extractor/json.go
+++ b/internal/pkg/postprocessor/extractor/json.go
@@ -2,20 +2,25 @@ package extractor
 
 import (
 	"encoding/json"
-	"strings"
+	"io"
 
 	"github.com/ImVexed/fasturl"
 	"github.com/internetarchive/Zeno/pkg/models"
 )
 
 func IsJSON(URL *models.URL) bool {
-	return isContentType(URL.GetResponse().Header.Get("Content-Type"), "json") || strings.Contains(URL.GetMIMEType().String(), "json")
+	return isContentType(URL.GetResponse().Header.Get("Content-Type"), "json")
 }
 
 func JSON(URL *models.URL) (assets, outlinks []*models.URL, err error) {
 	defer URL.RewindBody()
 
-	rawURLs, err := GetURLsFromJSON(json.NewDecoder(URL.GetBody()))
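+	// Read the whole body up front: json.Unmarshal needs a []byte, and a
+	// single Read call is not guaranteed to fill the buffer it is given.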
+	bodyBytes, err := io.ReadAll(URL.GetBody())
+	if err != nil {
+		return nil, nil, err
+	}
+
+	rawURLs, err := GetURLsFromJSON(bodyBytes)
 	if err != nil {
 		return nil, nil, err
 	}
@@ -36,9 +41,9 @@ func JSON(URL *models.URL) (assets, outlinks []*models.URL, err error) {
 	return assets, outlinks, nil
 }
 
-func GetURLsFromJSON(decoder *json.Decoder) ([]string, error) {
+func GetURLsFromJSON(body []byte) ([]string, error) {
 	var data interface{}
-	err := decoder.Decode(&data)
+	err := json.Unmarshal(body, &data)
 	if err != nil {
 		return nil, err
 	}
diff --git a/internal/pkg/postprocessor/extractor/s3.go b/internal/pkg/postprocessor/extractor/s3.go
index 9098b9b5..431107d9 100644
--- a/internal/pkg/postprocessor/extractor/s3.go
+++ b/internal/pkg/postprocessor/extractor/s3.go
@@ -3,6 +3,7 @@ package extractor
 import (
 	"encoding/xml"
 	"fmt"
+	"io"
 	"net/url"
 
 	"github.com/internetarchive/Zeno/internal/pkg/utils"
@@ -36,7 +37,7 @@ type S3Object struct {
 }
 
 type CommonPrefix struct {
-	Prefix []string `xml:"Prefix"`
+	Prefix string `xml:"Prefix"`
 }
 
 // IsS3 checks if the response is from an S3 server
@@ -44,107 +45,78 @@ func IsS3(URL *models.URL) bool {
 	return utils.StringContainsSliceElements(URL.GetResponse().Header.Get("Server"), validS3Servers)
 }
 
-// S3 decides which helper to call based on the query param: old style (no list-type=2) vs. new style (list-type=2)
+// S3 parses a ListObjects/ListObjectsV2 response and returns URLs for the files
+// or common prefixes at the current level, plus a pagination URL when more
+// results exist.
 func S3(URL *models.URL) ([]*models.URL, error) {
 	defer URL.RewindBody()
 
-	// Decode XML result
+	bodyBytes, err := io.ReadAll(URL.GetBody())
+	if err != nil {
+		return nil, fmt.Errorf("error reading response body: %v", err)
+	}
+
 	var result S3ListBucketResult
-	if err := xml.NewDecoder(URL.GetBody()).Decode(&result); err != nil {
-		return nil, fmt.Errorf("error decoding S3 XML: %v", err)
+	if err := xml.Unmarshal(bodyBytes, &result); err != nil {
+		return nil, fmt.Errorf("error parsing XML: %v", err)
 	}
 
-	// Prepare base data
+	// Extract the base URL from the request URL
 	reqURL := URL.GetRequest().URL
-	listType := reqURL.Query().Get("list-type")
-
-	// Build https:// as the base for direct file links
-	baseStr := fmt.Sprintf("https://%s", reqURL.Host)
-	parsedBase, err := url.Parse(baseStr)
+	requestQuery := reqURL.Query()
+	baseURL := fmt.Sprintf("https://%s", reqURL.Host)
+	parsedBase, err := url.Parse(baseURL)
 	if err != nil {
 		return nil, fmt.Errorf("invalid base URL: %v", err)
 	}
 
-	var outlinkStrings []string
-
-	// Delegate to old style or new style
-	if listType != "2" {
-		// Old style S3 listing, uses marker
-		outlinkStrings = s3Legacy(reqURL, parsedBase, result)
-	} else {
-		// New style listing (list-type=2), uses continuation token and/or CommonPrefixes
-		outlinkStrings = s3V2(reqURL, parsedBase, result)
-	}
-
-	// Convert from []string -> []*models.URL
-	var outlinks []*models.URL
-	for _, link := range outlinkStrings {
-		outlinks = append(outlinks, &models.URL{Raw: link})
-	}
-	return outlinks, nil
-}
-
-// s3Legacy handles the old ListObjects style, which uses `marker` for pagination.
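+	// e.g. a listing of https://bucket.example.com/?prefix=logs/ whose last
+	// key is "logs/2021-01-01.log" yields the next-page outlink
+	// https://bucket.example.com/?marker=logs%2F2021-01-01.log&prefix=logs%2F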
-func s3Legacy(reqURL *url.URL, parsedBase *url.URL, result S3ListBucketResult) []string {
-	var outlinks []string
+	var URLs []string
 
-	// If there are objects in <Contents>, create a "next page" URL using `marker`
-	if len(result.Contents) > 0 {
-		lastKey := result.Contents[len(result.Contents)-1].Key
+	// Legacy ListObjects (no list-type=2): paginate by setting `marker` to the
+	// last key of the current page.
+	if requestQuery.Get("list-type") != "2" && len(result.Contents) > 0 {
 		nextURL := *reqURL
 		q := nextURL.Query()
-		q.Set("marker", lastKey)
+		q.Set("marker", result.Contents[len(result.Contents)-1].Key)
 		nextURL.RawQuery = q.Encode()
-		outlinks = append(outlinks, nextURL.String())
-	}
-
-	// Produce direct file links for each object
-	for _, obj := range result.Contents {
-		if obj.Size > 0 {
-			fileURL := *parsedBase
-			fileURL.Path += "/" + obj.Key
-			outlinks = append(outlinks, fileURL.String())
-		}
+		URLs = append(URLs, nextURL.String())
 	}
-	return outlinks
-}
-
-// s3V2 handles the new ListObjectsV2 style, which uses `continuation-token` and can return CommonPrefixes.
-func s3V2(reqURL *url.URL, parsedBase *url.URL, result S3ListBucketResult) []string {
-	var outlinks []string
-
-	// If we have common prefixes => "subfolders"
+	// If the listing was rolled up by a delimiter, queue each common prefix
+	// ("subfolder") instead of individual files.
 	if len(result.CommonPrefixes) > 0 {
 		for _, prefix := range result.CommonPrefixes {
-			// Create a URL for each common prefix (subfolder)
-			for _, p := range prefix.Prefix {
-				nextURL := *reqURL
-				q := nextURL.Query()
-				q.Set("prefix", p)
-				nextURL.RawQuery = q.Encode()
-				outlinks = append(outlinks, nextURL.String())
-			}
+			nextURL := *reqURL
+			q := nextURL.Query()
+			q.Set("prefix", prefix.Prefix)
+			nextURL.RawQuery = q.Encode()
+			URLs = append(URLs, nextURL.String())
 		}
 	} else {
-		// Otherwise, we have actual objects in <Contents>
+		// Otherwise return file URLs
 		for _, obj := range result.Contents {
 			if obj.Size > 0 {
 				fileURL := *parsedBase
 				fileURL.Path += "/" + obj.Key
-				outlinks = append(outlinks, fileURL.String())
+				URLs = append(URLs, fileURL.String())
 			}
 		}
 	}
 
-	// If truncated => add a link with continuation-token
+	// If there's a continuation token, add the continuation URL
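+	// (ListObjectsV2 returns IsTruncated=true together with a
+	// NextContinuationToken; requesting the same URL with continuation-token
+	// set resumes the listing where this page ended.)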
 	if result.IsTruncated && result.NextContinuationToken != "" {
 		nextURL := *reqURL
 		q := nextURL.Query()
 		q.Set("continuation-token", result.NextContinuationToken)
 		nextURL.RawQuery = q.Encode()
-		outlinks = append(outlinks, nextURL.String())
+		URLs = append(URLs, nextURL.String())
+	}
+
+	var outlinks []*models.URL
+	for _, extractedURL := range URLs {
+		outlinks = append(outlinks, &models.URL{
+			Raw: extractedURL,
+		})
 	}
-	return outlinks
+	return outlinks, nil
 }
diff --git a/internal/pkg/postprocessor/extractor/s3_test.go b/internal/pkg/postprocessor/extractor/s3_test.go
deleted file mode 100644
index feda3d57..00000000
--- a/internal/pkg/postprocessor/extractor/s3_test.go
+++ /dev/null
@@ -1,170 +0,0 @@
-package extractor
-
-import (
-	"net/http"
-	"net/url"
-	"os"
-	"strings"
-	"testing"
-
-	"github.com/CorentinB/warc/pkg/spooledtempfile"
-	"github.com/internetarchive/Zeno/pkg/models"
-)
-
-// TestIsS3 checks the Server header for known S3 strings.
-func TestIsS3(t *testing.T) {
-	tests := []struct {
-		name   string
-		server string
-		want   bool
-	}{
-		{"AmazonS3", "AmazonS3", true},
-		{"WasabiS3", "WasabiS3", true},
-		{"AliyunOSS", "AliyunOSS", true},
-		{"No match", "Apache", false},
-		{"Partial match", "Amazon", false},
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			// Create a *models.URL with the response Server header set
-			URLObj := &models.URL{}
-
-			URLObj.SetResponse(&http.Response{
-				Header: http.Header{
-					"Server": []string{tt.server},
-				},
-			})
-
-			got := IsS3(URLObj)
-			if got != tt.want {
-				t.Errorf("IsS3(server=%q) = %v, want %v", tt.server, got, tt.want)
-			}
-		})
-	}
-}
-
-func TestS3(t *testing.T) {
-	// This subtest shows a scenario of a valid XML with a single object,
-	// and list-type != 2 => "marker" logic should be used.
-	t.Run("Valid XML with single object, no list-type=2 => marker next link", func(t *testing.T) {
-		xmlBody := `<?xml version="1.0" encoding="UTF-8"?>
-<ListBucketResult>
-	<Contents>
-		<Key>file1.txt</Key>
-		<LastModified>2021-01-01T12:00:00.000Z</LastModified>
-		<Size>123</Size>
-	</Contents>
-	<IsTruncated>false</IsTruncated>
-</ListBucketResult>`
-
-		// Build an http.Request with a query param that is NOT list-type=2
-		reqURL, _ := url.Parse("https://example.com/?someparam=1")
-
-		// Create your models.URL instance.
-		URLObj := &models.URL{}
-		URLObj.SetRequest(&http.Request{URL: reqURL})
-
-		// Likewise, set the HTTP response header using SetResponse.
-		// We want to simulate an S3 server for these tests.
-		URLObj.SetResponse(&http.Response{
-			Header: http.Header{
-				"Server": []string{"AmazonS3"},
-			},
-		})
-
-		spooledTempFile := spooledtempfile.NewSpooledTempFile("test", os.TempDir(), 2048, false, -1)
-		spooledTempFile.Write([]byte(xmlBody))
-
-		URLObj.SetBody(spooledTempFile)
-
-		outlinks, err := S3(URLObj)
-		if err != nil {
-			t.Fatalf("S3() returned unexpected error: %v", err)
-		}
-
-		if len(outlinks) != 2 {
-			t.Fatalf("expected 2 outlinks, got %d", len(outlinks))
-		}
-		expectedOutlinks := []string{
-			"https://example.com/?marker=file1.txt&someparam=1",
-			"https://example.com/file1.txt",
-		}
-		for i, outlink := range outlinks {
-			if outlink.Raw != expectedOutlinks[i] {
-				t.Errorf("expected %s, got %s", expectedOutlinks[i], outlink.Raw)
-			}
-		}
-	})
-
-	// Another subtest example: common prefixes => subfolder links for list-type=2
-	t.Run("Valid XML with common prefixes => subfolder links (list-type=2)", func(t *testing.T) {
-		xmlBody := `<?xml version="1.0" encoding="UTF-8"?>
-<ListBucketResult>
-	<IsTruncated>false</IsTruncated>
-	<CommonPrefixes>
-		<Prefix>folder1/</Prefix>
-		<Prefix>folder2/</Prefix>
-	</CommonPrefixes>
-</ListBucketResult>`
-
-		reqURL, _ := url.Parse("https://example.com/?list-type=2")
-
-		URLObj := &models.URL{}
-		URLObj.SetRequest(&http.Request{URL: reqURL})
-		URLObj.SetResponse(&http.Response{
-			Header: http.Header{
-				"Server": []string{"AmazonS3"},
-			},
-		})
-
-		spooledTempFile := spooledtempfile.NewSpooledTempFile("test", os.TempDir(), 2048, false, -1)
-		spooledTempFile.Write([]byte(xmlBody))
-
-		URLObj.SetBody(spooledTempFile)
-
-		outlinks, err := S3(URLObj)
-		if err != nil {
-			t.Fatalf("S3() returned unexpected error: %v", err)
-		}
-
-		if len(outlinks) != 2 {
-			t.Fatalf("expected 2 outlinks, got %d", len(outlinks))
-		}
-		if !strings.Contains(outlinks[0].Raw, "prefix=folder1%2F") {
-			t.Errorf("expected prefix=folder1/ in outlink, got %s", outlinks[0].Raw)
-		}
-		if !strings.Contains(outlinks[1].Raw, "prefix=folder2%2F") {
-			t.Errorf("expected prefix=folder2/ in outlink, got %s", outlinks[1].Raw)
-		}
-	})
-
-	// Example for invalid XML
-	t.Run("Invalid XML => error", func(t *testing.T) {
-		xmlBody := `<ListBucketResult`
-
-		reqURL, _ := url.Parse("https://example.com/")
-
-		URLObj := &models.URL{}
-		URLObj.SetRequest(&http.Request{URL: reqURL})
-		URLObj.SetResponse(&http.Response{
-			Header: http.Header{
-				"Server": []string{"AmazonS3"},
-			},
-		})
-
-		spooledTempFile := spooledtempfile.NewSpooledTempFile("test", os.TempDir(), 2048, false, -1)
-		spooledTempFile.Write([]byte(xmlBody))
-
-		URLObj.SetBody(spooledTempFile)
-
-		if _, err := S3(URLObj); err == nil {
-			t.Fatal("expected an error for invalid XML, got nil")
-		}
-	})
-}
diff --git a/internal/pkg/postprocessor/extractor/xml.go b/internal/pkg/postprocessor/extractor/xml.go
--- a/internal/pkg/postprocessor/extractor/xml.go
+++ b/internal/pkg/postprocessor/extractor/xml.go
@@ ... @@
 func IsSitemapXML(URL *models.URL) bool {
 	defer URL.RewindBody()
 
-	decoder := xml.NewDecoder(URL.GetBody())
-	for {
-		tok, err := decoder.Token()
-		if err != nil {
-			return false
-		}
-
-		switch t := tok.(type) {
-		// --- NON-ELEMENT tokens ---
-		case xml.Comment:
-			// <!-- ... -->
-			if bytes.Contains(t, sitemapMarker) {
-				return true
-			}
-		case xml.Directive:
-			// <!DOCTYPE ...>
-			if bytes.Contains(t, sitemapMarker) {
-				return true
-			}
-		case xml.ProcInst:
-			// <?xml-stylesheet ...?>
-			// t.Target is string, t.Inst is []byte
-			if bytes.Contains(t.Inst, sitemapMarker) {
-				return true
-			}
-
-		// --- ELEMENT tokens ---
-		case xml.StartElement:
-			// 1) Check element's namespace or local name
-			// e.g. <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
-			// t.Name.Space could be "http://www.sitemaps.org/schemas/sitemap/0.9"
-			// t.Name.Local might be "urlset"
-			//
-			// But in practice, many sitemap docs have the namespace in the default XMLNS,
-			// so we should also check attributes.
-			if strings.Contains(t.Name.Space, string(sitemapMarker)) {
-				return true
-			}
-			if strings.Contains(t.Name.Local, string(sitemapMarker)) {
-				return true
-			}
-
-			// 2) Check attributes (common place for the sitemap XMLNS)
-			for _, attr := range t.Attr {
-				if strings.Contains(attr.Value, string(sitemapMarker)) {
-					return true
-				}
-			}
-
-		case xml.EndElement:
-			// EndElement typically has no textual data, so nothing to check
-			continue
-		}
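+	// The token-walking check above is replaced with a plain byte search: a
+	// response counts as a sitemap if it is served as XML and its body
+	// contains sitemapMarker anywhere.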
+	xmlBody, err := io.ReadAll(URL.GetBody())
+	if err != nil {
+		return false
 	}
-	return false
+
+	return isContentType(URL.GetResponse().Header.Get("Content-Type"), "xml") && bytes.Contains(xmlBody, sitemapMarker)
 }
 
 func XML(URL *models.URL) (assets, outlinks []*models.URL, err error) {
diff --git a/internal/pkg/postprocessor/extractor/xml_test.go b/internal/pkg/postprocessor/extractor/xml_test.go
index 7cce0711..4896a5ff 100644
--- a/internal/pkg/postprocessor/extractor/xml_test.go
+++ b/internal/pkg/postprocessor/extractor/xml_test.go
@@ -4,12 +4,10 @@ import (
 	"bytes"
 	"io"
 	"net/http"
-	"net/url"
 	"os"
 	"strings"
 	"testing"
 
-	"github.com/CorentinB/warc/pkg/spooledtempfile"
 	"github.com/internetarchive/Zeno/internal/pkg/archiver"
 	"github.com/internetarchive/Zeno/pkg/models"
 )
@@ -157,131 +155,3 @@ func TestXML(t *testing.T) {
 	})
 }
-
-// TestIsSitemapXML covers multiple scenarios.
-func TestIsSitemapXML(t *testing.T) {
-	tests := []struct {
-		name    string
-		xmlData string
-		want    bool
-	}{
-		{
-			name: "Valid sitemap XML",
-			xmlData: `
-<?xml version="1.0" encoding="UTF-8"?>
-<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
-	<url><loc>https://example.com/page1</loc></url>
-</urlset>
-	`,
-			want: true,
-		},
-		{
-			name: "Invalid sitemap XML",
-			xmlData: `
-<document>
-	Not a sitemap
-</document>`,
-			want: false,
-		},
-		{
-			name: "Sitemap XML with comment containing marker",
-			xmlData: `
-<!-- http://www.sitemaps.org/schemas/sitemap/0.9 -->
-<document>
-	Not a sitemap
-</document>`,
-			want: true,
-		},
-		{
-			name: "Sitemap XML with directive containing marker",
-			xmlData: `
-<!DOCTYPE document SYSTEM "http://www.sitemaps.org/schemas/sitemap/0.9">
-<document>
-	Not a sitemap
-</document>`,
-			want: true,
-		},
-		{
-			name: "Sitemap XML with processing instruction containing marker",
-			xmlData: `
-<?xml-stylesheet type="text/xsl" href="http://www.sitemaps.org/schemas/sitemap/0.9"?>
-<document>
-	Not a sitemap
-</document>`,
-			want: true,
-		},
-		{
-			name: "Sitemap XML with nested elements containing marker",
-			xmlData: `
-<document>
-	<nested xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">Not a sitemap</nested>
-</document>`,
-			want: true,
-		},
-		{
-			name: "Sitemap XML with attributes containing marker",
-			xmlData: `
-<document schema="http://www.sitemaps.org/schemas/sitemap/0.9">
-	Not a sitemap
-</document>`,
-			want: true,
-		},
-		{
-			name:    "Empty XML content",
-			xmlData: ``,
-			want:    false,
-		},
-		{
-			name: "Large sitemap XML content",
-			xmlData: `<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
-	` + strings.Repeat(`<url><loc>https://example.com/page</loc></url>`, 1000) + `</urlset>`,
-			want: true,
-		},
-		{
-			name: "Sitemap XML with special characters in namespace",
-			xmlData: `
-<?xml version="1.0" encoding="UTF-8"?>
-<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9?param=1&amp;other=2">
-	<url><loc>https://example.com/page</loc></url>
-</urlset>
-	`,
-			want: true,
-		},
-		{
-			name: "Sitemap XML with special characters in URLs",
-			xmlData: `
-<?xml version="1.0" encoding="UTF-8"?>
-<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
-	<url><loc>https://example.com/page?param=1&amp;other=2</loc></url>
-</urlset>
-	`,
-			want: true,
-		},
-	}
-
-	for _, tc := range tests {
-		t.Run(tc.name, func(t *testing.T) {
-			// Construct a minimal FakeURL with your test data as body
-			URLObj := &models.URL{}
-			URLObj.SetRequest(&http.Request{URL: &url.URL{Scheme: "http", Host: "example.com"}})
-
-			// Likewise, set the HTTP response header using SetResponse.
-			// We want to simulate an S3 server for these tests.
- URLObj.SetResponse(&http.Response{ - Header: http.Header{ - "Server": []string{"AmazonS3"}, - }, - }) - - spooledTempFile := spooledtempfile.NewSpooledTempFile("test", os.TempDir(), 2048, false, -1) - spooledTempFile.Write([]byte(tc.xmlData)) - - URLObj.SetBody(spooledTempFile) - - got := IsSitemapXML(URLObj) - if got != tc.want { - t.Errorf("IsSitemapXML(%q) = %v, want %v", tc.xmlData, got, tc.want) - } - }) - } -} diff --git a/internal/pkg/postprocessor/item.go b/internal/pkg/postprocessor/item.go index d6a3702e..596f6c20 100644 --- a/internal/pkg/postprocessor/item.go +++ b/internal/pkg/postprocessor/item.go @@ -1,8 +1,6 @@ package postprocessor import ( - "strings" - "github.com/google/uuid" "github.com/internetarchive/Zeno/internal/pkg/config" "github.com/internetarchive/Zeno/internal/pkg/log" @@ -77,10 +75,6 @@ func postprocessItem(item *models.Item) []*models.Item { logger.Debug("item is a child and it's depth (without redirections) is more than 2", "item_id", item.GetShortID()) item.SetStatus(models.ItemCompleted) return outlinks - } else if !domainscrawl.Enabled() && (item.GetDepthWithoutRedirections() == 1 && strings.Contains(item.GetURL().GetMIMEType().String(), "html")) { - logger.Debug("HTML got extracted as asset, skipping", "item_id", item.GetShortID()) - item.SetStatus(models.ItemCompleted) - return outlinks } else if config.Get().DisableAssetsCapture && !domainscrawl.Enabled() { logger.Debug("assets capture and domains crawl are disabled", "item_id", item.GetShortID()) item.SetStatus(models.ItemCompleted)