Skip to content

Commit

Permalink
Revert "Stream based extractors" (#194)
Browse files Browse the repository at this point in the history
* Revert "add: use Zeno's User-Agent and custom HTTP client when requesting exclusion file"

This reverts commit ddb2e89.

* Revert "fix: avoid 2-layer assets extraction when HTML is wrongly discovered as asset"

This reverts commit 82cd8ad.

* Revert "add: streaming postprocessing for XML"

This reverts commit d8c8ba5.

* Revert "add: TestS3 + S3 extraction refactoring"

This reverts commit 632fafc.

* Revert "add: TestIsS3"

This reverts commit 59be6fa.

* Revert "add: streaming postprocessing for S3"

This reverts commit 7c601d3.

* Revert "add: streaming postprocessing for JSON"

This reverts commit bd12395.
  • Loading branch information
equals215 authored Feb 2, 2025
1 parent ddb2e89 commit 43c8d63
Show file tree
Hide file tree
Showing 8 changed files with 61 additions and 465 deletions.
14 changes: 1 addition & 13 deletions internal/pkg/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ import (
"regexp"
"strings"
"sync"
"time"

"github.com/google/uuid"
"github.com/internetarchive/Zeno/internal/pkg/postprocessor/domainscrawl"
Expand Down Expand Up @@ -326,18 +325,7 @@ func readLocalExclusionFile(file string) (regexes []string, err error) {
}

func readRemoteExclusionFile(URL string) (regexes []string, err error) {
httpClient := &http.Client{
Timeout: time.Second * 5,
}

req, err := http.NewRequest(http.MethodGet, URL, nil)
if err != nil {
return regexes, err
}

req.Header.Set("User-Agent", config.UserAgent)

resp, err := httpClient.Do(req)
resp, err := http.Get(URL)
if err != nil {
return regexes, err
}
Expand Down
9 changes: 4 additions & 5 deletions internal/pkg/postprocessor/extractor/html.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
package extractor

import (
"encoding/json"
"regexp"
"strconv"
"strings"
Expand All @@ -19,7 +18,7 @@ var (
)

func IsHTML(URL *models.URL) bool {
return isContentType(URL.GetResponse().Header.Get("Content-Type"), "html") || strings.Contains(URL.GetMIMEType().String(), "html")
return isContentType(URL.GetResponse().Header.Get("Content-Type"), "html")
}

func HTMLOutlinks(item *models.Item) (outlinks []*models.URL, err error) {
Expand Down Expand Up @@ -103,7 +102,7 @@ func HTMLAssets(item *models.Item) (assets []*models.URL, err error) {
document.Find("[data-item]").Each(func(index int, i *goquery.Selection) {
dataItem, exists := i.Attr("data-item")
if exists {
URLsFromJSON, err := GetURLsFromJSON(json.NewDecoder(strings.NewReader(dataItem)))
URLsFromJSON, err := GetURLsFromJSON([]byte(dataItem))
if err != nil {
logger.Debug("unable to extract URLs from JSON in data-item attribute", "err", err, "url", item.GetURL().String(), "item", item.GetShortID())
} else {
Expand Down Expand Up @@ -223,7 +222,7 @@ func HTMLAssets(item *models.Item) (assets []*models.URL, err error) {
scriptType, exists := i.Attr("type")
if exists {
if scriptType == "application/json" {
URLsFromJSON, err := GetURLsFromJSON(json.NewDecoder(strings.NewReader(i.Text())))
URLsFromJSON, err := GetURLsFromJSON([]byte(i.Text()))
if err != nil {
// TODO: maybe add back when https://github.com/internetarchive/Zeno/issues/147 is fixed
// c.Log.Debug("unable to extract URLs from JSON in script tag", "error", err, "url", URL)
Expand Down Expand Up @@ -282,7 +281,7 @@ func HTMLAssets(item *models.Item) (assets []*models.URL, err error) {
}

if len(jsonContent[1]) > payloadEndPosition {
URLsFromJSON, err := GetURLsFromJSON(json.NewDecoder(strings.NewReader(jsonContent[1][:payloadEndPosition+1])))
URLsFromJSON, err := GetURLsFromJSON([]byte(jsonContent[1][:payloadEndPosition+1]))
if err != nil {
// TODO: maybe add back when https://github.com/internetarchive/Zeno/issues/147 is fixed
// c.Log.Debug("unable to extract URLs from JSON in script tag", "error", err, "url", URL)
Expand Down
15 changes: 10 additions & 5 deletions internal/pkg/postprocessor/extractor/json.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,25 @@ package extractor

import (
"encoding/json"
"strings"

"github.com/ImVexed/fasturl"
"github.com/internetarchive/Zeno/pkg/models"
)

func IsJSON(URL *models.URL) bool {
return isContentType(URL.GetResponse().Header.Get("Content-Type"), "json") || strings.Contains(URL.GetMIMEType().String(), "json")
return isContentType(URL.GetResponse().Header.Get("Content-Type"), "json")
}

func JSON(URL *models.URL) (assets, outlinks []*models.URL, err error) {
defer URL.RewindBody()

rawURLs, err := GetURLsFromJSON(json.NewDecoder(URL.GetBody()))
bodyBytes := make([]byte, URL.GetBody().Len())
_, err = URL.GetBody().Read(bodyBytes)
if err != nil {
return nil, nil, err
}

rawURLs, err := GetURLsFromJSON(bodyBytes)
if err != nil {
return nil, nil, err
}
Expand All @@ -36,9 +41,9 @@ func JSON(URL *models.URL) (assets, outlinks []*models.URL, err error) {
return assets, outlinks, nil
}

func GetURLsFromJSON(decoder *json.Decoder) ([]string, error) {
func GetURLsFromJSON(body []byte) ([]string, error) {
var data interface{}
err := decoder.Decode(&data)
err := json.Unmarshal(body, &data)
if err != nil {
return nil, err
}
Expand Down
108 changes: 40 additions & 68 deletions internal/pkg/postprocessor/extractor/s3.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package extractor
import (
"encoding/xml"
"fmt"
"io"
"net/url"

"github.com/internetarchive/Zeno/internal/pkg/utils"
Expand Down Expand Up @@ -36,115 +37,86 @@ type S3Object struct {
}

type CommonPrefix struct {
Prefix []string `xml:"Prefix"`
Prefix string `xml:"Prefix"`
}

// IsS3 reports whether the response attached to URL looks like it was served
// by an S3 server. It reads the "Server" header of URL's response and passes
// it, together with the package-level validS3Servers list, to
// utils.StringContainsSliceElements, returning that result unchanged.
//
// NOTE(review): presumably the helper returns true when the header contains
// any of the listed server names — confirm against its implementation.
// NOTE(review): URL.GetResponse() is dereferenced without a nil check; confirm
// callers only invoke IsS3 once a response has been recorded on URL.
func IsS3(URL *models.URL) bool {
return utils.StringContainsSliceElements(URL.GetResponse().Header.Get("Server"), validS3Servers)
}

// S3 decides which helper to call based on the query param: old style (no list-type=2) vs. new style (list-type=2)
// S3 takes an initial response and returns the URLs of either files or prefixes at the current level,
// plus a continuation URL if more results exist.
func S3(URL *models.URL) ([]*models.URL, error) {
defer URL.RewindBody()

// Decode XML result
bodyBytes, err := io.ReadAll(URL.GetBody())
if err != nil {
return nil, fmt.Errorf("error reading response body: %v", err)
}

var result S3ListBucketResult
if err := xml.NewDecoder(URL.GetBody()).Decode(&result); err != nil {
return nil, fmt.Errorf("error decoding S3 XML: %v", err)
if err := xml.Unmarshal(bodyBytes, &result); err != nil {
return nil, fmt.Errorf("error parsing XML: %v", err)
}

// Prepare base data
// Extract base URL from the response URL
reqURL := URL.GetRequest().URL
listType := reqURL.Query().Get("list-type")

// Build https://<host> as the base for direct file links
baseStr := fmt.Sprintf("https://%s", reqURL.Host)
parsedBase, err := url.Parse(baseStr)
requestQuery := reqURL.Query()
baseURL := fmt.Sprintf("https://%s", reqURL.Host)
parsedBase, err := url.Parse(baseURL)
if err != nil {
return nil, fmt.Errorf("invalid base URL: %v", err)
}

var outlinkStrings []string

// Delegate to old style or new style
if listType != "2" {
// Old style S3 listing, uses marker
outlinkStrings = s3Legacy(reqURL, parsedBase, result)
} else {
// New style listing (list-type=2), uses continuation token and/or CommonPrefixes
outlinkStrings = s3V2(reqURL, parsedBase, result)
}

// Convert from []string -> []*models.URL
var outlinks []*models.URL
for _, link := range outlinkStrings {
outlinks = append(outlinks, &models.URL{Raw: link})
}
return outlinks, nil
}

// s3Legacy handles the old ListObjects style, which uses `marker` for pagination.
func s3Legacy(reqURL *url.URL, parsedBase *url.URL, result S3ListBucketResult) []string {
var outlinks []string
var URLs []string

// If there are objects in <Contents>, create a "next page" URL using `marker`
if len(result.Contents) > 0 {
lastKey := result.Contents[len(result.Contents)-1].Key
// Ensure we can add marker
// ListObjects
if requestQuery.Get("list-type") != "2" && len(result.Contents) > 0 {
// If we can, iterate through S3 using the marker field
nextURL := *reqURL
q := nextURL.Query()
q.Set("marker", lastKey)
q.Set("marker", result.Contents[len(result.Contents)-1].Key)
nextURL.RawQuery = q.Encode()
outlinks = append(outlinks, nextURL.String())
}

// Produce direct file links for each object
for _, obj := range result.Contents {
if obj.Size > 0 {
fileURL := *parsedBase
fileURL.Path += "/" + obj.Key
outlinks = append(outlinks, fileURL.String())
}
URLs = append(URLs, nextURL.String())
}

return outlinks
}

// s3V2 handles the new ListObjectsV2 style, which uses `continuation-token` and can return CommonPrefixes.
func s3V2(reqURL *url.URL, parsedBase *url.URL, result S3ListBucketResult) []string {
var outlinks []string

// If we have common prefixes => "subfolders"
// If we are using list-type 2/ListObjectsV2
if len(result.CommonPrefixes) > 0 {
for _, prefix := range result.CommonPrefixes {
// Create a URL for each common prefix (subfolder)
for _, p := range prefix.Prefix {
nextURL := *reqURL
q := nextURL.Query()
q.Set("prefix", p)
nextURL.RawQuery = q.Encode()
outlinks = append(outlinks, nextURL.String())
}
nextURL := *reqURL
q := nextURL.Query()
q.Set("prefix", prefix.Prefix)
nextURL.RawQuery = q.Encode()
URLs = append(URLs, nextURL.String())
}
} else {
// Otherwise, we have actual objects in <Contents>
// Otherwise return file URLs
for _, obj := range result.Contents {
if obj.Size > 0 {
fileURL := *parsedBase
fileURL.Path += "/" + obj.Key
outlinks = append(outlinks, fileURL.String())
URLs = append(URLs, fileURL.String())
}
}
}

// If truncated => add a link with continuation-token
// If there's a continuation token, add the continuation URL
if result.IsTruncated && result.NextContinuationToken != "" {
nextURL := *reqURL
q := nextURL.Query()
q.Set("continuation-token", result.NextContinuationToken)
nextURL.RawQuery = q.Encode()
outlinks = append(outlinks, nextURL.String())
URLs = append(URLs, nextURL.String())
}

var outlinks []*models.URL
for _, extractedURL := range URLs {
outlinks = append(outlinks, &models.URL{
Raw: extractedURL,
})
}

return outlinks
return outlinks, nil
}
Loading

0 comments on commit 43c8d63

Please sign in to comment.