Skip to content

Commit

Permalink
Domains crawl refactor (#188)
Browse files Browse the repository at this point in the history
* postprocess: add shouldExtractAssets function to hold the logic

* rename postprocess.go to item.go

* domainscrawl: add package with base functions and unit tests

* config: remove error return from compileRegexes as it always returned nil and regexp.MustCompile will panic if a regexp can't compile

* cmd&config: handle domains crawl flag and passing of values

* archiver: rationalized ProcessBody arguments and adapted it to the changes of DomainsCrawl config element

* domainscrawl: add Enabled() func to check if the package is enabled (has received elements)

* domainscrawl: add more tests

* postprocessor: refactor the conditional logic for outlinks extraction with the new domainscrawl package

* cmd: change domains-crawl flag desc

* postprocessor: correct the completion of an item based on the depth without redirection (assets of assets)

* domainscrawl: make Match() accept a string and parse the URL with fasturl, because the postprocessor will never have net/url.URL parsed URLs to check against domainscrawl

* archiver: reverted processBody() changes

* hotfix: protect the pause package from a segfault
  • Loading branch information
equals215 authored Jan 28, 2025
1 parent fec60c0 commit 8b55ea8
Show file tree
Hide file tree
Showing 12 changed files with 490 additions and 22 deletions.
2 changes: 1 addition & 1 deletion cmd/get.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ func getCMDsFlags(getCmd *cobra.Command) {
getCmd.PersistentFlags().Int("max-redirect", 20, "Specifies the maximum number of redirections to follow for a resource.")
getCmd.PersistentFlags().Int("max-retry", 5, "Number of retry if error happen when executing HTTP request.")
getCmd.PersistentFlags().Int("http-timeout", -1, "Number of seconds to wait before timing out a request.")
getCmd.PersistentFlags().Bool("domains-crawl", false, "If this is turned on, seeds will be treated as domains to crawl, therefore same-domain outlinks will be added to the queue as hop=0.")
getCmd.PersistentFlags().StringSlice("domains-crawl", []string{}, "Naive domains, full URLs or regexp to match against any URL to determine hop behaviour for outlinks. If an outlink URL is matched it will be queued to crawl with a hop of 0. This flag helps crawling entire domains while doing non-focused crawls.")
getCmd.PersistentFlags().StringSlice("disable-html-tag", []string{}, "Specify HTML tag to not extract assets from")
getCmd.PersistentFlags().Bool("capture-alternate-pages", false, "If turned on, <link> HTML tags with \"alternate\" values for their \"rel\" attribute will be archived.")
getCmd.PersistentFlags().StringSlice("exclude-host", []string{}, "Exclude a specific host from the crawl, note that it will not exclude the domain if it is encountered as an asset for another web page.")
Expand Down
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ go 1.23.5

require (
github.com/CorentinB/warc v0.8.65
github.com/ImVexed/fasturl v0.0.0-20230304231329-4e41488060f3
github.com/PuerkitoBio/goquery v1.10.1
github.com/ada-url/goada v0.0.0-20250104020233-00cbf4dc9da1
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc
Expand Down
8 changes: 8 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
github.com/CorentinB/warc v0.8.65 h1:nbmL8TBZVfzLK75K1heSOcEMRj9fQHSkaexVs578it8=
github.com/CorentinB/warc v0.8.65/go.mod h1:A9Ds2kT59j2Bzbe5pDZ925XmVODwq9fAlmSSS45SRlk=
github.com/ImVexed/fasturl v0.0.0-20230304231329-4e41488060f3 h1:ClzzXMDDuUbWfNNZqGeYq4PnYOlwlOVIvSyNaIy0ykg=
github.com/ImVexed/fasturl v0.0.0-20230304231329-4e41488060f3/go.mod h1:we0YA5CsBbH5+/NUzC/AlMmxaDtWlXeNsqrwXjTzmzA=
github.com/PuerkitoBio/goquery v1.10.1 h1:Y8JGYUkXWTGRB6Ars3+j3kN0xg1YqqlwvdTV8WTFQcU=
github.com/PuerkitoBio/goquery v1.10.1/go.mod h1:IYiHrOMps66ag56LEH7QYDDupKXyo5A8qrjIx3ZtujY=
github.com/ada-url/goada v0.0.0-20250104020233-00cbf4dc9da1 h1:K54lYH7ZY/NHweMd9/R82dHaFelQQmwjEhUfwUqCqEk=
Expand All @@ -13,9 +15,11 @@ github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkY
github.com/cloudflare/circl v1.5.0 h1:hxIWksrX6XN5a1L2TI/h53AGPhNHoUBo+TD1ms9+pys=
github.com/cloudflare/circl v1.5.0/go.mod h1:uddAzsPgqdMAYatqJ0lsjX1oECcQLIlRpzZh3pJrofs=
github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM=
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/dvyukov/go-fuzz v0.0.0-20200318091601-be3528f3a813/go.mod h1:11Gm+ccJnvAhCNLlf5+cS9KjtbaD5I5zaZpFMsTHWTw=
github.com/elastic/go-elasticsearch v0.0.0 h1:Pd5fqOuBxKxv83b0+xOAJDAkziWYwFinWnBO0y+TZaA=
github.com/elastic/go-elasticsearch v0.0.0/go.mod h1:TkBSJBuTyFdBnrNqoPc54FN0vKf5c04IdM4zuStJ7xg=
github.com/elastic/go-elasticsearch/v7 v7.17.10 h1:TCQ8i4PmIJuBunvBS6bwT2ybzVFxxUhhltAs3Gyu1yo=
Expand Down Expand Up @@ -97,6 +101,7 @@ github.com/philippgille/gokv/test v0.7.0 h1:0wBKnKaFZlSeHxLXcmUJqK//IQGUMeu+o8B8
github.com/philippgille/gokv/test v0.7.0/go.mod h1:TP/VzO/qAoi6njsfKnRpXKno0hRuzD5wsLnHhtUcVkY=
github.com/philippgille/gokv/util v0.7.0 h1:5avUK/a3aSj/aWjhHv4/FkqgMon2B7k2BqFgLcR+DYg=
github.com/philippgille/gokv/util v0.7.0/go.mod h1:i9KLHbPxGiHLMhkix/CcDQhpPbCkJy5BkW+RKgwDHMo=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U=
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/refraction-networking/utls v1.6.7 h1:zVJ7sP1dJx/WtVuITug3qYUq034cDq9B2MR1K67ULZM=
Expand Down Expand Up @@ -126,8 +131,10 @@ github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA=
github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
github.com/spf13/viper v1.19.0 h1:RWq5SEjt8o25SROyN3z2OrDB9l7RPd3lwTWU8EcEdcI=
github.com/spf13/viper v1.19.0/go.mod h1:GQUN9bilAbhU/jgc1bKs99f/suXKeUMct8Adx5+Ntkg=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY=
github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA=
github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA=
github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
github.com/subosito/gotenv v1.6.0 h1:9NlTDc1FTs4qu0DDq7AEtTPNw6SVm7uBMsUCUjABIf8=
Expand Down Expand Up @@ -236,6 +243,7 @@ gopkg.in/ini.v1 v1.67.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k=
gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 h1:uRGJdciOHaEIrze2W8Q3AKkepLTh2hOroT7a+7czfdQ=
gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw=
gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
mvdan.cc/xurls/v2 v2.6.0 h1:3NTZpeTxYVWNSokW3MKeyVkz/j7uYXYiMtXRUfmjbgI=
Expand Down
3 changes: 2 additions & 1 deletion internal/pkg/archiver/archiver.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import (
"github.com/internetarchive/Zeno/internal/pkg/config"
"github.com/internetarchive/Zeno/internal/pkg/controler/pause"
"github.com/internetarchive/Zeno/internal/pkg/log"
"github.com/internetarchive/Zeno/internal/pkg/postprocessor/domainscrawl"
"github.com/internetarchive/Zeno/internal/pkg/stats"
"github.com/internetarchive/Zeno/pkg/models"
)
Expand Down Expand Up @@ -219,7 +220,7 @@ func archive(seed *models.Item) {
item.GetURL().SetResponse(resp)

// Process the body
err = ProcessBody(item.GetURL(), config.Get().DisableAssetsCapture, config.Get().DomainsCrawl, config.Get().MaxHops, config.Get().WARCTempDir)
err = ProcessBody(item.GetURL(), config.Get().DisableAssetsCapture, domainscrawl.Enabled(), config.Get().MaxHops, config.Get().WARCTempDir)
if err != nil {
logger.Error("unable to process body", "err", err.Error(), "item_id", item.GetShortID(), "seed_id", seed.GetShortID(), "depth", item.GetDepth(), "hops", item.GetURL().GetHops())
item.SetStatus(models.ItemFailed)
Expand Down
20 changes: 13 additions & 7 deletions internal/pkg/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
"sync"

"github.com/google/uuid"
"github.com/internetarchive/Zeno/internal/pkg/postprocessor/domainscrawl"
"github.com/internetarchive/Zeno/internal/pkg/utils"
"github.com/spf13/pflag"
"github.com/spf13/viper"
Expand Down Expand Up @@ -70,7 +71,7 @@ type Config struct {
JSON bool `mapstructure:"json"`
API bool `mapstructure:"api"`
Prometheus bool `mapstructure:"prometheus"`
DomainsCrawl bool `mapstructure:"domains-crawl"`
DomainsCrawl []string `mapstructure:"domains-crawl"`
CaptureAlternatePages bool `mapstructure:"capture-alternate-pages"`
DisableLocalDedupe bool `mapstructure:"disable-local-dedupe"`
CertValidation bool `mapstructure:"cert-validation"`
Expand Down Expand Up @@ -273,19 +274,24 @@ func GenerateCrawlConfig() error {
}

slog.Info("Compiling exclusion regexes", "regexes", len(regexes))
compiledRegexes, err := compileRegexes(regexes)
if err != nil {
return err
}
compiledRegexes := compileRegexes(regexes)

config.ExclusionRegexes = append(config.ExclusionRegexes, compiledRegexes...)
}
}

if len(config.DomainsCrawl) > 0 {
slog.Info("Domains crawl enabled", "domains/regex", config.DomainsCrawl)
err := domainscrawl.AddElements(config.DomainsCrawl)
if err != nil {
panic(err)
}
}

return nil
}

func compileRegexes(regexes []string) ([]*regexp.Regexp, error) {
func compileRegexes(regexes []string) []*regexp.Regexp {
var compiledRegexes []*regexp.Regexp

for _, regex := range regexes {
Expand All @@ -295,7 +301,7 @@ func compileRegexes(regexes []string) ([]*regexp.Regexp, error) {
compiledRegexes = append(compiledRegexes, compiledRegex)
}

return compiledRegexes, nil
return compiledRegexes
}

func readLocalExclusionFile(file string) (regexes []string, err error) {
Expand Down
5 changes: 5 additions & 0 deletions internal/pkg/controler/pause/pause.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,11 @@ func Pause(message ...string) {
if !swap {
return
}

if len(message) == 0 {
message = append(message, "Paused")
}

manager.message = message[0]

manager.subscribers.Range(func(key, _ interface{}) bool {
Expand Down
9 changes: 9 additions & 0 deletions internal/pkg/controler/pause/pause_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,12 @@ import (
"sync/atomic"
"testing"
"time"

"github.com/internetarchive/Zeno/internal/pkg/stats"
)

func TestBasicPauseResume(t *testing.T) {
stats.Init()
manager = &pauseManager{}

var wg sync.WaitGroup
Expand Down Expand Up @@ -70,6 +73,7 @@ func TestBasicPauseResume(t *testing.T) {
}

func TestMultipleSubscribers(t *testing.T) {
stats.Init()
manager = &pauseManager{}
const numSubscribers = 10
var wg sync.WaitGroup
Expand Down Expand Up @@ -144,6 +148,7 @@ func TestMultipleSubscribers(t *testing.T) {
}

func TestSubscriberUnsubscribeDuringPause(t *testing.T) {
stats.Init()
manager = &pauseManager{}
var wg sync.WaitGroup
wg.Add(1)
Expand Down Expand Up @@ -194,6 +199,7 @@ func TestSubscriberUnsubscribeDuringPause(t *testing.T) {
}

func TestConcurrentPauseResume(t *testing.T) {
stats.Init()
manager = &pauseManager{}
const numSubscribers = 5
const numCycles = 10
Expand Down Expand Up @@ -287,6 +293,7 @@ func TestConcurrentPauseResume(t *testing.T) {
}

func TestPauseResumeWithUnsubscribe(t *testing.T) {
stats.Init()
manager = &pauseManager{}
var wg sync.WaitGroup
wg.Add(1)
Expand Down Expand Up @@ -348,6 +355,7 @@ func TestPauseResumeWithUnsubscribe(t *testing.T) {
}

func TestNoSubscribers(t *testing.T) {
stats.Init()
manager = &pauseManager{}
// Call Pause() and Resume() when there are no subscribers.
// If no panic occurs, the test passes.
Expand All @@ -356,6 +364,7 @@ func TestNoSubscribers(t *testing.T) {
}

func TestPauseResumeE2E(t *testing.T) {
stats.Init()
manager = &pauseManager{}
var workCounter int32 // Counts the amount of work done.
var wg sync.WaitGroup
Expand Down
5 changes: 5 additions & 0 deletions internal/pkg/postprocessor/assets.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package postprocessor

import (
"github.com/internetarchive/Zeno/internal/pkg/config"
"github.com/internetarchive/Zeno/internal/pkg/log"
"github.com/internetarchive/Zeno/internal/pkg/postprocessor/extractor"
"github.com/internetarchive/Zeno/internal/pkg/postprocessor/sitespecific/ina"
Expand Down Expand Up @@ -76,3 +77,7 @@ func extractAssets(item *models.Item) (assets []*models.URL, err error) {

return assets, nil
}

// shouldExtractAssets reports whether asset extraction should run for item:
// assets capture must not be globally disabled, and the item's URL must have
// a body to extract from.
func shouldExtractAssets(item *models.Item) bool {
	if config.Get().DisableAssetsCapture {
		return false
	}
	return item.GetURL().GetBody() != nil
}
145 changes: 145 additions & 0 deletions internal/pkg/postprocessor/domainscrawl/domainscrawl.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
// Package domainscrawl is a postprocessing component that parse domains from a given input and stores them for later matching.
// It can store naive domains, full URLs, and regex patterns. It can then check if a given URL matches any of the stored patterns.
package domainscrawl

import (
"net/url"
"regexp"
"strings"
"sync"

"github.com/ImVexed/fasturl"
)

// matchEngine stores the registered crawl targets, split by kind, behind a
// single RWMutex so that Match can be called concurrently while AddElements
// and Reset take the write lock.
type matchEngine struct {
	sync.RWMutex
	enabled bool              // set once AddElements has received at least one element
	regexes []*regexp.Regexp  // compiled regex patterns matched against the raw URL string
	domains []string          // naive domains, e.g. "example.com"
	urls    []url.URL         // full URLs parsed with net/url
}

// globalMatcher is the package-level singleton that every exported function
// in this package operates on.
var (
	globalMatcher = &matchEngine{
		enabled: false,
		regexes: make([]*regexp.Regexp, 0),
		domains: make([]string, 0),
		urls:    make([]url.URL, 0),
	}
)

// Reset wipes every stored pattern and disables the matcher, returning the
// package to its initial (pristine) state.
func Reset() {
	globalMatcher.Lock()
	defer globalMatcher.Unlock()

	globalMatcher.enabled = false
	globalMatcher.domains = make([]string, 0)
	globalMatcher.urls = make([]url.URL, 0)
	globalMatcher.regexes = make([]*regexp.Regexp, 0)
}

// Enabled reports whether the domainscrawl matcher has received any elements
// and is therefore active.
func Enabled() bool {
	globalMatcher.RLock()
	active := globalMatcher.enabled
	globalMatcher.RUnlock()
	return active
}

// AddElements heuristically classifies each element as a full URL, a naive
// domain, or a regex pattern and stores it for later matching. The only
// possible error is a string that falls through to the regex case and does
// not compile. Calling this enables the matcher.
func AddElements(elements []string) error {
	globalMatcher.Lock()
	defer globalMatcher.Unlock()

	globalMatcher.enabled = true

	for _, element := range elements {
		// A string that parses with both a scheme and a host is a full URL.
		if parsed, err := url.Parse(element); err == nil && parsed.Scheme != "" && parsed.Host != "" {
			globalMatcher.urls = append(globalMatcher.urls, *parsed)
			continue
		}

		// A bare domain such as "example.com".
		if isNaiveDomain(element) {
			globalMatcher.domains = append(globalMatcher.domains, element)
			continue
		}

		// Everything else must compile as a regular expression.
		re, err := regexp.Compile(element)
		if err != nil {
			return err
		}
		globalMatcher.regexes = append(globalMatcher.regexes, re)
	}

	return nil
}

// Match reports whether rawURL matches any stored naive domain, full URL, or
// regex pattern. URLs that fasturl cannot parse never match.
func Match(rawURL string) bool {
	parsed, err := fasturl.ParseURL(rawURL)
	if err != nil {
		return false
	}

	globalMatcher.RLock()
	defer globalMatcher.RUnlock()

	// Naive domains match the host exactly or as a parent of a subdomain.
	for _, domain := range globalMatcher.domains {
		if isSubdomainOrExactMatch(parsed.Host, domain) {
			return true
		}
	}

	// Stored full URLs match on the exact string, or — when the stored URL
	// carries no path, query, or fragment — greedily on the (sub)domain.
	for _, stored := range globalMatcher.urls {
		if stored.String() == rawURL {
			return true
		}
		if stored.Path == "" && stored.RawQuery == "" && stored.Fragment == "" && isSubdomainOrExactMatch(parsed.Host, stored.Host) {
			return true
		}
	}

	// Regex patterns are applied to the raw URL string.
	for _, re := range globalMatcher.regexes {
		if re.MatchString(rawURL) {
			return true
		}
	}

	return false
}

// isNaiveDomain reports whether s looks like a bare domain such as
// "example.com": no scheme, path, query, fragment, or spaces, and at least
// one dot.
func isNaiveDomain(s string) bool {
	// Any of these markers means s is a URL, a path, or free text.
	for _, marker := range []string{"://", "/", "?", "#", " "} {
		if strings.Contains(s, marker) {
			return false
		}
	}
	return strings.Contains(s, ".")
}

// isSubdomainOrExactMatch reports whether host equals domain or is a
// subdomain of it (e.g. "sub.example.com" matches "example.com"). The
// dot-prefixed suffix check prevents "notexample.com" from matching
// "example.com".
func isSubdomainOrExactMatch(host, domain string) bool {
	return host == domain || strings.HasSuffix(host, "."+domain)
}
Loading

0 comments on commit 8b55ea8

Please sign in to comment.