Skip to content

Commit

Permalink
Domains crawl refactor (#188)
Browse files Browse the repository at this point in the history
* postprocess: add shouldExtractAssets function to hold the logic

* rename postprocess.go to item.go

* domainscrawl: add package with base functions and unit tests

* config: remove error return from compileRegexes as it always returned nil and regexp.MustCompile will panic if a regexp can't compile

* cmd&config: handle domains crawl flag and passing of values

* archiver: rationalized ProcessBody arguments and adapted it to the changes of DomainsCrawl config element

* domainscrawl: add Enabled() func to check if the package is enabled (has received elements)

* domainscrawl: add more tests

* postprocessor: refactor the conditional logic for outlinks extraction with the new domainscrawl package

* cmd: change domains-crawl flag desc

* postprocessor: correct the completion of an item based on the depth without redirection (assets of assets)

* domainscrawl: make Match() accept a string and parse the URL with fasturl, because the postprocessor will never have net/url.URL parsed URLs to check against domainscrawl

* archiver: reverted processBody() changes

* hotfix: protect the pause package from a segfault
  • Loading branch information
equals215 authored Jan 28, 2025
1 parent fec60c0 commit 8b55ea8
Show file tree
Hide file tree
Showing 12 changed files with 490 additions and 22 deletions.
2 changes: 1 addition & 1 deletion cmd/get.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ func getCMDsFlags(getCmd *cobra.Command) {
getCmd.PersistentFlags().Int("max-redirect", 20, "Specifies the maximum number of redirections to follow for a resource.")
getCmd.PersistentFlags().Int("max-retry", 5, "Number of retry if error happen when executing HTTP request.")
getCmd.PersistentFlags().Int("http-timeout", -1, "Number of seconds to wait before timing out a request.")
getCmd.PersistentFlags().Bool("domains-crawl", false, "If this is turned on, seeds will be treated as domains to crawl, therefore same-domain outlinks will be added to the queue as hop=0.")
getCmd.PersistentFlags().StringSlice("domains-crawl", []string{}, "Naive domains, full URLs or regexp to match against any URL to determine hop behaviour for outlinks. If an outlink URL is matched it will be queued to crawl with a hop of 0. This flag helps crawling entire domains while doing non-focused crawls.")
getCmd.PersistentFlags().StringSlice("disable-html-tag", []string{}, "Specify HTML tag to not extract assets from")
getCmd.PersistentFlags().Bool("capture-alternate-pages", false, "If turned on, <link> HTML tags with \"alternate\" values for their \"rel\" attribute will be archived.")
getCmd.PersistentFlags().StringSlice("exclude-host", []string{}, "Exclude a specific host from the crawl, note that it will not exclude the domain if it is encountered as an asset for another web page.")
Expand Down
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ go 1.23.5

require (
github.com/CorentinB/warc v0.8.65
github.com/ImVexed/fasturl v0.0.0-20230304231329-4e41488060f3
github.com/PuerkitoBio/goquery v1.10.1
github.com/ada-url/goada v0.0.0-20250104020233-00cbf4dc9da1
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc
Expand Down
8 changes: 8 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
github.com/CorentinB/warc v0.8.65 h1:nbmL8TBZVfzLK75K1heSOcEMRj9fQHSkaexVs578it8=
github.com/CorentinB/warc v0.8.65/go.mod h1:A9Ds2kT59j2Bzbe5pDZ925XmVODwq9fAlmSSS45SRlk=
github.com/ImVexed/fasturl v0.0.0-20230304231329-4e41488060f3 h1:ClzzXMDDuUbWfNNZqGeYq4PnYOlwlOVIvSyNaIy0ykg=
github.com/ImVexed/fasturl v0.0.0-20230304231329-4e41488060f3/go.mod h1:we0YA5CsBbH5+/NUzC/AlMmxaDtWlXeNsqrwXjTzmzA=
github.com/PuerkitoBio/goquery v1.10.1 h1:Y8JGYUkXWTGRB6Ars3+j3kN0xg1YqqlwvdTV8WTFQcU=
github.com/PuerkitoBio/goquery v1.10.1/go.mod h1:IYiHrOMps66ag56LEH7QYDDupKXyo5A8qrjIx3ZtujY=
github.com/ada-url/goada v0.0.0-20250104020233-00cbf4dc9da1 h1:K54lYH7ZY/NHweMd9/R82dHaFelQQmwjEhUfwUqCqEk=
Expand All @@ -13,9 +15,11 @@ github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkY
github.com/cloudflare/circl v1.5.0 h1:hxIWksrX6XN5a1L2TI/h53AGPhNHoUBo+TD1ms9+pys=
github.com/cloudflare/circl v1.5.0/go.mod h1:uddAzsPgqdMAYatqJ0lsjX1oECcQLIlRpzZh3pJrofs=
github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM=
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/dvyukov/go-fuzz v0.0.0-20200318091601-be3528f3a813/go.mod h1:11Gm+ccJnvAhCNLlf5+cS9KjtbaD5I5zaZpFMsTHWTw=
github.com/elastic/go-elasticsearch v0.0.0 h1:Pd5fqOuBxKxv83b0+xOAJDAkziWYwFinWnBO0y+TZaA=
github.com/elastic/go-elasticsearch v0.0.0/go.mod h1:TkBSJBuTyFdBnrNqoPc54FN0vKf5c04IdM4zuStJ7xg=
github.com/elastic/go-elasticsearch/v7 v7.17.10 h1:TCQ8i4PmIJuBunvBS6bwT2ybzVFxxUhhltAs3Gyu1yo=
Expand Down Expand Up @@ -97,6 +101,7 @@ github.com/philippgille/gokv/test v0.7.0 h1:0wBKnKaFZlSeHxLXcmUJqK//IQGUMeu+o8B8
github.com/philippgille/gokv/test v0.7.0/go.mod h1:TP/VzO/qAoi6njsfKnRpXKno0hRuzD5wsLnHhtUcVkY=
github.com/philippgille/gokv/util v0.7.0 h1:5avUK/a3aSj/aWjhHv4/FkqgMon2B7k2BqFgLcR+DYg=
github.com/philippgille/gokv/util v0.7.0/go.mod h1:i9KLHbPxGiHLMhkix/CcDQhpPbCkJy5BkW+RKgwDHMo=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U=
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/refraction-networking/utls v1.6.7 h1:zVJ7sP1dJx/WtVuITug3qYUq034cDq9B2MR1K67ULZM=
Expand Down Expand Up @@ -126,8 +131,10 @@ github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA=
github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
github.com/spf13/viper v1.19.0 h1:RWq5SEjt8o25SROyN3z2OrDB9l7RPd3lwTWU8EcEdcI=
github.com/spf13/viper v1.19.0/go.mod h1:GQUN9bilAbhU/jgc1bKs99f/suXKeUMct8Adx5+Ntkg=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY=
github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA=
github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA=
github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
github.com/subosito/gotenv v1.6.0 h1:9NlTDc1FTs4qu0DDq7AEtTPNw6SVm7uBMsUCUjABIf8=
Expand Down Expand Up @@ -236,6 +243,7 @@ gopkg.in/ini.v1 v1.67.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k=
gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 h1:uRGJdciOHaEIrze2W8Q3AKkepLTh2hOroT7a+7czfdQ=
gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw=
gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
mvdan.cc/xurls/v2 v2.6.0 h1:3NTZpeTxYVWNSokW3MKeyVkz/j7uYXYiMtXRUfmjbgI=
Expand Down
3 changes: 2 additions & 1 deletion internal/pkg/archiver/archiver.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import (
"github.com/internetarchive/Zeno/internal/pkg/config"
"github.com/internetarchive/Zeno/internal/pkg/controler/pause"
"github.com/internetarchive/Zeno/internal/pkg/log"
"github.com/internetarchive/Zeno/internal/pkg/postprocessor/domainscrawl"
"github.com/internetarchive/Zeno/internal/pkg/stats"
"github.com/internetarchive/Zeno/pkg/models"
)
Expand Down Expand Up @@ -219,7 +220,7 @@ func archive(seed *models.Item) {
item.GetURL().SetResponse(resp)

// Process the body
err = ProcessBody(item.GetURL(), config.Get().DisableAssetsCapture, config.Get().DomainsCrawl, config.Get().MaxHops, config.Get().WARCTempDir)
err = ProcessBody(item.GetURL(), config.Get().DisableAssetsCapture, domainscrawl.Enabled(), config.Get().MaxHops, config.Get().WARCTempDir)
if err != nil {
logger.Error("unable to process body", "err", err.Error(), "item_id", item.GetShortID(), "seed_id", seed.GetShortID(), "depth", item.GetDepth(), "hops", item.GetURL().GetHops())
item.SetStatus(models.ItemFailed)
Expand Down
20 changes: 13 additions & 7 deletions internal/pkg/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
"sync"

"github.com/google/uuid"
"github.com/internetarchive/Zeno/internal/pkg/postprocessor/domainscrawl"
"github.com/internetarchive/Zeno/internal/pkg/utils"
"github.com/spf13/pflag"
"github.com/spf13/viper"
Expand Down Expand Up @@ -70,7 +71,7 @@ type Config struct {
JSON bool `mapstructure:"json"`
API bool `mapstructure:"api"`
Prometheus bool `mapstructure:"prometheus"`
DomainsCrawl bool `mapstructure:"domains-crawl"`
DomainsCrawl []string `mapstructure:"domains-crawl"`
CaptureAlternatePages bool `mapstructure:"capture-alternate-pages"`
DisableLocalDedupe bool `mapstructure:"disable-local-dedupe"`
CertValidation bool `mapstructure:"cert-validation"`
Expand Down Expand Up @@ -273,19 +274,24 @@ func GenerateCrawlConfig() error {
}

slog.Info("Compiling exclusion regexes", "regexes", len(regexes))
compiledRegexes, err := compileRegexes(regexes)
if err != nil {
return err
}
compiledRegexes := compileRegexes(regexes)

config.ExclusionRegexes = append(config.ExclusionRegexes, compiledRegexes...)
}
}

if len(config.DomainsCrawl) > 0 {
slog.Info("Domains crawl enabled", "domains/regex", config.DomainsCrawl)
err := domainscrawl.AddElements(config.DomainsCrawl)
if err != nil {
panic(err)
}
}

return nil
}

func compileRegexes(regexes []string) ([]*regexp.Regexp, error) {
func compileRegexes(regexes []string) []*regexp.Regexp {
var compiledRegexes []*regexp.Regexp

for _, regex := range regexes {
Expand All @@ -295,7 +301,7 @@ func compileRegexes(regexes []string) ([]*regexp.Regexp, error) {
compiledRegexes = append(compiledRegexes, compiledRegex)
}

return compiledRegexes, nil
return compiledRegexes
}

func readLocalExclusionFile(file string) (regexes []string, err error) {
Expand Down
5 changes: 5 additions & 0 deletions internal/pkg/controler/pause/pause.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,11 @@ func Pause(message ...string) {
if !swap {
return
}

if len(message) == 0 {
message = append(message, "Paused")
}

manager.message = message[0]

manager.subscribers.Range(func(key, _ interface{}) bool {
Expand Down
9 changes: 9 additions & 0 deletions internal/pkg/controler/pause/pause_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,12 @@ import (
"sync/atomic"
"testing"
"time"

"github.com/internetarchive/Zeno/internal/pkg/stats"
)

func TestBasicPauseResume(t *testing.T) {
stats.Init()
manager = &pauseManager{}

var wg sync.WaitGroup
Expand Down Expand Up @@ -70,6 +73,7 @@ func TestBasicPauseResume(t *testing.T) {
}

func TestMultipleSubscribers(t *testing.T) {
stats.Init()
manager = &pauseManager{}
const numSubscribers = 10
var wg sync.WaitGroup
Expand Down Expand Up @@ -144,6 +148,7 @@ func TestMultipleSubscribers(t *testing.T) {
}

func TestSubscriberUnsubscribeDuringPause(t *testing.T) {
stats.Init()
manager = &pauseManager{}
var wg sync.WaitGroup
wg.Add(1)
Expand Down Expand Up @@ -194,6 +199,7 @@ func TestSubscriberUnsubscribeDuringPause(t *testing.T) {
}

func TestConcurrentPauseResume(t *testing.T) {
stats.Init()
manager = &pauseManager{}
const numSubscribers = 5
const numCycles = 10
Expand Down Expand Up @@ -287,6 +293,7 @@ func TestConcurrentPauseResume(t *testing.T) {
}

func TestPauseResumeWithUnsubscribe(t *testing.T) {
stats.Init()
manager = &pauseManager{}
var wg sync.WaitGroup
wg.Add(1)
Expand Down Expand Up @@ -348,6 +355,7 @@ func TestPauseResumeWithUnsubscribe(t *testing.T) {
}

func TestNoSubscribers(t *testing.T) {
stats.Init()
manager = &pauseManager{}
// Call Pause() and Resume() when there are no subscribers.
// If no panic occurs, the test passes.
Expand All @@ -356,6 +364,7 @@ func TestNoSubscribers(t *testing.T) {
}

func TestPauseResumeE2E(t *testing.T) {
stats.Init()
manager = &pauseManager{}
var workCounter int32 // Counts the amount of work done.
var wg sync.WaitGroup
Expand Down
5 changes: 5 additions & 0 deletions internal/pkg/postprocessor/assets.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package postprocessor

import (
"github.com/internetarchive/Zeno/internal/pkg/config"
"github.com/internetarchive/Zeno/internal/pkg/log"
"github.com/internetarchive/Zeno/internal/pkg/postprocessor/extractor"
"github.com/internetarchive/Zeno/internal/pkg/postprocessor/sitespecific/ina"
Expand Down Expand Up @@ -76,3 +77,7 @@ func extractAssets(item *models.Item) (assets []*models.URL, err error) {

return assets, nil
}

// shouldExtractAssets reports whether asset extraction should run for item:
// assets capture must not be globally disabled, and the item's URL must have
// a body to extract from.
func shouldExtractAssets(item *models.Item) bool {
	if config.Get().DisableAssetsCapture {
		return false
	}
	return item.GetURL().GetBody() != nil
}
145 changes: 145 additions & 0 deletions internal/pkg/postprocessor/domainscrawl/domainscrawl.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
// Package domainscrawl is a postprocessing component that parse domains from a given input and stores them for later matching.
// It can store naive domains, full URLs, and regex patterns. It can then check if a given URL matches any of the stored patterns.
package domainscrawl

import (
"net/url"
"regexp"
"strings"
"sync"

"github.com/ImVexed/fasturl"
)

// matchEngine stores the registered crawl targets, split by kind, behind a
// single RWMutex so that Match can be called concurrently while AddElements
// and Reset take the write lock.
type matchEngine struct {
	sync.RWMutex
	enabled bool              // set once AddElements has received at least one element
	regexes []*regexp.Regexp  // compiled regex patterns matched against the raw URL string
	domains []string          // naive domains, e.g. "example.com"
	urls    []url.URL         // full URLs parsed with net/url
}

// globalMatcher is the package-level singleton that every exported function
// in this package operates on.
var (
	globalMatcher = &matchEngine{
		enabled: false,
		regexes: make([]*regexp.Regexp, 0),
		domains: make([]string, 0),
		urls:    make([]url.URL, 0),
	}
)

// Reset wipes every stored pattern and disables the matcher, returning the
// package to its initial (pristine) state.
func Reset() {
	globalMatcher.Lock()
	defer globalMatcher.Unlock()

	globalMatcher.enabled = false
	globalMatcher.domains = make([]string, 0)
	globalMatcher.urls = make([]url.URL, 0)
	globalMatcher.regexes = make([]*regexp.Regexp, 0)
}

// Enabled reports whether the domainscrawl matcher has received any elements
// and is therefore active.
func Enabled() bool {
	globalMatcher.RLock()
	active := globalMatcher.enabled
	globalMatcher.RUnlock()
	return active
}

// AddElements heuristically classifies each element as a full URL, a naive
// domain, or a regex pattern and stores it for later matching. The only
// possible error is a string that falls through to the regex case and does
// not compile. Calling this enables the matcher.
func AddElements(elements []string) error {
	globalMatcher.Lock()
	defer globalMatcher.Unlock()

	globalMatcher.enabled = true

	for _, element := range elements {
		// A string that parses with both a scheme and a host is a full URL.
		if parsed, err := url.Parse(element); err == nil && parsed.Scheme != "" && parsed.Host != "" {
			globalMatcher.urls = append(globalMatcher.urls, *parsed)
			continue
		}

		// A bare domain such as "example.com".
		if isNaiveDomain(element) {
			globalMatcher.domains = append(globalMatcher.domains, element)
			continue
		}

		// Everything else must compile as a regular expression.
		re, err := regexp.Compile(element)
		if err != nil {
			return err
		}
		globalMatcher.regexes = append(globalMatcher.regexes, re)
	}

	return nil
}

// Match reports whether rawURL matches any stored naive domain, full URL, or
// regex pattern. URLs that fasturl cannot parse never match.
func Match(rawURL string) bool {
	parsed, err := fasturl.ParseURL(rawURL)
	if err != nil {
		return false
	}

	globalMatcher.RLock()
	defer globalMatcher.RUnlock()

	// Naive domains match the host exactly or as a parent of a subdomain.
	for _, domain := range globalMatcher.domains {
		if isSubdomainOrExactMatch(parsed.Host, domain) {
			return true
		}
	}

	// Stored full URLs match on the exact string, or — when the stored URL
	// carries no path, query, or fragment — greedily on the (sub)domain.
	for _, stored := range globalMatcher.urls {
		if stored.String() == rawURL {
			return true
		}
		if stored.Path == "" && stored.RawQuery == "" && stored.Fragment == "" && isSubdomainOrExactMatch(parsed.Host, stored.Host) {
			return true
		}
	}

	// Regex patterns are applied to the raw URL string.
	for _, re := range globalMatcher.regexes {
		if re.MatchString(rawURL) {
			return true
		}
	}

	return false
}

// isNaiveDomain reports whether s looks like a bare domain such as
// "example.com": no scheme, path, query, fragment, or spaces, and at least
// one dot.
func isNaiveDomain(s string) bool {
	// Any of these markers means s is a URL, a path, or free text.
	for _, marker := range []string{"://", "/", "?", "#", " "} {
		if strings.Contains(s, marker) {
			return false
		}
	}
	return strings.Contains(s, ".")
}

// isSubdomainOrExactMatch reports whether host equals domain or is a
// subdomain of it (e.g. "sub.example.com" matches "example.com"). The
// dot-prefixed suffix check prevents "notexample.com" from matching
// "example.com".
func isSubdomainOrExactMatch(host, domain string) bool {
	return host == domain || strings.HasSuffix(host, "."+domain)
}
Loading

0 comments on commit 8b55ea8

Please sign in to comment.