-
Notifications
You must be signed in to change notification settings - Fork 17
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* postprocess: add shouldExtractAssets function to hold the logic * rename postprocess.go to item.go * domainscrawl: add package with base functions and unit tests * config: remove error return from compileRegexes as it always returned nil and regexp.MustCompile will panic if a regexp can't compile * cmd&config: handle domains crawl flag and passing of values * archiver: rationalized ProcessBody arguments and adapted it to the changes of DomainsCrawl config element * domainscrawl: add Enabled() func to check if the package is enabled (has received elements) * domainscrawl: add more tests * postprocessor: refactor the conditional logic for outlinks extraction with the new domainscrawl package * cmd: change domains-crawl flag desc * postprocessor: correct the completion of an item based on the depth without redirection (assets of assets) * domainscrawl: make Match() accept a string and parse the url with fasturl cause postprocessor will never have net/url.URL parsed urls to check against domainscrawl * archiver: reverted processBody() changes * hotfix pause package from a segfault
- Loading branch information
Showing
12 changed files
with
490 additions
and
22 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
145 changes: 145 additions & 0 deletions
145
internal/pkg/postprocessor/domainscrawl/domainscrawl.go
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,145 @@ | ||
// Package domainscrawl is a postprocessing component that parses domains from a given input and stores them for later matching.
// It can store naive domains, full URLs, and regex patterns. It can then check if a given URL matches any of the stored patterns. | ||
package domainscrawl | ||
|
||
import ( | ||
"net/url" | ||
"regexp" | ||
"strings" | ||
"sync" | ||
|
||
"github.com/ImVexed/fasturl" | ||
) | ||
|
||
// matchEngine holds the registered match patterns and guards them with an
// embedded RWMutex so Match (read) can run concurrently with
// AddElements/Reset (write).
type matchEngine struct {
	sync.RWMutex
	enabled bool             // set once AddElements has been called at least once
	regexes []*regexp.Regexp // compiled regex patterns
	domains []string         // naive domains, e.g. "example.com"
	urls    []url.URL        // fully parsed URLs (had both scheme and host)
}

// globalMatcher is the package-level singleton that every exported
// function operates on.
var (
	globalMatcher = &matchEngine{
		enabled: false,
		regexes: make([]*regexp.Regexp, 0),
		domains: make([]string, 0),
		urls:    make([]url.URL, 0),
	}
)
|
||
// Reset the matcher to its initial state | ||
func Reset() { | ||
globalMatcher.Lock() | ||
defer globalMatcher.Unlock() | ||
|
||
globalMatcher.enabled = false | ||
globalMatcher.regexes = make([]*regexp.Regexp, 0) | ||
globalMatcher.domains = make([]string, 0) | ||
globalMatcher.urls = make([]url.URL, 0) | ||
} | ||
|
||
// Enabled returns true if the domainscrawl matcher is enabled | ||
func Enabled() bool { | ||
globalMatcher.RLock() | ||
defer globalMatcher.RUnlock() | ||
|
||
return globalMatcher.enabled | ||
} | ||
|
||
// AddElements takes a slice of strings, heuristically determines their type, and stores them | ||
func AddElements(elements []string) error { | ||
globalMatcher.Lock() | ||
defer globalMatcher.Unlock() | ||
|
||
if !globalMatcher.enabled { | ||
globalMatcher.enabled = true | ||
} | ||
|
||
for _, element := range elements { | ||
// Try to parse as a URL first | ||
parsedURL, err := url.Parse(element) | ||
if err == nil && parsedURL.Scheme != "" && parsedURL.Host != "" { | ||
// If it has a scheme and host, it's a full URL | ||
globalMatcher.urls = append(globalMatcher.urls, *parsedURL) | ||
continue | ||
} | ||
|
||
// Check if it's a naive domain (e.g., "example.com") | ||
if isNaiveDomain(element) { | ||
globalMatcher.domains = append(globalMatcher.domains, element) | ||
continue | ||
} | ||
|
||
// Otherwise, assume it's a regex | ||
re, err := regexp.Compile(element) | ||
if err != nil { | ||
return err | ||
} | ||
globalMatcher.regexes = append(globalMatcher.regexes, re) | ||
} | ||
return nil | ||
} | ||
|
||
// Match checks if a given URL matches any of the stored patterns | ||
func Match(rawURL string) bool { | ||
u, err := fasturl.ParseURL(rawURL) | ||
if err != nil { | ||
return false | ||
} | ||
|
||
globalMatcher.RLock() | ||
defer globalMatcher.RUnlock() | ||
|
||
// Check against naive domains | ||
for _, domain := range globalMatcher.domains { | ||
if isSubdomainOrExactMatch(u.Host, domain) { | ||
return true | ||
} | ||
} | ||
|
||
// Check against full URLs | ||
for _, storedURL := range globalMatcher.urls { | ||
if storedURL.String() == rawURL { | ||
return true | ||
} | ||
// If the stored URL has no query, path, or fragment, we greedily match (sub)domain | ||
if storedURL.RawQuery == "" && storedURL.Path == "" && storedURL.Fragment == "" && isSubdomainOrExactMatch(u.Host, storedURL.Host) { | ||
return true | ||
} | ||
} | ||
|
||
// Check against regex patterns | ||
for _, re := range globalMatcher.regexes { | ||
if re.MatchString(rawURL) { | ||
return true | ||
} | ||
} | ||
|
||
return false | ||
} | ||
|
||
// isNaiveDomain reports whether s looks like a bare domain such as
// "example.com": no path, query, fragment, or whitespace, and at least one
// dot. (A scheme separator "://" is already rejected by the "/" check.)
func isNaiveDomain(s string) bool {
	if strings.ContainsAny(s, "/?# ") {
		return false
	}
	return strings.Contains(s, ".")
}
|
||
// isSubdomainOrExactMatch reports whether host equals domain or is a
// subdomain of it (e.g. "sub.example.com" matches "example.com").
func isSubdomainOrExactMatch(host, domain string) bool {
	return host == domain || strings.HasSuffix(host, "."+domain)
}
Oops, something went wrong.