Skip to content

Commit

Permalink
fix: incorrect crawl speed unpausing behavior (#144)
Browse files (browse the repository at this point in the history)
* fix: incorrect crawl speed unpausing behavior

* rescope variable

* fix: remove unused variables

* fix: move variable back inside of function
  • Loading branch information
NGTmeaty authored Sep 1, 2024
1 parent 5e14690 commit 8cdd3d9
Showing 1 changed file with 8 additions and 2 deletions.
10 changes: 8 additions & 2 deletions internal/pkg/crawl/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,19 +14,25 @@ var regexOutlinks *regexp.Regexp

  func (c *Crawl) crawlSpeedLimiter() {
  maxConcurrentAssets := c.MaxConcurrentAssets
+ var pauseTriggeredByCrawlSpeed = false

  for {
+ // Pause if the waitgroup has exceeded 8 times the active workers.
  if c.Client.WaitGroup.Size() > int(*c.ActiveWorkers)*8 {
  c.Paused.Set(true)
  c.Queue.Paused.Set(true)
- } else if c.Client.WaitGroup.Size() > int(*c.ActiveWorkers)*4 {
+ pauseTriggeredByCrawlSpeed = true
+ // Lower the number of concurrent assets we'll capture if the waitgroup exceeds 4 times the active workers (and the pause is caused by crawlSpeed)
+ } else if c.Client.WaitGroup.Size() > int(*c.ActiveWorkers)*4 && pauseTriggeredByCrawlSpeed {
  c.MaxConcurrentAssets = 1
  c.Paused.Set(false)
  c.Queue.Paused.Set(false)
- } else {
+ // If the pause was triggered by crawlSpeed and everything is fine, fully reset state.
+ } else if pauseTriggeredByCrawlSpeed {
  c.MaxConcurrentAssets = maxConcurrentAssets
  c.Paused.Set(false)
  c.Queue.Paused.Set(false)
+ pauseTriggeredByCrawlSpeed = false
  }

  time.Sleep(time.Second / 10)
Expand Down

0 comments on commit 8cdd3d9

Please sign in to comment.