From 51b2474b3cf7488ead37402a5ae5a08c4ee5472d Mon Sep 17 00:00:00 2001 From: Nitin Mishra Date: Mon, 11 Oct 2021 16:17:23 +0100 Subject: [PATCH] Issue #231 - How does worker pick a site after crash? - Configurable claimed limit as it was hard coded to 60. In case of a crash, the nodes can come back fairly quickly. --- brozzler/cli.py | 5 +++++ brozzler/frontier.py | 6 +++--- brozzler/worker.py | 5 +++-- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/brozzler/cli.py b/brozzler/cli.py index fb973e32..828c700e 100755 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -327,6 +327,10 @@ def brozzler_worker(argv=None): help=( 'when needed, choose an available instance of warcprox from ' 'the rethinkdb service registry')) + arg_parser.add_argument( + '--claimed-limit', metavar='int', type=int, default=60, choices=range(15, 61), dest='claimed_limit', + help=('Minutes after worker crash, a site can be reclaimed; ' + 'an integer in the range 15..60 (default: 60)')) arg_parser.add_argument( + '--skip-extract-outlinks', dest='skip_extract_outlinks', action='store_true', help=argparse.SUPPRESS) @@ -370,6 +374,7 @@ def dump_state(signum, frame): frontier, service_registry, max_browsers=int(args.max_browsers), chrome_exe=args.chrome_exe, proxy=args.proxy, warcprox_auto=args.warcprox_auto, + claimed_limit=args.claimed_limit, skip_extract_outlinks=args.skip_extract_outlinks, skip_visit_hashtags=args.skip_visit_hashtags, skip_youtube_dl=args.skip_youtube_dl) diff --git a/brozzler/frontier.py b/brozzler/frontier.py index 6715eb32..35c875bd 100644 --- a/brozzler/frontier.py +++ b/brozzler/frontier.py @@ -93,7 +93,7 @@ def _vet_result(self, result, **kwargs): raise UnexpectedDbResult("expected %r to be %r in %r" % ( k, expected, result)) - def claim_sites(self, n=1): + def claim_sites(self, n=1, claimed_limit=60): self.logger.trace('claiming up to %s sites to brozzle', n) result = ( self.rr.table('sites').get_all(r.args( @@ -114,7 +114,7 @@ def claim_sites(self, n=1): 
r.and_( r.or_( site['claimed'].not_(), - site['last_claimed'].lt(r.now().sub(60*60))), + site['last_claimed'].lt(r.now().sub(claimed_limit*60))), r.or_( site.has_fields('max_claimed_sites').not_(), new_acc[site['job_id'].coerce_to('string')].le(site['max_claimed_sites']))), @@ -127,7 +127,7 @@ def claim_sites(self, n=1): r.branch( r.or_( r.row['claimed'].not_(), - r.row['last_claimed'].lt(r.now().sub(60*60))), + r.row['last_claimed'].lt(r.now().sub(claimed_limit*60))), {'claimed': True, 'last_claimed': r.now()}, {}), return_changes=True)).run() diff --git a/brozzler/worker.py b/brozzler/worker.py index d88893b7..89cbf1da 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -49,7 +49,7 @@ class BrozzlerWorker: def __init__( self, frontier, service_registry=None, max_browsers=1, - chrome_exe="chromium-browser", warcprox_auto=False, proxy=None, + chrome_exe="chromium-browser", warcprox_auto=False, proxy=None, claimed_limit=60, skip_extract_outlinks=False, skip_visit_hashtags=False, skip_youtube_dl=False, simpler404=False, screenshot_full_page=False, page_timeout=300, behavior_timeout=900, extract_outlinks_timeout=60, @@ -62,6 +62,7 @@ def __init__( self._proxy = proxy assert not (warcprox_auto and proxy) self._proxy_is_warcprox = None + self._claimed_limit = claimed_limit self._skip_extract_outlinks = skip_extract_outlinks self._skip_visit_hashtags = skip_visit_hashtags self._skip_youtube_dl = skip_youtube_dl @@ -488,7 +489,7 @@ def _start_browsing_some_sites(self): browsers = self._browser_pool.acquire_multi( (self._browser_pool.num_available() + 1) // 2) try: - sites = self._frontier.claim_sites(len(browsers)) + sites = self._frontier.claim_sites(len(browsers), self._claimed_limit) except: self._browser_pool.release_all(browsers) raise