Skip to content

Commit

Permalink
Issue internetarchive#231 - How does worker pick a site after crash?
Browse files Browse the repository at this point in the history
- Made the claimed limit configurable; it was previously hard-coded to 60 minutes. Crashed nodes can come back up fairly quickly, so a shorter reclaim window is useful.
  • Loading branch information
Nitin Mishra committed Oct 11, 2021
1 parent 4f301f4 commit 51b2474
Show file tree
Hide file tree
Showing 3 changed files with 11 additions and 5 deletions.
5 changes: 5 additions & 0 deletions brozzler/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -327,6 +327,10 @@ def brozzler_worker(argv=None):
help=(
'when needed, choose an available instance of warcprox from '
'the rethinkdb service registry'))
arg_parser.add_argument(
'--claimed-limit', metavar='int', type=int, default=60, choices=range(15, 61), dest='claimed_limit',
    help=('Minutes after worker crash, a site can be reclaimed; '
        'an integer in the range 15..60 (default: 60)'))
arg_parser.add_argument(
'--skip-extract-outlinks', dest='skip_extract_outlinks',
action='store_true', help=argparse.SUPPRESS)
Expand Down Expand Up @@ -370,6 +374,7 @@ def dump_state(signum, frame):
frontier, service_registry, max_browsers=int(args.max_browsers),
chrome_exe=args.chrome_exe, proxy=args.proxy,
warcprox_auto=args.warcprox_auto,
claimed_limit=args.claimed_limit,
skip_extract_outlinks=args.skip_extract_outlinks,
skip_visit_hashtags=args.skip_visit_hashtags,
skip_youtube_dl=args.skip_youtube_dl)
Expand Down
6 changes: 3 additions & 3 deletions brozzler/frontier.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ def _vet_result(self, result, **kwargs):
raise UnexpectedDbResult("expected %r to be %r in %r" % (
k, expected, result))

def claim_sites(self, n=1):
def claim_sites(self, n=1, claimed_limit=60):
self.logger.trace('claiming up to %s sites to brozzle', n)
result = (
self.rr.table('sites').get_all(r.args(
Expand All @@ -114,7 +114,7 @@ def claim_sites(self, n=1):
r.and_(
r.or_(
site['claimed'].not_(),
site['last_claimed'].lt(r.now().sub(60*60))),
site['last_claimed'].lt(r.now().sub(claimed_limit*60))),
r.or_(
site.has_fields('max_claimed_sites').not_(),
new_acc[site['job_id'].coerce_to('string')].le(site['max_claimed_sites']))),
Expand All @@ -127,7 +127,7 @@ def claim_sites(self, n=1):
r.branch(
r.or_(
r.row['claimed'].not_(),
r.row['last_claimed'].lt(r.now().sub(60*60))),
r.row['last_claimed'].lt(r.now().sub(claimed_limit*60))),
{'claimed': True, 'last_claimed': r.now()},
{}),
return_changes=True)).run()
Expand Down
5 changes: 3 additions & 2 deletions brozzler/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ class BrozzlerWorker:

def __init__(
self, frontier, service_registry=None, max_browsers=1,
chrome_exe="chromium-browser", warcprox_auto=False, proxy=None,
chrome_exe="chromium-browser", warcprox_auto=False, proxy=None, claimed_limit=60,
skip_extract_outlinks=False, skip_visit_hashtags=False,
skip_youtube_dl=False, simpler404=False, screenshot_full_page=False,
page_timeout=300, behavior_timeout=900, extract_outlinks_timeout=60,
Expand All @@ -62,6 +62,7 @@ def __init__(
self._proxy = proxy
assert not (warcprox_auto and proxy)
self._proxy_is_warcprox = None
self._claimed_limit = claimed_limit
self._skip_extract_outlinks = skip_extract_outlinks
self._skip_visit_hashtags = skip_visit_hashtags
self._skip_youtube_dl = skip_youtube_dl
Expand Down Expand Up @@ -488,7 +489,7 @@ def _start_browsing_some_sites(self):
browsers = self._browser_pool.acquire_multi(
(self._browser_pool.num_available() + 1) // 2)
try:
sites = self._frontier.claim_sites(len(browsers))
sites = self._frontier.claim_sites(len(browsers), self._claimed_limit)
except:
self._browser_pool.release_all(browsers)
raise
Expand Down

0 comments on commit 51b2474

Please sign in to comment.