From 51b2474b3cf7488ead37402a5ae5a08c4ee5472d Mon Sep 17 00:00:00 2001 From: Nitin Mishra Date: Mon, 11 Oct 2021 16:17:23 +0100 Subject: [PATCH] Issue #231 - How does worker pick a site after crash? - Configurable claimed limit as it was hard coded to 60. In case of a crash, the nodes can come back fairly quickly. --- brozzler/cli.py | 5 +++++ brozzler/frontier.py | 6 +++--- brozzler/worker.py | 5 +++-- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/brozzler/cli.py b/brozzler/cli.py index fb973e32..828c700e 100755 --- a/brozzler/cli.py +++ b/brozzler/cli.py @@ -327,6 +327,10 @@ def brozzler_worker(argv=None): help=( 'when needed, choose an available instance of warcprox from ' 'the rethinkdb service registry')) + arg_parser.add_argument( + '--claimed-limit', metavar='int', type=int, default=60, choices=range(15, 61), dest='claimed_limit', + help=('Minutes after worker crash, a site can be reclaimed; ' + 'an integer in the range 15..60 (default: 60)')) arg_parser.add_argument( + '--skip-extract-outlinks', dest='skip_extract_outlinks', action='store_true', help=argparse.SUPPRESS) @@ -370,6 +374,7 @@ def dump_state(signum, frame): frontier, service_registry, max_browsers=int(args.max_browsers), chrome_exe=args.chrome_exe, proxy=args.proxy, warcprox_auto=args.warcprox_auto, + claimed_limit=args.claimed_limit, skip_extract_outlinks=args.skip_extract_outlinks, skip_visit_hashtags=args.skip_visit_hashtags, skip_youtube_dl=args.skip_youtube_dl) diff --git a/brozzler/frontier.py b/brozzler/frontier.py index 6715eb32..35c875bd 100644 --- a/brozzler/frontier.py +++ b/brozzler/frontier.py @@ -93,7 +93,7 @@ def _vet_result(self, result, **kwargs): raise UnexpectedDbResult("expected %r to be %r in %r" % ( k, expected, result)) - def claim_sites(self, n=1): + def claim_sites(self, n=1, claimed_limit=60): self.logger.trace('claiming up to %s sites to brozzle', n) result = ( self.rr.table('sites').get_all(r.args( @@ -114,7 +114,7 @@ def claim_sites(self, n=1): 
r.and_( r.or_( site['claimed'].not_(), - site['last_claimed'].lt(r.now().sub(60*60))), + site['last_claimed'].lt(r.now().sub(claimed_limit*60))), r.or_( site.has_fields('max_claimed_sites').not_(), new_acc[site['job_id'].coerce_to('string')].le(site['max_claimed_sites']))), @@ -127,7 +127,7 @@ def claim_sites(self, n=1): r.branch( r.or_( r.row['claimed'].not_(), - r.row['last_claimed'].lt(r.now().sub(60*60))), + r.row['last_claimed'].lt(r.now().sub(claimed_limit*60))), {'claimed': True, 'last_claimed': r.now()}, {}), return_changes=True)).run() diff --git a/brozzler/worker.py b/brozzler/worker.py index d88893b7..89cbf1da 100644 --- a/brozzler/worker.py +++ b/brozzler/worker.py @@ -49,7 +49,7 @@ class BrozzlerWorker: def __init__( self, frontier, service_registry=None, max_browsers=1, - chrome_exe="chromium-browser", warcprox_auto=False, proxy=None, + chrome_exe="chromium-browser", warcprox_auto=False, proxy=None, claimed_limit=60, skip_extract_outlinks=False, skip_visit_hashtags=False, skip_youtube_dl=False, simpler404=False, screenshot_full_page=False, page_timeout=300, behavior_timeout=900, extract_outlinks_timeout=60, @@ -62,6 +62,7 @@ def __init__( self._proxy = proxy assert not (warcprox_auto and proxy) self._proxy_is_warcprox = None + self._claimed_limit = claimed_limit self._skip_extract_outlinks = skip_extract_outlinks self._skip_visit_hashtags = skip_visit_hashtags self._skip_youtube_dl = skip_youtube_dl @@ -488,7 +489,7 @@ def _start_browsing_some_sites(self): browsers = self._browser_pool.acquire_multi( (self._browser_pool.num_available() + 1) // 2) try: - sites = self._frontier.claim_sites(len(browsers)) + sites = self._frontier.claim_sites(len(browsers), self._claimed_limit) except: self._browser_pool.release_all(browsers) raise