Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement Seed-level video capture setting handling + Job-level PDF-only option #288

Open
wants to merge 17 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 0 additions & 15 deletions brozzler/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -544,27 +544,12 @@ def dump_state(signum, frame):
finally:
signal.signal(signal.SIGQUIT, dump_state)

def get_skip_av_seeds():
galgeek marked this conversation as resolved.
Show resolved Hide resolved
# TODO: develop UI and refactor
SKIP_AV_SEEDS_FILE = "/opt/local/brozzler/skip_av_seeds.txt"
try:
# make set from seed IDs in SKIP_AV_SEEDS_FILE
with open(SKIP_AV_SEEDS_FILE) as skips:
skip_av_seeds = {int(l) for l in skips.readlines()}
logging.info("running with skip_av_seeds file %s" % SKIP_AV_SEEDS_FILE)
except Exception as e:
skip_av_seeds = set()
logging.info("running with empty skip_av_seeds")
return skip_av_seeds

rr = rethinker(args)
frontier = brozzler.RethinkDbFrontier(rr)
service_registry = doublethink.ServiceRegistry(rr)
skip_av_seeds_from_file = get_skip_av_seeds()
worker = brozzler.worker.BrozzlerWorker(
frontier,
service_registry,
skip_av_seeds=skip_av_seeds_from_file,
max_browsers=int(args.max_browsers),
chrome_exe=args.chrome_exe,
proxy=args.proxy,
Expand Down
47 changes: 36 additions & 11 deletions brozzler/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
import uuid
import yaml
import zlib
from enum import Enum
from typing import Optional


Expand Down Expand Up @@ -100,6 +101,8 @@ def new_job(frontier, job_conf):
job.id = job_conf["id"]
if "max_claimed_sites" in job_conf:
job.max_claimed_sites = job_conf["max_claimed_sites"]
if "pdfs_only" in job_conf:
job.pdfs_only = job_conf["pdfs_only"]
job.save()

sites = []
Expand Down Expand Up @@ -198,6 +201,8 @@ class Job(doublethink.Document, ElapsedMixIn):
def populate_defaults(self):
if not "status" in self:
self.status = "ACTIVE"
if "pdfs_only" not in self:
self.pdfs_only = False
if not "starts_and_stops" in self:
if self.get("started"): # backward compatibility
self.starts_and_stops = [
Expand All @@ -220,33 +225,53 @@ def finish(self):
self.starts_and_stops[-1]["stop"] = doublethink.utcnow()


class VideoCaptureOptions(Enum):
"""
Enumeration of possible values for the `video_capture` config key.
- ENABLE_VIDEO_CAPTURE (default): All video is captured.
- DISABLE_VIDEO_CAPTURE: No video is captured. This is effectively a
combination of the next two values.
- BLOCK_VIDEO_MIME_TYPES: Any response with a Content-Type header
containing the word "video" is not captured.
- DISABLE_YTDLP_CAPTURE: Video capture via yt-dlp is disabled.

Note: Ensuring full video MIME type blocking requires an additional entry in the
Warcprox-Meta header `mime-type-filters` key.
"""

ENABLE_VIDEO_CAPTURE = "ENABLE_VIDEO_CAPTURE"
DISABLE_VIDEO_CAPTURE = "DISABLE_VIDEO_CAPTURE"
BLOCK_VIDEO_MIME_TYPES = "BLOCK_VIDEO_MIME_TYPES"
DISABLE_YTDLP_CAPTURE = "DISABLE_YTDLP_CAPTURE"


class Site(doublethink.Document, ElapsedMixIn):
logger = logging.getLogger(__module__ + "." + __qualname__)
table = "sites"

def populate_defaults(self):
if not "status" in self:
if "status" not in self:
self.status = "ACTIVE"
if not "claimed" in self:
if "claimed" not in self:
self.claimed = False
if not "last_disclaimed" in self:
if "last_disclaimed" not in self:
self.last_disclaimed = brozzler.EPOCH_UTC
if not "last_claimed" in self:
if "last_claimed" not in self:
self.last_claimed = brozzler.EPOCH_UTC
if not "scope" in self:
if "scope" not in self:
self.scope = {}
if not "skip_ytdlp" in self:
self.skip_ytdlp = None
if "video_capture" not in self:
self.video_capture = VideoCaptureOptions.ENABLE_VIDEO_CAPTURE.value

# backward compatibility
if "surt" in self.scope:
if not "accepts" in self.scope:
if "accepts" not in self.scope:
self.scope["accepts"] = []
self.scope["accepts"].append({"surt": self.scope["surt"]})
del self.scope["surt"]

# backward compatibility
if "max_hops_off_surt" in self.scope and not "max_hops_off" in self.scope:
if "max_hops_off_surt" in self.scope and "max_hops_off" not in self.scope:
self.scope["max_hops_off"] = self.scope["max_hops_off_surt"]
if "max_hops_off_surt" in self.scope:
del self.scope["max_hops_off_surt"]
Expand All @@ -256,7 +281,7 @@ def populate_defaults(self):
brozzler.site_surt_canon(self.seed).ssurt().decode("ascii")
)

if not "starts_and_stops" in self:
if "starts_and_stops" not in self:
if self.get("start_time"): # backward compatibility
self.starts_and_stops = [
{"start": self.get("start_time"), "stop": None}
Expand All @@ -271,7 +296,7 @@ def __str__(self):
return 'Site({"id":"%s","seed":"%s",...})' % (self.id, self.seed)

def _accept_ssurt_if_not_redundant(self, ssurt):
if not "accepts" in self.scope:
if "accepts" not in self.scope:
self.scope["accepts"] = []
simple_rule_ssurts = (
rule["ssurt"]
Expand Down
43 changes: 34 additions & 9 deletions brozzler/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import logging
import brozzler
import brozzler.browser
from brozzler.model import VideoCaptureOptions
import threading
import time
import urllib.request
Expand Down Expand Up @@ -54,7 +55,6 @@ def __init__(
self,
frontier,
service_registry=None,
skip_av_seeds=None,
max_browsers=1,
chrome_exe="chromium-browser",
warcprox_auto=False,
Expand All @@ -74,7 +74,6 @@ def __init__(
):
self._frontier = frontier
self._service_registry = service_registry
self._skip_av_seeds = skip_av_seeds
self._max_browsers = max_browsers

self._warcprox_auto = warcprox_auto
Expand Down Expand Up @@ -250,7 +249,17 @@ def brozzle_page(

if not self._needs_browsing(page_headers):
self.logger.info("needs fetch: %s", page)
self._fetch_url(site, page=page)
if site.video_capture in [
galgeek marked this conversation as resolved.
Show resolved Hide resolved
VideoCaptureOptions.DISABLE_VIDEO_CAPTURE.value,
VideoCaptureOptions.BLOCK_VIDEO_MIME_TYPES.value,
] and self._is_video_type(page_headers):
self.logger.info(
"skipping video content: video MIME type capture disabled for site"
)
elif site.pdfs_only and not self._is_pdf(page_headers):
self.logger.info("skipping non-PDF content: PDFs only option enabled")
else:
self._fetch_url(site, page=page)
else:
self.logger.info("needs browsing: %s", page)
try:
Expand All @@ -262,7 +271,7 @@ def brozzle_page(
self.logger.info("page interstitial shown (http auth): %s", page)

if enable_youtube_dl and ydl.should_ytdlp(
site, page, browser.websock_thread.page_status, self._skip_av_seeds
site, page, browser.websock_thread.page_status
):
try:
ydl_outlinks = ydl.do_youtube_dl(self, site, page)
Expand Down Expand Up @@ -303,13 +312,29 @@ def _get_page_headers(self, page):
self.logger.warning("Failed to get headers for %s: %s", page.url, e)
return {}

def _needs_browsing(self, page_headers):
if (
def _needs_browsing(self, page_headers) -> bool:
return not bool(
"content-type" in page_headers
and "html" not in page_headers["content-type"]
):
return False
return True
)

def _is_video_type(self, page_headers) -> bool:
"""
Determines if the page's Content-Type header specifies that it contains
a video.
"""
return (
"content-type" in page_headers and "video" in page_headers["content-type"]
)

def _is_pdf(self, page_headers) -> bool:
"""
Determines if the page's Content-Type header specifies that it is a PDF.
gretchenleighmiller marked this conversation as resolved.
Show resolved Hide resolved
"""
return (
"content-type" in page_headers
and "application/pdf" in page_headers["content-type"]
)

def _browse_page(self, browser, site, page, on_screenshot=None, on_request=None):
def _on_screenshot(screenshot_jpeg):
Expand Down
25 changes: 7 additions & 18 deletions brozzler/ydl.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import yt_dlp
from yt_dlp.utils import match_filter_func
import brozzler
from brozzler.model import VideoCaptureOptions
import urllib.request
import tempfile
import urlcanon
Expand All @@ -32,36 +33,24 @@
thread_local = threading.local()


def should_ytdlp(site, page, page_status, skip_av_seeds):
def should_ytdlp(site, page, page_status):
# called only after we've passed needs_browsing() check

if page_status != 200:
logging.info("skipping ytdlp: non-200 page status %s", page_status)
return False
if site.skip_ytdlp:
logging.info("skipping ytdlp: site marked skip_ytdlp")
if site.video_capture in [
VideoCaptureOptions.DISABLE_VIDEO_CAPTURE.value,
VideoCaptureOptions.DISABLE_YTDLP_CAPTURE.value,
]:
logging.info("skipping ytdlp: site has video capture disabled")
return False

ytdlp_url = page.redirect_url if page.redirect_url else page.url

if "chrome-error:" in ytdlp_url:
return False

ytdlp_seed = (
site["metadata"]["ait_seed_id"]
if "metadata" in site and "ait_seed_id" in site["metadata"]
else None
)

# TODO: develop UI and refactor
if ytdlp_seed:
if site.skip_ytdlp is None and ytdlp_seed in skip_av_seeds:
logging.info("skipping ytdlp: site in skip_av_seeds")
site.skip_ytdlp = True
return False
else:
site.skip_ytdlp = False

return True


Expand Down
31 changes: 31 additions & 0 deletions job-conf.rst
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,18 @@ Puts a cap on the number of sites belonging to a given job that can be brozzled
simultaneously across the cluster. Addresses the problem of a job with many
seeds starving out other jobs.

``pdfs_only``
galgeek marked this conversation as resolved.
Show resolved Hide resolved
~~~~~~~~~~~~~~~~~~~~~
+---------+----------+-----------+
| type | required | default |
+=========+==========+===========+
| boolean | no | ``false`` |
+---------+----------+-----------+
Limits capture to PDFs based on MIME type. This value will only impact
galgeek marked this conversation as resolved.
Show resolved Hide resolved
processing of outlinks within Brozzler. Fully limiting a crawl to only PDFs
requires an additional entry in the Warcprox-Meta header ``mime-type-filters``
key to block non-PDF responses by MIME type.
gretchenleighmiller marked this conversation as resolved.
Show resolved Hide resolved

``seeds``
~~~~~~~~~
+------------------------+----------+---------+
Expand Down Expand Up @@ -158,6 +170,25 @@ other fields like checkboxes and/or hidden fields, brozzler will leave
the default values in place. Brozzler submits login forms after page load.
Then brozzling proceeds as usual.

``video_capture``
~~~~~~~~~~~~~~~~~
+--------+----------+--------------------------+
| type | required | default |
+========+==========+==========================+
| string | no | ``ENABLE_VIDEO_CAPTURE`` |
+--------+----------+--------------------------+
Determines the level of video capture for the seed. This is an enumeration with four possible values:

* ENABLE_VIDEO_CAPTURE (default): All video is captured.
* DISABLE_VIDEO_CAPTURE: No video is captured. This is effectively a
combination of the next two values.
* BLOCK_VIDEO_MIME_TYPES: Any response with a Content-Type header containing
the word "video" is not captured.
* DISABLE_YTDLP_CAPTURE: Video capture via yt-dlp is disabled.

*Note: Ensuring full video MIME type blocking requires an additional entry in
the Warcprox-Meta header `mime-type-filters` key.*

Seed-level / top-level settings
-------------------------------
These are seed settings that can also be specified at the top level, in which
Expand Down