Skip to content

Commit

Permalink
yt-dlp proxy handling update
Browse files Browse the repository at this point in the history
  • Loading branch information
Barbara Miller committed Jan 23, 2025
1 parent eb922f5 commit 170377f
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 7 deletions.
17 changes: 16 additions & 1 deletion brozzler/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
"""
brozzler/cli.py - brozzler command line executables
Copyright (C) 2014-2024 Internet Archive
Copyright (C) 2014-2025 Internet Archive
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -611,14 +611,29 @@ def get_skip_av_seeds():
logging.info("running with empty skip_av_seeds")
return skip_av_seeds

def get_proxy_endpoints(endpoints_file="/opt/local/brozzler/proxy_endpoints.txt"):
    """Load proxy endpoints from a plain-text file, one endpoint per line.

    Surrounding whitespace (including the trailing newline of every line)
    is removed and blank lines are skipped, so each returned entry can be
    used directly as a proxy URL.

    Args:
        endpoints_file: path of the file to read. Defaults to the
            deployment location used by brozzler workers.

    Returns:
        A list of endpoint strings. The list is empty when the file is
        missing, unreadable, or contains no non-blank lines; this function
        never raises for a bad or absent file.
    """
    try:
        with open(endpoints_file) as endpoints:
            # Iterating the file yields lines that still end in "\n";
            # strip them and drop blank lines so callers never receive a
            # newline-terminated or empty endpoint.
            stripped_lines = (line.strip() for line in endpoints)
            proxy_endpoints = [line for line in stripped_lines if line]
    except Exception:
        # Best effort: the endpoints file is optional, so any failure to
        # read it simply means running without proxy endpoints.
        logging.info("running with empty proxy endpoints file")
        return []

    if proxy_endpoints:
        logging.info("running with proxy endpoints file %s", endpoints_file)
    return proxy_endpoints

rr = rethinker(args)
frontier = brozzler.RethinkDbFrontier(rr)
service_registry = doublethink.ServiceRegistry(rr)
skip_av_seeds_from_file = get_skip_av_seeds()
proxy_endpoints_from_file = get_proxy_endpoints()
worker = brozzler.worker.BrozzlerWorker(
frontier,
service_registry,
skip_av_seeds=skip_av_seeds_from_file,
proxy_endpoints=proxy_endpoints_from_file,
max_browsers=int(args.max_browsers),
chrome_exe=args.chrome_exe,
proxy=args.proxy,
Expand Down
4 changes: 3 additions & 1 deletion brozzler/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ def __init__(
frontier,
service_registry=None,
skip_av_seeds=None,
proxy_endpoints=None,
max_browsers=1,
chrome_exe="chromium-browser",
warcprox_auto=False,
Expand All @@ -81,6 +82,7 @@ def __init__(
self._frontier = frontier
self._service_registry = service_registry
self._skip_av_seeds = skip_av_seeds
self._proxy_endpoints = proxy_endpoints
self._max_browsers = max_browsers

self._warcprox_auto = warcprox_auto
Expand Down Expand Up @@ -287,7 +289,7 @@ def brozzle_page(
self.logger.info("page interstitial shown (http auth): %s", page)

if enable_youtube_dl and ydl.should_ytdlp(
site, page, status_code, self._skip_av_seeds
site, page, status_code, self._skip_av_seeds, self._proxy_endpoints
):
try:
ydl_outlinks = ydl.do_youtube_dl(self, site, page)
Expand Down
9 changes: 4 additions & 5 deletions brozzler/ydl.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,12 +34,11 @@
thread_local = threading.local()


YTDLP_PROXY = ""
PROXY_ATTEMPTS = 4
YTDLP_WAIT = 10


def should_ytdlp(site, page, page_status, skip_av_seeds):
def should_ytdlp(site, page, page_status, skip_av_seeds, proxy_endpoints):
# called only after we've passed needs_browsing() check

if page_status != 200:
Expand Down Expand Up @@ -285,11 +284,11 @@ def ydl_postprocess_hook(d):

ytdlp_url = page.redirect_url if page.redirect_url else page.url
is_youtube_host = isyoutubehost(ytdlp_url)
if is_youtube_host and YTDLP_PROXY:
ydl_opts["proxy"] = YTDLP_PROXY
if is_youtube_host and proxy_endpoints:
ydl_opts["proxy"] = random.choice(proxy_endpoints)
# don't log proxy value secrets
ytdlp_proxy_for_logs = (
YTDLP_PROXY.split("@")[1] if "@" in YTDLP_PROXY else "@@@"
ydl_opts["proxy"].split("@")[1] if "@" in ydl_opts["proxy"] else "@@@"
)
logging.info("using yt-dlp proxy ... %s", ytdlp_proxy_for_logs)

Expand Down

0 comments on commit 170377f

Please sign in to comment.