feat: Add parallel processing #11

Merged: 1 commit, Oct 30, 2024
2 changes: 1 addition & 1 deletion .github/workflows/bump-version.yaml
@@ -9,6 +9,6 @@ jobs:
  bump-version:
    uses: janw/workflows/.github/workflows/commitizen-bump-version.yaml@main
    secrets:
      personal-access-token: ${{ secrets.PERSONAL_ACCESS_TOKEN }}
      personal-access-token: ${{ secrets.BOT_PERSONAL_ACCESS_TOKEN }}
      gpg-private-key: ${{ secrets.GPG_PRIVATE_KEY }}
      gpg-passphrase: ${{ secrets.GPG_PASSPHRASE }}
46 changes: 46 additions & 0 deletions .github/workflows/linters.yaml
@@ -6,3 +6,49 @@ on:
jobs:
  commitizen:
    uses: janw/workflows/.github/workflows/commitizen.yaml@main

  pre-commit:
    runs-on: ubuntu-latest
    steps:
      - name: Check out
        uses: actions/checkout@v3
        with:
          token: ${{ secrets.BOT_PERSONAL_ACCESS_TOKEN }}

      - name: Install poetry
        run: pipx install poetry

      - name: Set up python environment
        id: setup-python  # referenced by the cache key below
        uses: actions/setup-python@v5
        with:
          cache: poetry
          python-version: 3.x

      - name: Install dependencies
        run: poetry install --sync

      - id: cache-restore
        uses: actions/cache/restore@v4
        with:
          path: ~/.cache/pre-commit
          key: pre-commit-v0|${{ steps.setup-python.outputs.python-version }}|${{ hashFiles('.pre-commit-config.yaml') }}

      - run: poetry run pre-commit run --show-diff-on-failure --color=always --all-files
        shell: bash

      - uses: stefanzweifel/git-auto-commit-action@v5
        if: >
          always()
          && !startsWith(github.event.head_commit.message, 'build(autofix):')
        with:
          commit_message: "build(autofix): Auto-fix linting issues"
          commit_user_name: "Jan Willhaus [bot]"
          commit_user_email: "[email protected]"
          commit_author: Jan Willhaus [bot] <[email protected]>

      - id: cache-save
        uses: actions/cache/save@v4
        if: always() && steps.cache-restore.outputs.cache-hit != 'true'
        with:
          key: ${{ steps.cache-restore.outputs.cache-primary-key }}
          path: ~/.cache/pre-commit
1 change: 1 addition & 0 deletions .gitignore
@@ -85,6 +85,7 @@ ipython_config.py

# pyenv
.python-version
.tool-versions

# celery beat schedule file
celerybeat-schedule
34 changes: 23 additions & 11 deletions .pre-commit-config.yaml
@@ -1,16 +1,15 @@
ci:
  autoupdate_commit_msg: 'build(pre-commit): pre-commit.ci autoupdate'
  autoupdate_schedule: weekly
  autofix_commit_msg: 'ci(pre-commit): auto fixes from pre-commit hooks'
  autofix_prs: true

default_install_hook_types:
  - pre-commit
default_stages:
  - pre-commit
repos:
  - repo: meta
    hooks:
      - id: check-hooks-apply

  - repo: https://github.com/janw/pre-commit-hooks
    rev: v0.1.0
    hooks:
      - id: sync_ruff_version

  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: 'v0.6.9'
    rev: 'v0.7.1'
    hooks:
      - id: ruff
        args: [ --fix, --exit-non-zero-on-fix ]
@@ -28,4 +27,17 @@ repos:
  - repo: https://github.com/python-poetry/poetry
    rev: '1.8.0'
    hooks:
      - id: poetry-lock
        args:
          - --no-update
      - id: poetry-check

  - repo: local
    hooks:
      - id: mypy
        name: mypy
        entry: poetry run mypy
        language: system
        require_serial: true
        pass_filenames: false
        types: [python]
112 changes: 1 addition & 111 deletions letterboxd_rss/__init__.py
@@ -1,111 +1 @@
import re

from requests import session
from bs4 import BeautifulSoup
from feedgen.feed import FeedGenerator

match_imdb = re.compile(r"^https?://www.imdb.com")
match_tmdb = re.compile(r"^https?://www.themoviedb.org")

base_url = "https://letterboxd.com/"

s = session()


def process(args):
    watchlist_url = args.letterboxd_url.rstrip("/")
    if not watchlist_url.startswith("https://"):
        watchlist_url = f"{base_url}{watchlist_url}"
    if not watchlist_url.endswith("watchlist"):
        watchlist_url += "/watchlist"
    watchlist_url += "/"

    feedlen = args.max_length
    output_file = args.output
    page_title = "The Dude's Watchlist"

    feed = FeedGenerator()
    feed.title(page_title)
    feed.id(watchlist_url)
    feed.link(href=watchlist_url, rel="alternate")
    feed.description(page_title + " from Letterboxd")

    # Get first page, gather general data
    r = s.get(watchlist_url)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")

    watchlist_title = soup.find("meta", attrs={"property": "og:title"})
    page_title = watchlist_title.attrs["content"]

    m = soup.find("span", attrs={"class": "js-watchlist-count"})
    if len(m) > 0:
        total_movies = int(m.text.split()[0])
        print(f"Found a total of {total_movies} movies")

    paginator = soup.find_all("li", attrs={"class": "paginate-page"})
    page_count = int(paginator[-1].text) if paginator else 1
    last_page_index = page_count + 1

    movies_added = 0
    for page in range(1, last_page_index):
        if page > 1:
            r = s.get(watchlist_url + "/page/%i/" % page)
            soup = BeautifulSoup(r.text, "html.parser")
            print()

        ul = soup.find("ul", attrs={"class": "poster-list"})
        movies = ul.find_all("li")
        movies_on_page = len(movies)

        print(f"Gathering on page {page} (contains {movies_on_page} movies)\n")

        for movie in movies:
            added = extract_metadata(movie, feed)

            # Update total counter
            movies_added += added
            if feedlen > 0 and movies_added >= feedlen:
                print("\nReached desired maximum feed length")
                break

        if feedlen > 0 and movies_added >= feedlen:
            break

    if movies_added > 0:
        print(f"Writing feed to {output_file}")
        feed.rss_file(output_file)


def extract_metadata(movie, feed):
    movie_url = base_url + "film/" + movie.div.attrs["data-film-slug"]
    movie_page = s.get(movie_url)
    movie_soup = BeautifulSoup(movie_page.text, "html.parser")

    try:
        movie_title = movie_soup.find("meta", attrs={"property": "og:title"}).attrs[
            "content"
        ]
        print("Adding", movie_title)
        movie_link = movie_soup.find(
            "a", attrs={"href": [match_imdb, match_tmdb]}
        ).attrs["href"]
        if movie_link.endswith("/maindetails"):
            movie_link = movie_link[:-11]
        movie_description = movie_soup.find(
            "meta", attrs={"property": "og:description"}
        )
        if movie_description is not None:
            movie_description = movie_description.text.strip()

        item = feed.add_item()
        item.title(movie_title)
        item.description(movie_description)
        item.link(href=movie_link, rel="alternate")
        item.guid(movie_link)

        return 1
    except Exception:
        print("Parsing failed on", movie_url)

    return 0
__version__ = "v0.3.0"
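
With this refactor the package root keeps only the version string, and the logic that used to live in `__init__.py` moves into dedicated modules. A quick orientation sketch of the new import paths, based solely on the files in this diff:

```python
from letterboxd_rss import __version__       # package root now only exposes the version
from letterboxd_rss.base import process      # replaces the old `from letterboxd_rss import process`
from letterboxd_rss.cli import main          # former letterboxd_rss/__main__.py entry point
from letterboxd_rss.feed import create_feed  # feed assembly split out of process()

print(__version__)  # "v0.3.0"
```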
57 changes: 57 additions & 0 deletions letterboxd_rss/base.py
@@ -0,0 +1,57 @@
from __future__ import annotations

from concurrent.futures import Future, ThreadPoolExecutor, wait
from typing import TYPE_CHECKING, Dict, List, Optional

from bs4 import BeautifulSoup
from bs4.element import Tag

from letterboxd_rss.feed import create_feed
from letterboxd_rss.parsing import parse_page
from letterboxd_rss.session import session
from letterboxd_rss.utils import make_watchlist_url

if TYPE_CHECKING:
from feedgen.feed import FeedEntry


def process(
    letterboxd_url: str,
    output_file: str,
    max_length: int,
) -> None:
    page_title = ""
    watchlist_url = make_watchlist_url(letterboxd_url)
    next_url: Optional[str] = watchlist_url + "page/1/"
    remaining_count = max_length
    with ThreadPoolExecutor(max_workers=4) as pool:
        future_to_url: Dict[Future[FeedEntry], str] = {}

        while next_url and remaining_count > 0:
            r = session.get_and_raise(next_url)
            soup = BeautifulSoup(r.text, "html.parser")

            next_url, _futures = parse_page(soup, max_movies=remaining_count, pool=pool)
            future_to_url.update(_futures)
            remaining_count -= len(_futures)

        entries: List[FeedEntry] = []
        for future in wait(future_to_url).done:
            url = future_to_url[future]
            try:
                entry = future.result()
            except Exception as exc:
                print("%r generated an exception: %s" % (url, exc))
            else:
                entries.append(entry)

    watchlist_title = soup.find("meta", attrs={"property": "og:title"})
    page_title = watchlist_title.attrs["content"] if isinstance(watchlist_title, Tag) else "The Dude's Watchlist"

    if entries:
        create_feed(
            entries,
            page_title=page_title,
            watchlist_url=watchlist_url,
            output_file=output_file,
        )
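
`parse_page`, `make_watchlist_url`, and `session` come from modules (`letterboxd_rss/parsing.py`, `utils.py`, `session.py`) that are not included in this view. The sketch below is only an inference from the call sites in `base.py`: the watchlist-URL normalization mirrors the logic deleted from `__init__.py`, and `parse_page` is given as a signature stub, not the PR's actual implementation.

```python
# Hypothetical interfaces inferred from base.py's call sites; not the code merged in this PR.
from __future__ import annotations

from concurrent.futures import Future, ThreadPoolExecutor
from typing import Dict, Optional, Tuple

from bs4 import BeautifulSoup
from feedgen.feed import FeedEntry

BASE_URL = "https://letterboxd.com"


def make_watchlist_url(letterboxd_url: str) -> str:
    """Turn a bare username or profile URL into 'https://letterboxd.com/<user>/watchlist/'."""
    url = letterboxd_url.rstrip("/")
    if not url.startswith("https://"):
        url = f"{BASE_URL}/{url}"
    if not url.endswith("/watchlist"):
        url += "/watchlist"
    return url + "/"


def parse_page(
    soup: BeautifulSoup,
    max_movies: int,
    pool: ThreadPoolExecutor,
) -> Tuple[Optional[str], Dict[Future[FeedEntry], str]]:
    """Submit one scraping task per film on the page (at most max_movies) to the pool,
    returning the next page's URL (None on the last page) and a future -> film-URL map."""
    raise NotImplementedError  # see letterboxd_rss/parsing.py in the repository
```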
18 changes: 11 additions & 7 deletions letterboxd_rss/__main__.py → letterboxd_rss/cli.py
@@ -1,9 +1,12 @@
import sys
from __future__ import annotations

import argparse
from letterboxd_rss import process
from typing import List, Optional

from letterboxd_rss.base import process


def main(argv=None):
def main(argv: Optional[List[str]] = None) -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "letterboxd_url",
@@ -26,7 +29,8 @@ def main(argv=None):
        help="Maximum number of watchlist items to keep in the feed",
    )
    args = parser.parse_args(argv)
    process(args)


main(sys.argv[1:])
    process(
        letterboxd_url=args.letterboxd_url,
        output_file=args.output,
        max_length=args.max_length,
    )
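
Since `process()` now takes plain keyword arguments instead of an argparse `Namespace`, it can also be called without going through the CLI. A small usage sketch; the username and output path are placeholders:

```python
from letterboxd_rss.base import process

# Build the RSS feed for a watchlist programmatically; values are illustrative only.
process(
    letterboxd_url="some-user",        # profile name or full letterboxd.com URL
    output_file="watchlist-feed.xml",  # where the RSS XML is written
    max_length=50,                     # cap on the number of feed items
)
```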
9 changes: 9 additions & 0 deletions letterboxd_rss/constants.py
@@ -0,0 +1,9 @@
from letterboxd_rss import __version__

PROG_NAME = "letterboxd-rss"
USER_AGENT = f"{PROG_NAME}/{__version__} (https://github.com/janw/{PROG_NAME})"

REQUESTS_TIMEOUT = 30


BASE_URL = "https://letterboxd.com"
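
`USER_AGENT` and `REQUESTS_TIMEOUT` are presumably consumed by `letterboxd_rss/session.py`, which this view does not show. A plausible sketch of such a session wrapper, assuming it subclasses `requests.Session` and provides the `get_and_raise` helper used in `base.py`; the merged implementation may differ:

```python
# Hypothetical letterboxd_rss/session.py, inferred from constants.py and base.py.
from requests import Response, Session

from letterboxd_rss.constants import REQUESTS_TIMEOUT, USER_AGENT


class _LetterboxdSession(Session):
    def __init__(self) -> None:
        super().__init__()
        # Identify the scraper on every request.
        self.headers.update({"User-Agent": USER_AGENT})

    def get_and_raise(self, url: str) -> Response:
        # GET with a timeout and raise on HTTP errors, so callers can
        # assume the body is a valid watchlist page.
        response = self.get(url, timeout=REQUESTS_TIMEOUT)
        response.raise_for_status()
        return response


session = _LetterboxdSession()
```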
18 changes: 18 additions & 0 deletions letterboxd_rss/feed.py
@@ -0,0 +1,18 @@
from __future__ import annotations

from typing import List

from feedgen.feed import FeedEntry, FeedGenerator


def create_feed(entries: List[FeedEntry], page_title: str, watchlist_url: str, output_file: str) -> None:
    feed = FeedGenerator()
    feed.title(page_title)
    feed.id(watchlist_url)
    feed.link(href=watchlist_url, rel="alternate")
    feed.description(page_title + " from Letterboxd")
    for entry in entries:
        feed.add_entry(entry)

    print(f"Writing feed to {output_file}")
    feed.rss_file(output_file)
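
`create_feed` takes already-populated `FeedEntry` objects and attaches them via `FeedGenerator.add_entry()`. A minimal usage sketch with one hand-built entry; the film data is invented for illustration:

```python
from feedgen.feed import FeedEntry

from letterboxd_rss.feed import create_feed

# Populate a standalone entry the way the scraper would, then write a one-item feed.
entry = FeedEntry()
entry.title("Some Film (1999)")
entry.link(href="https://www.imdb.com/title/tt0000001/", rel="alternate")
entry.guid("https://www.imdb.com/title/tt0000001/")
entry.description("Placeholder synopsis used for this example.")

create_feed(
    [entry],
    page_title="The Dude's Watchlist",
    watchlist_url="https://letterboxd.com/some-user/watchlist/",
    output_file="watchlist-feed.xml",
)
```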