feat: Add parallel processing #11

Merged: 1 commit, Oct 30, 2024
2 changes: 1 addition & 1 deletion .github/workflows/bump-version.yaml
@@ -9,6 +9,6 @@ jobs:
  bump-version:
    uses: janw/workflows/.github/workflows/commitizen-bump-version.yaml@main
    secrets:
      personal-access-token: ${{ secrets.PERSONAL_ACCESS_TOKEN }}
      personal-access-token: ${{ secrets.BOT_PERSONAL_ACCESS_TOKEN }}
      gpg-private-key: ${{ secrets.GPG_PRIVATE_KEY }}
      gpg-passphrase: ${{ secrets.GPG_PASSPHRASE }}
46 changes: 46 additions & 0 deletions .github/workflows/linters.yaml
@@ -6,3 +6,49 @@ on:
jobs:
  commitizen:
    uses: janw/workflows/.github/workflows/commitizen.yaml@main

  pre-commit:
    runs-on: ubuntu-latest
    steps:
      - name: Check out
        uses: actions/checkout@v3
        with:
          token: ${{ secrets.BOT_PERSONAL_ACCESS_TOKEN }}

      - name: Install poetry
        run: pipx install poetry

      - name: Set up python environment
        id: setup-python  # referenced by the cache key below
        uses: actions/setup-python@v5
        with:
          cache: poetry
          python-version: 3.x

      - name: Install dependencies
        run: poetry install --sync

      - id: cache-restore
        uses: actions/cache/restore@v4
        with:
          path: ~/.cache/pre-commit
          key: pre-commit-v0|${{ steps.setup-python.outputs.python-version }}|${{ hashFiles('.pre-commit-config.yaml') }}

      - run: poetry run pre-commit run --show-diff-on-failure --color=always --all-files
        shell: bash

      - uses: stefanzweifel/git-auto-commit-action@v5
        if: >
          always()
          && !startsWith(github.event.head_commit.message, 'build(autofix):')
        with:
          commit_message: "build(autofix): Auto-fix linting issues"
          commit_user_name: "Jan Willhaus [bot]"
          commit_user_email: "[email protected]"
          commit_author: Jan Willhaus [bot] <[email protected]>

      - id: cache-save
        uses: actions/cache/save@v4
        if: always() && steps.cache-restore.outputs.cache-hit != 'true'
        with:
          key: ${{ steps.cache-restore.outputs.cache-primary-key }}
          path: ~/.cache/pre-commit
1 change: 1 addition & 0 deletions .gitignore
@@ -85,6 +85,7 @@ ipython_config.py

# pyenv
.python-version
.tool-versions

# celery beat schedule file
celerybeat-schedule
34 changes: 23 additions & 11 deletions .pre-commit-config.yaml
@@ -1,16 +1,15 @@
ci:
  autoupdate_commit_msg: 'build(pre-commit): pre-commit.ci autoupdate'
  autoupdate_schedule: weekly
  autofix_commit_msg: 'ci(pre-commit): auto fixes from pre-commit hooks'
  autofix_prs: true

default_install_hook_types:
  - pre-commit
default_stages:
  - pre-commit
repos:
  - repo: meta
    hooks:
      - id: check-hooks-apply

  - repo: https://github.com/janw/pre-commit-hooks
    rev: v0.1.0
    hooks:
      - id: sync_ruff_version

  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: 'v0.6.9'
    rev: 'v0.7.1'
    hooks:
      - id: ruff
        args: [ --fix, --exit-non-zero-on-fix ]
@@ -28,4 +27,17 @@ repos:
  - repo: https://github.com/python-poetry/poetry
    rev: '1.8.0'
    hooks:
      - id: poetry-lock
        args:
          - --no-update
      - id: poetry-check

  - repo: local
    hooks:
      - id: mypy
        name: mypy
        entry: poetry run mypy
        language: system
        require_serial: true
        pass_filenames: false
        types: [python]
112 changes: 1 addition & 111 deletions letterboxd_rss/__init__.py
@@ -1,111 +1 @@
import re

from requests import session
from bs4 import BeautifulSoup
from feedgen.feed import FeedGenerator

match_imdb = re.compile(r"^https?://www.imdb.com")
match_tmdb = re.compile(r"^https?://www.themoviedb.org")

base_url = "https://letterboxd.com/"

s = session()


def process(args):
    watchlist_url = args.letterboxd_url.rstrip("/")
    if not watchlist_url.startswith("https://"):
        watchlist_url = f"{base_url}{watchlist_url}"
    if not watchlist_url.endswith("watchlist"):
        watchlist_url += "/watchlist"
    watchlist_url += "/"

    feedlen = args.max_length
    output_file = args.output
    page_title = "The Dude's Watchlist"

    feed = FeedGenerator()
    feed.title(page_title)
    feed.id(watchlist_url)
    feed.link(href=watchlist_url, rel="alternate")
    feed.description(page_title + " from Letterboxd")

    # Get first page, gather general data
    r = s.get(watchlist_url)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")

    watchlist_title = soup.find("meta", attrs={"property": "og:title"})
    page_title = watchlist_title.attrs["content"]

    m = soup.find("span", attrs={"class": "js-watchlist-count"})
    if len(m) > 0:
        total_movies = int(m.text.split()[0])
        print(f"Found a total of {total_movies} movies")

    paginator = soup.find_all("li", attrs={"class": "paginate-page"})
    page_count = int(paginator[-1].text) if paginator else 1
    last_page_index = page_count + 1

    movies_added = 0
    for page in range(1, last_page_index):
        if page > 1:
            r = s.get(watchlist_url + "/page/%i/" % page)
            soup = BeautifulSoup(r.text, "html.parser")
            print()

        ul = soup.find("ul", attrs={"class": "poster-list"})
        movies = ul.find_all("li")
        movies_on_page = len(movies)

        print(f"Gathering on page {page} (contains {movies_on_page} movies)\n")

        for movie in movies:
            added = extract_metadata(movie, feed)

            # Update total counter
            movies_added += added
            if feedlen > 0 and movies_added >= feedlen:
                print("\nReached desired maximum feed length")
                break

        if feedlen > 0 and movies_added >= feedlen:
            break

    if movies_added > 0:
        print(f"Writing feed to {output_file}")
        feed.rss_file(output_file)


def extract_metadata(movie, feed):
    movie_url = base_url + "film/" + movie.div.attrs["data-film-slug"]
    movie_page = s.get(movie_url)
    movie_soup = BeautifulSoup(movie_page.text, "html.parser")

    try:
        movie_title = movie_soup.find("meta", attrs={"property": "og:title"}).attrs[
            "content"
        ]
        print("Adding", movie_title)
        movie_link = movie_soup.find(
            "a", attrs={"href": [match_imdb, match_tmdb]}
        ).attrs["href"]
        if movie_link.endswith("/maindetails"):
            movie_link = movie_link[:-11]
        movie_description = movie_soup.find(
            "meta", attrs={"property": "og:description"}
        )
        if movie_description is not None:
            movie_description = movie_description.text.strip()

        item = feed.add_item()
        item.title(movie_title)
        item.description(movie_description)
        item.link(href=movie_link, rel="alternate")
        item.guid(movie_link)

        return 1
    except Exception:
        print("Parsing failed on", movie_url)

    return 0
__version__ = "v0.3.0"
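
With this refactor the package root keeps only the version string, and the logic that used to live in `__init__.py` moves into dedicated modules. A quick orientation sketch of the new import paths, based solely on the files in this diff:

```python
from letterboxd_rss import __version__       # package root now only exposes the version
from letterboxd_rss.base import process      # replaces the old `from letterboxd_rss import process`
from letterboxd_rss.cli import main          # former letterboxd_rss/__main__.py entry point
from letterboxd_rss.feed import create_feed  # feed assembly split out of process()

print(__version__)  # "v0.3.0"
```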
57 changes: 57 additions & 0 deletions letterboxd_rss/base.py
@@ -0,0 +1,57 @@
from __future__ import annotations

from concurrent.futures import Future, ThreadPoolExecutor, wait
from typing import TYPE_CHECKING, Dict, List, Optional

from bs4 import BeautifulSoup
from bs4.element import Tag

from letterboxd_rss.feed import create_feed
from letterboxd_rss.parsing import parse_page
from letterboxd_rss.session import session
from letterboxd_rss.utils import make_watchlist_url

if TYPE_CHECKING:
from feedgen.feed import FeedEntry


def process(
    letterboxd_url: str,
    output_file: str,
    max_length: int,
) -> None:
    page_title = ""
    watchlist_url = make_watchlist_url(letterboxd_url)
    next_url: Optional[str] = watchlist_url + "page/1/"
    remaining_count = max_length
    with ThreadPoolExecutor(max_workers=4) as pool:
        future_to_url: Dict[Future[FeedEntry], str] = {}

        while next_url and remaining_count > 0:
            r = session.get_and_raise(next_url)
            soup = BeautifulSoup(r.text, "html.parser")

            next_url, _futures = parse_page(soup, max_movies=remaining_count, pool=pool)
            future_to_url.update(_futures)
            remaining_count -= len(_futures)

        entries: List[FeedEntry] = []
        for future in wait(future_to_url).done:
            url = future_to_url[future]
            try:
                entry = future.result()
            except Exception as exc:
                print("%r generated an exception: %s" % (url, exc))
            else:
                entries.append(entry)

    watchlist_title = soup.find("meta", attrs={"property": "og:title"})
    page_title = watchlist_title.attrs["content"] if isinstance(watchlist_title, Tag) else "The Dude's Watchlist"

    if entries:
        create_feed(
            entries,
            page_title=page_title,
            watchlist_url=watchlist_url,
            output_file=output_file,
        )
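
`parse_page`, `make_watchlist_url`, and `session` come from modules (`letterboxd_rss/parsing.py`, `utils.py`, `session.py`) that are not included in this view. The sketch below is only an inference from the call sites in `base.py`: the watchlist-URL normalization mirrors the logic deleted from `__init__.py`, and `parse_page` is given as a signature stub, not the PR's actual implementation.

```python
# Hypothetical interfaces inferred from base.py's call sites; not the code merged in this PR.
from __future__ import annotations

from concurrent.futures import Future, ThreadPoolExecutor
from typing import Dict, Optional, Tuple

from bs4 import BeautifulSoup
from feedgen.feed import FeedEntry

BASE_URL = "https://letterboxd.com"


def make_watchlist_url(letterboxd_url: str) -> str:
    """Turn a bare username or profile URL into 'https://letterboxd.com/<user>/watchlist/'."""
    url = letterboxd_url.rstrip("/")
    if not url.startswith("https://"):
        url = f"{BASE_URL}/{url}"
    if not url.endswith("/watchlist"):
        url += "/watchlist"
    return url + "/"


def parse_page(
    soup: BeautifulSoup,
    max_movies: int,
    pool: ThreadPoolExecutor,
) -> Tuple[Optional[str], Dict[Future[FeedEntry], str]]:
    """Submit one scraping task per film on the page (at most max_movies) to the pool,
    returning the next page's URL (None on the last page) and a future -> film-URL map."""
    raise NotImplementedError  # see letterboxd_rss/parsing.py in the repository
```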
18 changes: 11 additions & 7 deletions letterboxd_rss/__main__.py → letterboxd_rss/cli.py
@@ -1,9 +1,12 @@
import sys
from __future__ import annotations

import argparse
from letterboxd_rss import process
from typing import List, Optional

from letterboxd_rss.base import process


def main(argv=None):
def main(argv: Optional[List[str]] = None) -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "letterboxd_url",
@@ -26,7 +29,8 @@ def main(argv=None):
        help="Maximum number of watchlist items to keep in the feed",
    )
    args = parser.parse_args(argv)
    process(args)


main(sys.argv[1:])
    process(
        letterboxd_url=args.letterboxd_url,
        output_file=args.output,
        max_length=args.max_length,
    )
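
Since `process()` now takes plain keyword arguments instead of an argparse `Namespace`, it can also be called without going through the CLI. A small usage sketch; the username and output path are placeholders:

```python
from letterboxd_rss.base import process

# Build the RSS feed for a watchlist programmatically; values are illustrative only.
process(
    letterboxd_url="some-user",        # profile name or full letterboxd.com URL
    output_file="watchlist-feed.xml",  # where the RSS XML is written
    max_length=50,                     # cap on the number of feed items
)
```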
9 changes: 9 additions & 0 deletions letterboxd_rss/constants.py
@@ -0,0 +1,9 @@
from letterboxd_rss import __version__

PROG_NAME = "letterboxd-rss"
USER_AGENT = f"{PROG_NAME}/{__version__} (https://github.com/janw/{PROG_NAME})"

REQUESTS_TIMEOUT = 30


BASE_URL = "https://letterboxd.com"
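
`USER_AGENT` and `REQUESTS_TIMEOUT` are presumably consumed by `letterboxd_rss/session.py`, which this view does not show. A plausible sketch of such a session wrapper, assuming it subclasses `requests.Session` and provides the `get_and_raise` helper used in `base.py`; the merged implementation may differ:

```python
# Hypothetical letterboxd_rss/session.py, inferred from constants.py and base.py.
from requests import Response, Session

from letterboxd_rss.constants import REQUESTS_TIMEOUT, USER_AGENT


class _LetterboxdSession(Session):
    def __init__(self) -> None:
        super().__init__()
        # Identify the scraper on every request.
        self.headers.update({"User-Agent": USER_AGENT})

    def get_and_raise(self, url: str) -> Response:
        # GET with a timeout and raise on HTTP errors, so callers can
        # assume the body is a valid watchlist page.
        response = self.get(url, timeout=REQUESTS_TIMEOUT)
        response.raise_for_status()
        return response


session = _LetterboxdSession()
```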
18 changes: 18 additions & 0 deletions letterboxd_rss/feed.py
@@ -0,0 +1,18 @@
from __future__ import annotations

from typing import List

from feedgen.feed import FeedEntry, FeedGenerator


def create_feed(entries: List[FeedEntry], page_title: str, watchlist_url: str, output_file: str) -> None:
    feed = FeedGenerator()
    feed.title(page_title)
    feed.id(watchlist_url)
    feed.link(href=watchlist_url, rel="alternate")
    feed.description(page_title + " from Letterboxd")
    for entry in entries:
        feed.add_entry(entry)

    print(f"Writing feed to {output_file}")
    feed.rss_file(output_file)
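
`create_feed` takes already-populated `FeedEntry` objects and attaches them via `FeedGenerator.add_entry()`. A minimal usage sketch with one hand-built entry; the film data is invented for illustration:

```python
from feedgen.feed import FeedEntry

from letterboxd_rss.feed import create_feed

# Populate a standalone entry the way the scraper would, then write a one-item feed.
entry = FeedEntry()
entry.title("Some Film (1999)")
entry.link(href="https://www.imdb.com/title/tt0000001/", rel="alternate")
entry.guid("https://www.imdb.com/title/tt0000001/")
entry.description("Placeholder synopsis used for this example.")

create_feed(
    [entry],
    page_title="The Dude's Watchlist",
    watchlist_url="https://letterboxd.com/some-user/watchlist/",
    output_file="watchlist-feed.xml",
)
```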