diff --git a/README.md b/README.md
index 905c215..d1f5073 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@ Project Euler Offline
 =====================
 All Project Euler problems, with MathJax and images, as a single PDF. Additional text files are provided. Get the releases [here](https://github.com/wxv/project-euler-offline/releases).
 
-Please report any accuracy issues or give feedback. Thanks.
+Please report any inaccuracies or give feedback. Thanks.
 
 Inspired by [Kyle Keen's original Local Euler](http://kmkeen.com/local-euler/2008-07-16-07-33-00.html).
 
@@ -11,7 +11,8 @@ Installation and Usage
 Requirements:
 - PhantomJS (`apt install phantomjs`)
 - Node modules system, webpage (`npm install system webpage`)
-- Python 3 and BeautifulSoup, PyPDF2 (`pip install beautifulsoup4 pypdf2`)
+- Python 3 and PyPDF2, BeautifulSoup, Pillow (`pip install beautifulsoup4 pypdf2 pillow`)
+  - BeautifulSoup and Pillow are only required for downloading extra text and images (animated GIF only).
 
 My usage process (replace 1 and 628 with whatever range you like):
 
@@ -19,5 +20,7 @@ My usage process (replace 1 and 628 with whatever range you like):
     python3 combine.py 1 628
     // Optional: download solutions from https://github.com/luckytoilet/projecteuler-solutions
     cd render
-    zip problems problems.pdf *.txt
+    zip problems problems.pdf *.txt *.gif
+
+Since each page is independent, it is possible to run multiple processes of
+`capture.js` at once, each downloading a certain range.
\ No newline at end of file
diff --git a/combine.py b/combine.py
index 846aa5c..2e50ef8 100644
--- a/combine.py
+++ b/combine.py
@@ -1,25 +1,44 @@
-from bs4 import BeautifulSoup
-import requests
 import sys
 from os import sep
 from PyPDF2 import PdfFileMerger
 
+
 render_dir = "render"
-download_txt_flag = True
+download_extra_flag = True
+
+def download_extra(url):
+    '''Tries to find a .txt attachment or .gif and download it to render_dir.'''
+    # TODO async request
+    from bs4 import BeautifulSoup
+    import requests
+    from os.path import basename
+    from PIL import Image
+    from io import BytesIO
+
+    site_main = "http://projecteuler.net/"
 
-def download_txt(url):
-    '''Tries to find a .txt attachment and download it to render_dir.'''
     print("Searching", url)
     content = requests.get(url).content
    soup = BeautifulSoup(content, "lxml")
 
     for a in soup.find_all('a', href=True):
         if a["href"].endswith(".txt"):
             print("Found", a["href"])
-            r = requests.get("http://projecteuler.net/" + a["href"])
+            r = requests.get(site_main + a["href"])
             with open(render_dir + sep + a.text, 'wb') as f:
                 f.write(r.content)
 
+    for img in soup.find_all("img"):
+        img_src = img["src"]
+        # Ignore spacer.gif (blank)
+        if img_src.endswith(".gif") and "spacer" not in img_src:
+            print("Found", img_src)
+            r = requests.get(site_main + img_src)
+            # Only write animated GIFs
+            if Image.open(BytesIO(r.content)).is_animated:
+                print("Writing", img_src)
+                with open(render_dir + sep + basename(img_src), 'wb') as f:
+                    f.write(r.content)
 
 def main():
@@ -36,13 +55,13 @@ def main():
     print("Merged PDFs")
 
-    if download_txt_flag:
+    if download_extra_flag:
         url_list = []
         for problem_id in range(problem_id_start, problem_id_end+1):
             url_list.append("https://projecteuler.net/problem=" + str(problem_id))
 
         for url in url_list:
-            download_txt(url)
+            download_extra(url)
 
 
 if __name__ == "__main__":
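
The parallel-capture tip added to the README can be scripted. Below is a minimal sketch, assuming `capture.js` accepts a start and end problem number on the command line (the same range arguments the README passes to `combine.py`); `capture_parallel` is a hypothetical helper, not code from this diff:

    # Launch several PhantomJS renderers at once, one per subrange.
    # Assumption: "phantomjs capture.js START END" renders that range.
    import subprocess

    def capture_parallel(start, end, workers=4):
        step = -(-(end - start + 1) // workers)  # ceiling division
        procs = [subprocess.Popen(["phantomjs", "capture.js",
                                   str(lo), str(min(lo + step - 1, end))])
                 for lo in range(start, end + 1, step)]
        for p in procs:
            p.wait()  # block until every subrange has been rendered

    capture_parallel(1, 628)  # same range as the README example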
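One caveat on the new GIF filter in `download_extra`: Pillow only defines `is_animated` on multi-frame image plugins such as GIF, so the bare attribute access is safe here only because of the preceding `.endswith(".gif")` check. A standalone sketch of the same test, hedged with `getattr` (`is_animated_gif` is an illustrative name, not part of the repo):

    from io import BytesIO
    from PIL import Image

    def is_animated_gif(data):
        '''Return True only for GIF payloads with more than one frame.'''
        img = Image.open(BytesIO(data))
        # getattr guards formats whose plugins lack is_animated entirely
        return img.format == "GIF" and getattr(img, "is_animated", False)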
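The merge step itself sits outside the visible hunks; only the `PdfFileMerger` import and the "Merged PDFs" message appear here. For context, a sketch of what that step plausibly looks like with PyPDF2's `PdfFileMerger`, assuming one rendered PDF per problem under `render/` named by problem number (the actual naming scheme is not visible in this diff):

    from os import sep
    from PyPDF2 import PdfFileMerger

    def merge_range(start, end, out_name="problems.pdf"):
        merger = PdfFileMerger()
        for problem_id in range(start, end + 1):
            # append() accepts a path string as well as a file object
            merger.append("render" + sep + str(problem_id) + ".pdf")
        merger.write("render" + sep + out_name)
        merger.close()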