From 3d381a055b0064b8fe965b05bc1379d8f38974c7 Mon Sep 17 00:00:00 2001
From: jxu <7989982+jxu@users.noreply.github.com>
Date: Tue, 28 Jul 2020 19:10:27 -0400
Subject: [PATCH] Refactor project to use single problems page

Remove PhantomJS / javascript code
Simplify and rename python script
Update README
---
 README.md         | 23 +++++++-------
 capture.js        | 78 -----------------------------------------------
 combine.py        | 68 -----------------------------------------
 download_extra.py | 49 +++++++++++++++++++++++++++++
 4 files changed, 61 insertions(+), 157 deletions(-)
 delete mode 100644 capture.js
 delete mode 100644 combine.py
 create mode 100644 download_extra.py

diff --git a/README.md b/README.md
index 50b17d0..081676c 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 Project Euler Offline
 =====================
-All Project Euler problems, with MathJax and images, as a single PDF. Additional text files are provided. Get the releases [here](https://github.com/wxv/project-euler-offline/releases).
+All Project Euler problems, with MathJax and images, as a single PDF. Additional text files are provided. [Get the releases here.](https://github.com/wxv/project-euler-offline/releases)
 
 Please report any inaccuracies or give feedback. Thanks.
 
@@ -8,19 +8,20 @@ Inspired by [Kyle Keen's original Local Euler](http://kmkeen.com/local-euler/200
 
 Installation and Usage
 ----------------------
+
+Note: previously, PhantomJS was used to download each problem individually as a PDF, and PyPDF2 was used to combine all of the problems.
+
+Now, open https://projecteuler.net/show=all in Firefox and use "Print to File" (with headers and footers disabled in the print options). This is simpler, produces a smaller PDF, and does not rely on the discontinued PhantomJS. The Python script that downloads extra files is functionally unchanged.
+
 Requirements:
-- PhantomJS (`apt install phantomjs`)
-- Node modules system, webpage (`npm install system webpage`)
-- Python 3 and PyPDF2, BeautifulSoup, lxml, Pillow (`pip install beautifulsoup4 lxml pypdf2 pillow`)
+- Python 3 and BeautifulSoup, lxml, Pillow (`pip install beautifulsoup4 lxml pillow`)
 - BeautifulSoup and Pillow are only required for downloading extra text and images (animated GIF only).
 
-My usage process (replace 1 and 628 with whatever range you like):
+My usage process:
 
-    phantomjs capture.js 1 628
-    python3 combine.py 1 628
-    // Optional: download solutions from https://github.com/luckytoilet/projecteuler-solutions
+    mkdir render
+    # Save render/problems.pdf with Firefox as above
+    python3 download_extra.py
     cd render
-    zip problems problems.pdf *.txt *.gif
+    zip problems.zip problems.pdf *.txt *.gif
 
-Since each page is independent, it is possible to run multiple processes of
-`capture.js` at once, each downloading a certain range.
\ No newline at end of file
diff --git a/capture.js b/capture.js
deleted file mode 100644
index 6e50e2b..0000000
--- a/capture.js
+++ /dev/null
@@ -1,78 +0,0 @@
-"use strict";
-
-var renderDir = "render";
-
-
-var arrayOfUrls = [];
-var system = require('system');
-var problemIDStart = system.args[1];
-var problemIDEnd = system.args[2];
-
-for (var problemID = problemIDStart; problemID <= problemIDEnd; problemID++) {
-    arrayOfUrls.push("https://projecteuler.net/problem=" + problemID.toString());
-}
-
-/*
-Render given urls
-@param array of URLs to render
-@param callbackPerUrl Function called after finishing each URL, including the last URL
-@param callbackFinal Function called after finishing everything
-*/
-var RenderUrlsToFile = function(urls, callbackPerUrl, callbackFinal) {
-    var webpage = require("webpage");
-    var page = null;
-
-    var getFilename = function(url) {
-        return renderDir + '/' + url.split('=')[1] + ".pdf";
-    };
-    var next = function(status, url, file) {
-        page.close();
-        callbackPerUrl(status, url, file);
-        return retrieve();
-    };
-    var retrieve = function() {
-        if (urls.length > 0) {
-            var url = urls.shift();
-            page = webpage.create();
-            page.viewportSize = {
-                width: 1000,
-                height: 1000
-            };
-
-            page.paperSize = {
-                format: 'Letter',
-                margin: '50px'
-            };
-
-            page.settings.userAgent = "Project Euler Offline bot";
-
-            return page.open(url, function(status) {
-                var file = getFilename(url);
-                if (status === "success") {
-                    return window.setTimeout((function() {
-                        page.render(file);
-                        return next(status, url, file);
-                    }), 2000);
-                } else {
-                    return next(status, url, file);
-                }
-            });
-        } else {
-            return callbackFinal();
-        }
-    };
-    return retrieve();
-};
-
-
-
-RenderUrlsToFile(arrayOfUrls, (function(status, url, file) {
-    if (status !== "success") {
-        return console.log("Unable to render '" + url + "'");
-    } else {
-        return console.log("Rendered '" + url + "' at '" + file + "'");
-    }
-}), function() {
-    return phantom.exit();
-});
-
diff --git a/combine.py b/combine.py
deleted file mode 100644
index 8e0d373..0000000
--- a/combine.py
+++ /dev/null
@@ -1,68 +0,0 @@
-import sys
-from os import sep
-from PyPDF2 import PdfFileMerger
-
-RENDER_DIR = "render"
-DOWNLOAD_EXTRA_FLAG = True
-SITE_MAIN = "https://projecteuler.net/"
-
-
-def download_extra(url):
-    """Finds if available a .txt attachment or animated .gif and downloads it
-    to RENDER_DIR
-    """
-    # Not async for now to keep rate of requests low
-    from bs4 import BeautifulSoup
-    import requests
-    from os.path import basename
-    from PIL import Image
-    from io import BytesIO
-
-    print("Searching", url)
-    content = requests.get(url).content
-    soup = BeautifulSoup(content, "lxml")
-    for a in soup.find_all('a', href=True):
-        href = a["href"]
-        if href.endswith(".txt"):
-            print("Found and writing", href)
-            r = requests.get(SITE_MAIN + href)
-            with open(RENDER_DIR + sep + a.text, 'wb') as f:
-                f.write(r.content)
-
-    for img in soup.find_all("img"):
-        img_src = img["src"]
-        # Ignore spacer.gif (blank)
-        if img_src.endswith(".gif") and "spacer" not in img_src:
-            print("Found", img_src)
-            r = requests.get(SITE_MAIN + img_src)
-            # Only write animated GIFs
-            if Image.open(BytesIO(r.content)).is_animated:
-                print("Writing", img_src)
-                with open(RENDER_DIR + sep + basename(img_src), 'wb') as f:
-                    f.write(r.content)
-
-
-def main():
-    problem_id_start = int(sys.argv[1])
-    problem_id_end = int(sys.argv[2])
-
-    merger = PdfFileMerger()
-
-    for problem_id in range(problem_id_start, problem_id_end+1):
-        pdf_path = RENDER_DIR + sep + str(problem_id) + ".pdf"
-        merger.append(pdf_path)
-
-    merger.write(RENDER_DIR + sep + "problems.pdf")
-    print("Merged PDFs")
-
-    if DOWNLOAD_EXTRA_FLAG:
-        url_list = []
-        for problem_id in range(problem_id_start, problem_id_end+1):
-            url_list.append(SITE_MAIN + "problem=" + str(problem_id))
-
-        for url in url_list:
-            download_extra(url)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/download_extra.py b/download_extra.py
new file mode 100644
index 0000000..5c9d8db
--- /dev/null
+++ b/download_extra.py
@@ -0,0 +1,49 @@
+import sys
+from os import sep
+# Not async for now to keep rate of requests low
+from bs4 import BeautifulSoup
+import requests
+from os.path import basename
+from PIL import Image
+from io import BytesIO
+
+
+RENDER_DIR = "render"
+SITE_MAIN = "https://projecteuler.net/"
+
+
+def download_extra(url):
+    """Find any .txt attachments and animated .gifs on the page and
+    download them to RENDER_DIR.
+    """
+    content = requests.get(url).content
+    soup = BeautifulSoup(content, "lxml")
+    for a in soup.find_all('a', href=True):
+        href = a["href"]
+        if href.endswith(".txt"):
+            print("Writing", href)
+            r = requests.get(SITE_MAIN + href)
+            with open(RENDER_DIR + sep + basename(href), 'wb') as f:
+                f.write(r.content)
+
+    for img in soup.find_all("img"):
+        img_src = img["src"]
+
+        # Skip non-GIFs and spacer.gif
+        if not img_src.endswith(".gif") or img_src.endswith("spacer.gif"):
+            continue
+
+        r = requests.get(SITE_MAIN + img_src)
+        # Only write animated GIFs
+        if Image.open(BytesIO(r.content)).is_animated:
+            print("Writing", img_src)
+            with open(RENDER_DIR + sep + basename(img_src), 'wb') as f:
+                f.write(r.content)
+
+
+def main():
+    download_extra("https://projecteuler.net/show=all")
+
+
+if __name__ == "__main__":
+    main()
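
A note on the animated-GIF filter in download_extra.py: Pillow's GIF plugin exposes n_frames and is_animated, and a single-frame placeholder GIF reports is_animated == False, which is why it is never written out. A minimal sketch of that check (the file path is hypothetical; any GIF already saved under render/ works):

    from PIL import Image

    # Hypothetical path for illustration.
    im = Image.open("render/example.gif")
    # Pillow reports the frame count and animation status of a GIF;
    # download_extra.py keeps a GIF only when is_animated is True.
    print(im.format, im.n_frames, im.is_animated)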
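
The script builds absolute URLs with SITE_MAIN + href, which assumes every href and src on the page is site-relative. If the site ever served absolute links, urllib.parse.urljoin would handle both cases uniformly; this is a suggestion rather than part of the patch, and the resource path below is made up for illustration:

    from urllib.parse import urljoin

    SITE_MAIN = "https://projecteuler.net/"
    # Relative paths resolve against the base...
    print(urljoin(SITE_MAIN, "project/resources/names.txt"))
    # ...while absolute URLs pass through unchanged.
    print(urljoin(SITE_MAIN, "https://projecteuler.net/about"))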
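
The "Not async" comment keeps the request rate low simply by staying synchronous. If throttling ever needs to be explicit, one option is a small wrapper that sleeps between fetches; this sketch is an assumption, not part of the patch (polite_get is a made-up helper name, and the user-agent string is the one the old capture.js sent):

    import time
    import requests

    def polite_get(url, delay=1.0):
        """Fetch url, then pause so successive calls stay near one request per second."""
        r = requests.get(url, headers={"User-Agent": "Project Euler Offline bot"})
        time.sleep(delay)
        return r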