From 3d381a055b0064b8fe965b05bc1379d8f38974c7 Mon Sep 17 00:00:00 2001
From: jxu <7989982+jxu@users.noreply.github.com>
Date: Tue, 28 Jul 2020 19:10:27 -0400
Subject: [PATCH] Refactor project to use single problems page

Remove PhantomJS / javascript code
Simplify and rename python script
Update README
---
 README.md         | 23 +++++++-------
 capture.js        | 78 -----------------------------------------------
 combine.py        | 68 -----------------------------------------
 download_extra.py | 49 +++++++++++++++++++++++++++++
 4 files changed, 61 insertions(+), 157 deletions(-)
 delete mode 100644 capture.js
 delete mode 100644 combine.py
 create mode 100644 download_extra.py

diff --git a/README.md b/README.md
index 50b17d0..081676c 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 Project Euler Offline
 =====================
-All Project Euler problems, with MathJax and images, as a single PDF. Additional text files are provided. Get the releases [here](https://github.com/wxv/project-euler-offline/releases).
+All Project Euler problems, with MathJax and images, as a single PDF. Additional text files are provided. [Get the releases here.](https://github.com/wxv/project-euler-offline/releases)
 
 Please report any inaccuracies or give feedback. Thanks.
 
@@ -8,19 +8,20 @@ Inspired by [Kyle Keen's original Local Euler](http://kmkeen.com/local-euler/200
 
 Installation and Usage
 ----------------------
+
+Note: previously, PhantomJS was used to download each problem individually as a PDF, and PyPDF2 was used to combine all of the problems.
+
+Now, open https://projecteuler.net/show=all in Firefox and use "Print to File" (with headers and footers disabled in the print options). This is simpler, produces a smaller PDF, and does not rely on the discontinued PhantomJS. The Python script that downloads extra files is functionally unchanged.
+
 Requirements:
-- PhantomJS (`apt install phantomjs`)
-- Node modules system, webpage (`npm install system webpage`)
-- Python 3 and PyPDF2, BeautifulSoup, lxml, Pillow (`pip install beautifulsoup4 lxml pypdf2 pillow`)
+- Python 3 and BeautifulSoup, lxml, Pillow (`pip install beautifulsoup4 lxml pillow`)
 - BeautifulSoup and Pillow are only required for downloading extra text and images (animated GIF only).
 
-My usage process (replace 1 and 628 with whatever range you like):
+My usage process:
 
-    phantomjs capture.js 1 628
-    python3 combine.py 1 628
-    // Optional: download solutions from https://github.com/luckytoilet/projecteuler-solutions
+    mkdir render
+    # Save render/problems.pdf with Firefox as above
+    python3 download_extra.py
     cd render
-    zip problems problems.pdf *.txt *.gif
+    zip problems.zip problems.pdf *.txt *.gif
 
-Since each page is independent, it is possible to run multiple processes of
-`capture.js` at once, each downloading a certain range.
\ No newline at end of file
diff --git a/capture.js b/capture.js
deleted file mode 100644
index 6e50e2b..0000000
--- a/capture.js
+++ /dev/null
@@ -1,78 +0,0 @@
-"use strict";
-
-var renderDir = "render";
-
-
-var arrayOfUrls = [];
-var system = require('system');
-var problemIDStart = system.args[1];
-var problemIDEnd = system.args[2];
-
-for (var problemID = problemIDStart; problemID <= problemIDEnd; problemID++) {
-    arrayOfUrls.push("https://projecteuler.net/problem=" + problemID.toString());
-}
-
-/*
-Render given urls
-@param array of URLs to render
-@param callbackPerUrl Function called after finishing each URL, including the last URL
-@param callbackFinal Function called after finishing everything
-*/
-var RenderUrlsToFile = function(urls, callbackPerUrl, callbackFinal) {
-    var webpage = require("webpage");
-    var page = null;
-
-    var getFilename = function(url) {
-        return renderDir + '/' + url.split('=')[1] + ".pdf";
-    };
-    var next = function(status, url, file) {
-        page.close();
-        callbackPerUrl(status, url, file);
-        return retrieve();
-    };
-    var retrieve = function() {
-        if (urls.length > 0) {
-            var url = urls.shift();
-            page = webpage.create();
-            page.viewportSize = {
-                width: 1000,
-                height: 1000
-            };
-
-            page.paperSize = {
-                format: 'Letter',
-                margin: '50px'
-            };
-
-            page.settings.userAgent = "Project Euler Offline bot";
-
-            return page.open(url, function(status) {
-                var file = getFilename(url);
-                if (status === "success") {
-                    return window.setTimeout((function() {
-                        page.render(file);
-                        return next(status, url, file);
-                    }), 2000);
-                } else {
-                    return next(status, url, file);
-                }
-            });
-        } else {
-            return callbackFinal();
-        }
-    };
-    return retrieve();
-};
-
-
-
-RenderUrlsToFile(arrayOfUrls, (function(status, url, file) {
-    if (status !== "success") {
-        return console.log("Unable to render '" + url + "'");
-    } else {
-        return console.log("Rendered '" + url + "' at '" + file + "'");
-    }
-}), function() {
-    return phantom.exit();
-});
-
diff --git a/combine.py b/combine.py
deleted file mode 100644
index 8e0d373..0000000
--- a/combine.py
+++ /dev/null
@@ -1,68 +0,0 @@
-import sys
-from os import sep
-from PyPDF2 import PdfFileMerger
-
-RENDER_DIR = "render"
-DOWNLOAD_EXTRA_FLAG = True
-SITE_MAIN = "https://projecteuler.net/"
-
-
-def download_extra(url):
-    """Finds if available a .txt attachment or animated .gif and downloads it
-    to RENDER_DIR
-    """
-    # Not async for now to keep rate of requests low
-    from bs4 import BeautifulSoup
-    import requests
-    from os.path import basename
-    from PIL import Image
-    from io import BytesIO
-
-    print("Searching", url)
-    content = requests.get(url).content
-    soup = BeautifulSoup(content, "lxml")
-    for a in soup.find_all('a', href=True):
-        href = a["href"]
-        if href.endswith(".txt"):
-            print("Found and writing", href)
-            r = requests.get(SITE_MAIN + href)
-            with open(RENDER_DIR + sep + a.text, 'wb') as f:
-                f.write(r.content)
-
-    for img in soup.find_all("img"):
-        img_src = img["src"]
-        # Ignore spacer.gif (blank)
-        if img_src.endswith(".gif") and "spacer" not in img_src:
-            print("Found", img_src)
-            r = requests.get(SITE_MAIN + img_src)
-            # Only write animated GIFs
-            if Image.open(BytesIO(r.content)).is_animated:
-                print("Writing", img_src)
-                with open(RENDER_DIR + sep + basename(img_src), 'wb') as f:
-                    f.write(r.content)
-
-
-def main():
-    problem_id_start = int(sys.argv[1])
-    problem_id_end = int(sys.argv[2])
-
-    merger = PdfFileMerger()
-
-    for problem_id in range(problem_id_start, problem_id_end+1):
-        pdf_path = RENDER_DIR + sep + str(problem_id) + ".pdf"
-        merger.append(pdf_path)
-
-    merger.write(RENDER_DIR + sep + "problems.pdf")
-    print("Merged PDFs")
-
-    if DOWNLOAD_EXTRA_FLAG:
-        url_list = []
-        for problem_id in range(problem_id_start, problem_id_end+1):
-            url_list.append(SITE_MAIN + "problem=" + str(problem_id))
-
-        for url in url_list:
-            download_extra(url)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/download_extra.py b/download_extra.py
new file mode 100644
index 0000000..5c9d8db
--- /dev/null
+++ b/download_extra.py
@@ -0,0 +1,49 @@
+import sys
+from os import sep
+# Not async for now to keep rate of requests low
+from bs4 import BeautifulSoup
+import requests
+from os.path import basename
+from PIL import Image
+from io import BytesIO
+
+
+RENDER_DIR = "render"
+SITE_MAIN = "https://projecteuler.net/"
+
+
+def download_extra(url):
+    """Find any .txt attachments and animated .gifs on the page and
+    download them to RENDER_DIR.
+    """
+    content = requests.get(url).content
+    soup = BeautifulSoup(content, "lxml")
+    for a in soup.find_all('a', href=True):
+        href = a["href"]
+        if href.endswith(".txt"):
+            print("Writing", href)
+            r = requests.get(SITE_MAIN + href)
+            with open(RENDER_DIR + sep + basename(href), 'wb') as f:
+                f.write(r.content)
+
+    for img in soup.find_all("img"):
+        img_src = img["src"]
+
+        # Skip non-GIFs and spacer.gif
+        if not img_src.endswith(".gif") or img_src.endswith("spacer.gif"):
+            continue
+
+        r = requests.get(SITE_MAIN + img_src)
+        # Only write animated GIFs
+        if Image.open(BytesIO(r.content)).is_animated:
+            print("Writing", img_src)
+            with open(RENDER_DIR + sep + basename(img_src), 'wb') as f:
+                f.write(r.content)
+
+
+def main():
+    download_extra("https://projecteuler.net/show=all")
+
+
+if __name__ == "__main__":
+    main()
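
A note on the animated-GIF filter in download_extra.py: Pillow's GIF plugin exposes n_frames and is_animated, and a single-frame placeholder GIF reports is_animated == False, which is why it is never written out. A minimal sketch of that check (the file path is hypothetical; any GIF already saved under render/ works):

    from PIL import Image

    # Hypothetical path for illustration.
    im = Image.open("render/example.gif")
    # Pillow reports the frame count and animation status of a GIF;
    # download_extra.py keeps a GIF only when is_animated is True.
    print(im.format, im.n_frames, im.is_animated)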
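
The script builds absolute URLs with SITE_MAIN + href, which assumes every href and src on the page is site-relative. If the site ever served absolute links, urllib.parse.urljoin would handle both cases uniformly; this is a suggestion rather than part of the patch, and the resource path below is made up for illustration:

    from urllib.parse import urljoin

    SITE_MAIN = "https://projecteuler.net/"
    # Relative paths resolve against the base...
    print(urljoin(SITE_MAIN, "project/resources/names.txt"))
    # ...while absolute URLs pass through unchanged.
    print(urljoin(SITE_MAIN, "https://projecteuler.net/about"))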
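
The "Not async" comment keeps the request rate low simply by staying synchronous. If throttling ever needs to be explicit, one option is a small wrapper that sleeps between fetches; this sketch is an assumption, not part of the patch (polite_get is a made-up helper name, and the user-agent string is the one the old capture.js sent):

    import time
    import requests

    def polite_get(url, delay=1.0):
        """Fetch url, then pause so successive calls stay near one request per second."""
        r = requests.get(url, headers={"User-Agent": "Project Euler Offline bot"})
        time.sleep(delay)
        return r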