Features and QoL
If you choose to scrape all users, the script logic now does the following:

1. Scrape ALL users' links
2. Download ALL the users' links

[config]
Added auto_scrape_all.
Set it to true if you'd like to scrape all the names.

The script can now run automatically if you fill in auto_site_choice, auto_choice, and auto_scrape_all.

A 5-second delay has been implemented between each task so you don't get a 404.
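For readers who want the shape of the change without digging through the diff, here is a condensed, hedged sketch of the new per-site flow (the real loop in StartDatascraper.py below also handles name selection and timing; the exact placement of the 5-second pause is illustrative):

```python
import time

def run_site(scraper, session, names, site_name, app_token):
    """Condensed sketch of the two-phase flow introduced by this commit.

    `scraper` stands in for the imported site module (called `x` in
    StartDatascraper.py)."""
    download_list = []
    # Phase 1: scrape ALL users' links first.
    for name in names:
        result = scraper.start_datascraper(session, name, site_name, app_token)
        download_list.append(result)
    # Phase 2: download ALL the collected links.
    for result in download_list:
        for args in result[1]:  # result is [ok, prep_download]
            scraper.download_media(*args)
        time.sleep(5)  # pause between tasks so the site doesn't start returning 404s
```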
SecretShell committed Nov 7, 2019
1 parent ebfa317 commit 37f8c46
Showing 6 changed files with 180 additions and 130 deletions.
8 changes: 7 additions & 1 deletion README.md
@@ -89,7 +89,13 @@ auto_choice:

You can automatically choose what you want to scrape if you add it in the config file.

|**NEW**| export_type:
|**NEW**| auto_scrape_all:

Default = false

If set to true, the script will scrape all the names.

export_type:

Default = "json"

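The new flag defaults to false in every site's settings block (see the config.json diff below). Here is a hedged sketch of flipping it on for every site at once; since the full top-level layout of config.json is not shown in this commit, the blocks are located by their "settings" key rather than by hard-coded site names:

```python
import json

CONFIG_PATH = "config.json"  # assumed to sit next to the script

def iter_site_settings(node):
    """Yield every per-site 'settings' dict found anywhere in the config."""
    if isinstance(node, dict):
        if isinstance(node.get("settings"), dict):
            yield node["settings"]
        for value in node.values():
            yield from iter_site_settings(value)

with open(CONFIG_PATH, encoding="utf-8") as f:
    config = json.load(f)

for settings in iter_site_settings(config):
    settings["auto_scrape_all"] = True  # the flag added by this commit

with open(CONFIG_PATH, "w", encoding="utf-8") as f:
    json.dump(config, f, indent=4)
```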
20 changes: 16 additions & 4 deletions StartDatascraper.py
@@ -8,6 +8,7 @@
import timeit
import json
import logging
import time

# Configure logging to the console and file system at INFO level and above
logging.basicConfig(handlers=[logging.FileHandler('application.log', 'w', 'utf-8')], level=logging.INFO,
@@ -48,6 +49,8 @@
site_name = site_names[x]
json_auth = json_sites[site_name]["auth"]
json_site_settings = json_sites[site_name]["settings"]
auto_scrape_all = json_site_settings["auto_scrape_all"]
only_links = json_site_settings["auto_scrape_all"]
session = ""
x = ""
app_token = ""
@@ -77,20 +80,29 @@
names = array[0]
if names:
print("Names: "+array[1])
value = int(input().strip())
if not auto_scrape_all:
value = int(input().strip())
else:
value = 0
if value:
names = [names[value]]
else:
names.pop(0)
else:
print('Input a '+site_name+' '+session[1])
names = [input().strip()]
start_time = timeit.default_timer()
download_list = []
for name in names:
username = helpers.parse_links(site_name, name)
start_time = timeit.default_timer()
result = x.start_datascraper(
session[0], username, site_name, app_token)
stop_time = str(int(timeit.default_timer() - start_time) / 60)
print('Task Completed in ' + stop_time + ' Minutes')
download_list.append(result)
for y in download_list:
for arg in y[1]:
x.download_media(*arg)
stop_time = str(int(timeit.default_timer() - start_time) / 60)
print('Task Completed in ' + stop_time + ' Minutes')
time.sleep(5)
except KeyboardInterrupt as e:
print("Exiting Script")
3 changes: 3 additions & 0 deletions config.json
@@ -14,6 +14,7 @@
},
"settings": {
"auto_choice": "",
"auto_scrape_all": false,
"directory": "",
"file_name_format": "{file_name}.{ext}",
"text_length": "",
@@ -29,6 +30,7 @@
},
"settings": {
"auto_choice": "",
"auto_scrape_all": false,
"directory": "",
"file_name_format": "{file_name}.{ext}",
"text_length": "",
@@ -41,6 +43,7 @@
"auth": {},
"settings": {
"auto_choice": "",
"auto_scrape_all": false,
"directory": "",
"file_name_format": "{file_name}.{ext}",
"text_length": "",
141 changes: 76 additions & 65 deletions modules/four_chan.py
@@ -40,6 +40,7 @@


def start_datascraper(session, board_name, site_name, link_type=None):
print("Scrape Processing")
user_id = link_check(session, board_name)
if not user_id[0]:
print(user_id[1])
@@ -69,14 +70,14 @@ def start_datascraper(session, board_name, site_name, link_type=None):
threads = pool.starmap(thread_scraper,
product(threads, [board_name], [session], [directory]))
threads = [x for x in threads if x is not None]
print("Filtered Count: "+str(len(threads)))
post_count = len(threads)
print("Valid Count: "+str(post_count))
print("Downloading Media")
results = pool.starmap(download_media,
product(threads, [session], [directory], [board_name]))
count_results = str(len([x for x in threads if x is None]))
print("Valid Count: "+count_results)
print("Invalid Count: "+count_results)
prep_download = [[threads, session, directory, board_name]]
# When profile is done scraping, this function will return True
return [True, link_array]
return [True, prep_download]


def link_check(session, username):
Expand Down Expand Up @@ -173,69 +174,79 @@ def thread_scraper(thread_id, board_name, session, directory):
return thread


def download_media(thread, session, directory, board_name):
try:
directory = thread["download_path"]+"/"
valid = False
name_key = "filename"
for post in thread["posts"]:
if name_key in post:
post["tim"] = str(post["tim"])
post[name_key] = re.sub(
r'[\\/*?:"<>|]', '', post[name_key])
ext = post["ext"].replace(".", "")
filename = post["tim"]+"."+ext
link = "http://i.4cdn.org/" + board_name + "/" + filename
filename = post[name_key]+"."+ext
download_path = directory+filename
count_string = len(download_path)
if count_string > maximum_length:
num_sum = count_string - maximum_length
name_key = "tim"
download_path = directory+post[name_key]+"."+ext
def download_media(media_set, session, directory, board_name):
def download(thread, session, directory):
try:
directory = thread["download_path"]+"/"
valid = False
name_key = "filename"
for post in thread["posts"]:
if name_key in post:
post["tim"] = str(post["tim"])
post[name_key] = re.sub(
r'[\\/*?:"<>|]', '', post[name_key])
ext = post["ext"].replace(".", "")
filename = post["tim"]+"."+ext
link = "http://i.4cdn.org/" + board_name + "/" + filename
filename = post[name_key]+"."+ext
download_path = directory+filename
count_string = len(download_path)
if count_string > maximum_length:
num_sum = count_string - maximum_length
name_key = "tim"
download_path = directory+post[name_key]+"."+ext

if not overwrite_files:
count = 1
found = False
og_filename = post[name_key]
while True:
if os.path.isfile(download_path):
remote_size = post["fsize"]
local_size = os.path.getsize(download_path)
if remote_size == local_size:
found = True
break
if not overwrite_files:
count = 1
found = False
og_filename = post[name_key]
while True:
if os.path.isfile(download_path):
remote_size = post["fsize"]
local_size = os.path.getsize(download_path)
if remote_size == local_size:
found = True
break
else:
download_path = directory+og_filename + \
" ("+str(count)+")."+ext
count += 1
continue
else:
download_path = directory+og_filename + \
" ("+str(count)+")."+ext
count += 1
continue
else:
found = False
break
if found:
continue
r = session.get(link, stream=True)
if r.status_code != 404:
if not os.path.exists(os.path.dirname(download_path)):
os.makedirs(os.path.dirname(download_path))
with open(download_path, 'wb') as f:
for chunk in r.iter_content(chunk_size=1024):
if chunk: # filter out keep-alive new chunks
f.write(chunk)
logger.info("Link: {}".format(link))
logger.info("Path: {}".format(download_path))
valid = True
if valid:
os.makedirs(directory, exist_ok=True)
with open(directory+'archive.json', 'w') as outfile:
json.dump(thread, outfile)
return thread
else:
found = False
break
if found:
continue
r = session.get(link, stream=True)
if r.status_code != 404:
if not os.path.exists(os.path.dirname(download_path)):
os.makedirs(os.path.dirname(download_path))
with open(download_path, 'wb') as f:
for chunk in r.iter_content(chunk_size=1024):
if chunk: # filter out keep-alive new chunks
f.write(chunk)
logger.info("Link: {}".format(link))
logger.info("Path: {}".format(download_path))
valid = True
if valid:
os.makedirs(directory, exist_ok=True)
with open(directory+'archive.json', 'w') as outfile:
json.dump(thread, outfile)
return thread
else:
return
except Exception as e:
print("ERROR", e, directory)
return
except Exception as e:
print("ERROR", e, directory)
return
print("Download Processing")
print("Name: "+board_name)
print("Directory: " + directory)
# print("Downloading "+post_count+" "+location)
if multithreading:
pool = ThreadPool(max_threads)
else:
pool = ThreadPool(1)
pool.starmap(download, product(media_set, [session], [directory]))


def create_session():
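The restructured four_chan.py download loop keeps its existing skip-or-rename behavior, which is easy to lose in the diff: a file with the same size as the remote fsize is skipped, name collisions get a numbered suffix, and over-long paths fall back to the post's numeric tim id. A hedged, self-contained sketch of that logic (255 is only a placeholder for the module's maximum_length setting):

```python
import os
from typing import Optional

MAXIMUM_LENGTH = 255  # placeholder for the module's maximum_length setting

def resolve_download_path(directory: str, filename: str, tim: str, ext: str,
                          remote_size: int) -> Optional[str]:
    """Return a path to write to, or None if an identical file already exists."""
    path = directory + filename + "." + ext
    if len(path) > MAXIMUM_LENGTH:
        # Over-long name: fall back to the post's numeric `tim` id.
        path = directory + tim + "." + ext
    base = path[: -(len(ext) + 1)]  # strip the "." + ext suffix
    count = 1
    while os.path.isfile(path):
        if os.path.getsize(path) == remote_size:  # same size as the remote fsize
            return None                           # already downloaded, skip it
        path = base + " (" + str(count) + ")." + ext
        count += 1
    return path
```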
67 changes: 39 additions & 28 deletions modules/justforfans.py
@@ -38,6 +38,7 @@


def start_datascraper(session, username, site_name, app_token=None):
print("Scrape Processing")
user_id = link_check(session, username)
if not user_id[0]:
print(user_id[1])
@@ -47,23 +48,23 @@ def start_datascraper(session, username, site_name, app_token=None):
post_count = user_id[2]
array = scrape_choice(username, post_count)
link_array = {}
prep_download = []
for item in array:
item[1].append(username)
only_links = item[1][4]
post_count = str(item[1][5])
item[1].pop(3)
response = media_scraper(session, site_name, only_links, *item[1])
link_array[item[1][1].lower()] = response[0]
if not only_links:
media_set = response[0]
if not media_set["valid"]:
continue
directory = response[1]
if multithreading:
pool = ThreadPool(max_threads)
else:
pool = ThreadPool(1)
pool.starmap(download_media, product(
media_set["valid"], [session], [directory], [username]))
location = item[1][1]
prep_download.append([media_set["valid"], session, directory, username, post_count, location])
# When profile is done scraping, this function will return True
return [True, link_array]
return [True, prep_download]


def link_check(session, username):
@@ -259,28 +260,38 @@ def media_scraper(session, site_name, only_links, link, location, media_type, di
return [media_set, directory]


def download_media(media, session, directory, username):
while True:
link = media["link"]
r = session.head(link)
def download_media(media_set, session, directory, username, post_count, location):
def download(media, session, directory, username):
while True:
link = media["link"]
r = session.head(link)

date_object = datetime.strptime(media["postedAt"], "%d-%m-%Y %H:%M:%S")
directory = media["directory"]+media["filename"]
timestamp = date_object.timestamp()
if not overwrite_files:
if os.path.isfile(directory):
return
if not os.path.exists(os.path.dirname(directory)):
os.makedirs(os.path.dirname(directory))
r = session.get(link, allow_redirects=True, stream=True)
with open(directory, 'wb') as f:
for chunk in r.iter_content(chunk_size=1024):
if chunk: # filter out keep-alive new chunks
f.write(chunk)
format_image(directory, timestamp)
logger.info("Link: {}".format(link))
logger.info("Path: {}".format(directory))
return True
date_object = datetime.strptime(media["postedAt"], "%d-%m-%Y %H:%M:%S")
directory = media["directory"]+media["filename"]
timestamp = date_object.timestamp()
if not overwrite_files:
if os.path.isfile(directory):
return
if not os.path.exists(os.path.dirname(directory)):
os.makedirs(os.path.dirname(directory))
r = session.get(link, allow_redirects=True, stream=True)
with open(directory, 'wb') as f:
for chunk in r.iter_content(chunk_size=1024):
if chunk: # filter out keep-alive new chunks
f.write(chunk)
format_image(directory, timestamp)
logger.info("Link: {}".format(link))
logger.info("Path: {}".format(directory))
return True
print("Download Processing")
print("Name: "+username)
print("Directory: " + directory)
print("Downloading "+post_count+" "+location)
if multithreading:
pool = ThreadPool(max_threads)
else:
pool = ThreadPool(1)
pool.starmap(download, product(media_set, [session], [directory], [username]))


def create_session(user_agent, phpsessid, user_hash2):
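justforfans.py keeps stamping each downloaded file with its postedAt date via the project's format_image() helper. Assuming that helper (at least) sets the file's modification time, which is not shown in this diff, the equivalent minimal step would look like this; the date format string is the one used above, and the example value is made up:

```python
import os
from datetime import datetime

def stamp_posted_at(path: str, posted_at: str) -> None:
    """Set a downloaded file's timestamps from its 'postedAt' string."""
    ts = datetime.strptime(posted_at, "%d-%m-%Y %H:%M:%S").timestamp()
    os.utime(path, (ts, ts))  # access time and modification time

# e.g. stamp_posted_at("photo.jpg", "07-11-2019 18:30:00")
```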