From 69d83b4db57be311d97a4c89b29fb8bc0ccb93f5 Mon Sep 17 00:00:00 2001
From: CRIMINAL
Date: Sun, 20 Oct 2019 19:23:39 +0100
Subject: [PATCH] Fixes and Features update

Fixes:
If your OS is not Windows, the script no longer calls any Windows-only functions.
The script now grabs all media unless it is locked behind a paywall or duplicated.
Any corrupt media is exported to archive.json.
All media now downloads to its proper folder. (No more images in the videos folder and vice versa.)

Features:
metadata/archive.json now contains valid and invalid posts. (This replaces links.json.)

Config Update:
ignored_keywords - Any post containing one of these words will be ignored.
text_length - Sets a maximum length for {text} when it is used in file_name_format.
boards - Enter any boards you'd like to scrape automatically.
---
 README.md | 22 ++++
 Start Datascraper.py | 18 ++-
 config.json | 12 +-
 modules/four_chan.py | 156 +++++++++++++++++++-----
 modules/helpers.py | 46 +++++--
 modules/justforfans.py | 266 ++++++++++++++++++++---------------------
 modules/onlyfans.py | 151 ++++++++++++-----------
 7 files changed, 417 insertions(+), 254 deletions(-)

diff --git a/README.md b/README.md
index 60b5897e6..aeeb78d03 100644
--- a/README.md
+++ b/README.md
@@ -59,6 +59,14 @@ file_name_format:
     Example: {date}/{text}-{file_name}.{ext}
     Warning: It's important to keep a unique identifier next to .{ext}. By default it's {file_name}, but it can be {date}-{text}.ext
 
+text_length:
+
+    Default = ""
+    Ideal = "50"
+    Max = "259"
+
+    When you use {text} in file_name_format, enter a number here to limit how many characters of the post text are used.
+
 auto_site_choice:
 
     Default = ""
@@ -101,6 +109,20 @@ multithreading:
 
     If set to false, you will download files 1 by 1. (If you don't have fast internet, may god help you.) I'd recommend leaving it set to true.
 
+boards:
+
+    Default = []
+    Example = ["s", "gif"]
+
+    Enter the names of any boards you want to scrape automatically.
+
+ignored_keywords:
+
+    Default = []
+    Example = ["ignore", "me"]
+
+    The script will ignore any content that contains one of these words.
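To make these settings concrete, here is a small illustrative sketch of how ignored_keywords and text_length are meant to behave. This is not code from the modules; the helper names below are made up, and the values are the example values shown above.

    # Example values from the README entries above.
    ignored_keywords = ["ignore", "me"]
    text_length = 50

    def should_skip(post_text):
        # A post is skipped when its lower-cased text contains any ignored keyword.
        lowered = post_text.lower()
        return any(keyword in lowered for keyword in ignored_keywords)

    def shorten_text(post_text):
        # {text} is cut to at most text_length characters before it is placed
        # into file_name_format.
        return post_text[:text_length]

    print(should_skip("Please IGNORE this post"))   # True
    print(len(shorten_text("a" * 100)))             # 50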
+ # OPTIONAL ARGUMENTS diff --git a/Start Datascraper.py b/Start Datascraper.py index a0c8219ec..3aba539d7 100644 --- a/Start Datascraper.py +++ b/Start Datascraper.py @@ -34,6 +34,7 @@ x = int(input()) site_name = site_names[x] json_auth = json_sites[site_name]["auth"] + json_site_settings = json_sites[site_name]["settings"] session = "" x = "" app_token = "" @@ -43,22 +44,27 @@ auth_hash = json_auth['auth_hash'] x = onlyfans session = x.create_session(user_agent, auth_id, auth_hash, app_token) + array = [] elif site_name == "justforfans": auth_id = json_auth['phpsessid'] auth_hash = json_auth['user_hash2'] x = justforfans session = x.create_session(user_agent, auth_id, auth_hash) + array = [] elif site_name == "4chan": x = four_chan session = x.create_session() + array = json_site_settings["boards"] if not session[0]: continue print('Input a '+site_name+' '+session[1]) - input_link = input().strip() - username = helpers.parse_links(site_name, input_link) - start_time = timeit.default_timer() session = session[0] - result = x.start_datascraper(session, username, site_name, app_token) - stop_time = str(int(timeit.default_timer() - start_time) / 60) - print('Task Completed in ' + stop_time + ' Minutes') + if not array: + array = [input().strip()] + for input_link in array: + username = helpers.parse_links(site_name, input_link) + start_time = timeit.default_timer() + result = x.start_datascraper(session, username, site_name, app_token) + stop_time = str(int(timeit.default_timer() - start_time) / 60) + print('Task Completed in ' + stop_time + ' Minutes') diff --git a/config.json b/config.json index b650f39bb..66fc95522 100644 --- a/config.json +++ b/config.json @@ -15,8 +15,10 @@ "settings": { "directory": "", "file_name_format": "{file_name}.{ext}", + "text_length": "", "overwrite_files": true, - "date_format": "%d-%m-%Y" + "date_format": "%d-%m-%Y", + "ignored_keywords": [] } }, "justforfans": { @@ -27,8 +29,10 @@ "settings": { "directory": "", "file_name_format": "{file_name}.{ext}", + "text_length": "", "overwrite_files": true, - "date_format": "%d-%m-%Y" + "date_format": "%d-%m-%Y", + "ignored_keywords": [] } }, "4chan": { @@ -36,9 +40,11 @@ "settings": { "directory": "", "file_name_format": "{file_name}.{ext}", + "text_length": "", "overwrite_files": false, "date_format": "%d-%m-%Y", - "ignore_thread_titles": [""] + "boards": [], + "ignored_keywords": [] } } diff --git a/modules/four_chan.py b/modules/four_chan.py index becf579ae..016f8c351 100644 --- a/modules/four_chan.py +++ b/modules/four_chan.py @@ -27,6 +27,12 @@ format_path = json_settings['file_name_format'] overwrite_files = json_settings["overwrite_files"] date_format = json_settings["date_format"] +ignored_keywords = json_settings["ignored_keywords"] +maximum_length = 240 +text_length = int(json_settings["text_length"] + ) if json_settings["text_length"] else maximum_length +if text_length > maximum_length: + text_length = maximum_length max_threads = multiprocessing.cpu_count() @@ -41,6 +47,7 @@ def start_datascraper(session, board_name, site_name, link_type=None): print(user_id[1]) print("First time? 
Did you forget to edit your config.json file?") return [False] + print("Board: " + board_name) array = scrape_choice(board_name) link_array = {} if multithreading: @@ -50,9 +57,7 @@ def start_datascraper(session, board_name, site_name, link_type=None): threads = board_scraper(session, array[0], "") archive_threads = board_scraper(session, array[1], "archive") threads = threads + archive_threads - print("Scraping Threads") - threads = pool.starmap(thread_scraper, - product(threads, [board_name], [session])) + print("Original Count: "+str(len(threads))) directory = j_directory directory += "/"+site_name + "/" + board_name + "/" if "/sites/" == j_directory: @@ -62,10 +67,16 @@ def start_datascraper(session, board_name, site_name, link_type=None): else: directory = directory + print("Scraping Threads") + threads = pool.starmap(thread_scraper, + product(threads, [board_name], [session], [directory])) + threads = [x for x in threads if x is not None] + print("Filtered Count: "+str(len(threads))) print("Downloading Media") - pool.starmap(download_media, - product(threads, [session], [directory], [board_name])) - + results = pool.starmap(download_media, + product(threads, [session], [directory], [board_name])) + count_results = str(len([x for x in threads if x is None])) + print("Valid Count: "+count_results) # When profile is done scraping, this function will return True return [True, link_array] @@ -103,49 +114,128 @@ def board_scraper(session, link, category): return threads -def thread_scraper(thread_id, board_name, session): - link = "http://a.4cdn.org/" + board_name + "/thread/" + str( - thread_id) + ".json" +def thread_scraper(thread_id, board_name, session, directory): + thread_id = str(thread_id) + link = "http://a.4cdn.org/" + board_name + "/thread/" + thread_id + ".json" r = session.get(link) - y = json.loads(r.text) - return y - + if r.status_code == 404: + return + try: + thread = json.loads(r.text) + thread_master = thread["posts"][0] + except Exception as e: + print(e, link) + return + if "archived" in thread_master: + location = "Archive" + else: + location = "Catalog" -def download_media(thread, session, directory, board_name): - thread_master = thread["posts"][0] - thread_id = str(thread_master["no"]) + if "sub" in thread_master: + title = thread_master["sub"].lower() + if any(ignored_keyword in title for ignored_keyword in ignored_keywords): + print("Removed From "+location+": ", title) + return + + if "com" in thread_master: + title = thread_master["com"].lower() + if any(ignored_keyword in title for ignored_keyword in ignored_keywords): + print("Removed From "+location+": ", title) + return text = "" if "sub" in thread_master: - text = thread_master["sub"] + text = thread_master["sub"][:text_length] else: - if "com" in thread_master: - text = thread_master["com"] + text = thread_master["com"][:text_length] text = BeautifulSoup(text, 'html.parser').get_text().replace( "\n", " ").strip() text = re.sub(r'[\\/*?:"<>|]', '', text) + thread["download_path"] = "" for post in thread["posts"]: + if "name" not in post: + post["name"] = "Anonymous" if "filename" in post: filename = str(post["tim"]) ext = post["ext"].replace(".", "") - link = "http://i.4cdn.org/" + board_name + "/" + filename+"."+ext + filename = post["filename"] new_directory = directory+"/"+text+" - "+thread_id+"/" if not text: new_directory = new_directory.replace(" - ", "") + date_object = datetime.fromtimestamp(post["time"]) - new_directory = reformat(new_directory, filename, text, ext, date_object, post["name"], 
format_path, - date_format) - if not overwrite_files: - if os.path.isfile(new_directory): - continue - r = session.get(link, stream=True) - if r.status_code != 404: - if not os.path.exists(os.path.dirname(new_directory)): - os.makedirs(os.path.dirname(new_directory)) - with open(new_directory, 'wb') as f: - for chunk in r.iter_content(chunk_size=1024): - if chunk: # filter out keep-alive new chunks - f.write(chunk) - print(link, new_directory) + og_filename = filename + download_path = os.path.dirname(reformat( + new_directory, filename, text, ext, date_object, post["name"], format_path, date_format, text_length, maximum_length)) + size = len(download_path) + size2 = len(thread["download_path"]) + if thread["download_path"]: + if len(download_path) < len(thread["download_path"]): + thread["download_path"] = download_path + else: + thread["download_path"] = download_path + return thread + + +def download_media(thread, session, directory, board_name): + try: + directory = thread["download_path"]+"/" + valid = False + for post in thread["posts"]: + if "filename" in post: + post["filename"] = re.sub( + r'[\\/*?:"<>|]', '', post["filename"]) + ext = post["ext"].replace(".", "") + filename = str(post["tim"])+"."+ext + link = "http://i.4cdn.org/" + board_name + "/" + filename + filename = post["filename"]+"."+ext + download_path = directory+filename + count_string = len(download_path) + if count_string > 259: + num_sum = count_string - 259 + post["filename"] = post["filename"][:50] + download_path = directory+post["filename"]+"."+ext + + if not overwrite_files: + count = 1 + found = False + og_filename = post["filename"] + while True: + if os.path.isfile(download_path): + remote_size = post["fsize"] + local_size = os.path.getsize(download_path) + if remote_size == local_size: + found = True + break + else: + download_path = directory+og_filename + \ + " ("+str(count)+")."+ext + count += 1 + continue + else: + found = False + break + if found: + continue + r = session.get(link, stream=True) + if r.status_code != 404: + if not os.path.exists(os.path.dirname(download_path)): + os.makedirs(os.path.dirname(download_path)) + with open(download_path, 'wb') as f: + for chunk in r.iter_content(chunk_size=1024): + if chunk: # filter out keep-alive new chunks + f.write(chunk) + print(download_path) + valid = True + if valid: + os.makedirs(directory, exist_ok=True) + with open(directory+'archive.json', 'w') as outfile: + json.dump(thread, outfile) + return thread + else: + return + except Exception as e: + print("ERROR", e, directory) + return def create_session(): diff --git a/modules/helpers.py b/modules/helpers.py index 4630303b1..280421f65 100644 --- a/modules/helpers.py +++ b/modules/helpers.py @@ -1,5 +1,6 @@ -from bs4 import BeautifulSoup import re +import os +from bs4 import BeautifulSoup def parse_links(site_name, input_link): @@ -19,21 +20,46 @@ def parse_links(site_name, input_link): return input_link -def reformat(directory, file_name, text, ext, date, username, format_path, date_format): +def reformat(directory, file_name, text, ext, date, username, format_path, date_format, text_length, maximum_length): path = format_path.replace("{username}", username) - text = BeautifulSoup(text, 'html.parser').get_text().replace("\n", " ").strip() + text = BeautifulSoup(text, 'html.parser').get_text().replace( + "\n", " ").strip() filtered_text = re.sub(r'[\\/*?:"<>|]', '', text) path = path.replace("{text}", filtered_text) date = date.strftime(date_format) path = path.replace("{date}", date) path = 
path.replace("{file_name}", file_name) path = path.replace("{ext}", ext) - directory += path - count_string = len(directory) - if count_string > 259: - num_sum = count_string - 259 - directory = directory.replace(filtered_text, filtered_text[:-num_sum]) - return directory - + directory2 = directory + path + count_string = len(directory2) + if count_string > maximum_length: + num_sum = count_string - maximum_length + directory2 = directory2.replace( + filtered_text, filtered_text[:text_length]) + count_string = len(directory2) + if count_string > maximum_length: + num_sum = count_string - maximum_length + directory2 = directory2.replace( + filtered_text, filtered_text[:-num_sum]) + count_string = len(directory2) + if count_string > maximum_length: + directory2 = directory + count_string = len(directory2) + if count_string > maximum_length: + num_sum = count_string - maximum_length + directory2 = directory2.replace( + filtered_text, filtered_text[:50]) + count_string = len(directory2) + if count_string > maximum_length: + directory2 = directory + return directory2 +def format_media_set(media_set): + x = {} + x["valid"] = [] + x["invalid"] = [] + for y in media_set: + x["valid"].extend(y[0]) + x["invalid"].extend(y[1]) + return x diff --git a/modules/justforfans.py b/modules/justforfans.py index c63d3e41f..506dd65ae 100644 --- a/modules/justforfans.py +++ b/modules/justforfans.py @@ -1,6 +1,8 @@ import requests from bs4 import BeautifulSoup from win32_setctime import setctime +from modules.helpers import reformat +from modules.helpers import format_media_set import os import json @@ -21,11 +23,17 @@ json_global_settings = json_config["settings"] auto_choice = json_global_settings["auto_choice"] multithreading = json_global_settings["multithreading"] -json_settings = json_config["supported"]["justforfans"]["settings"] +json_settings = json_config["supported"]["onlyfans"]["settings"] j_directory = json_settings['directory'] + "/sites/" format_path = json_settings['file_name_format'] overwrite_files = json_settings["overwrite_files"] date_format = json_settings["date_format"] +ignored_keywords = json_settings["ignored_keywords"] +maximum_length = 240 +text_length = int(json_settings["text_length"] + ) if json_settings["text_length"] else maximum_length +if text_length > maximum_length: + text_length = maximum_length max_threads = multiprocessing.cpu_count() @@ -41,14 +49,14 @@ def start_datascraper(session, username, site_name, app_token=None): print("First time? 
Did you forget to edit your config.json file?") return [False] - post_count = int(user_id[2]) + post_count = user_id[2] array = scrape_choice(username, post_count) link_array = {} for item in array: item[1].append(username) - only_links = item[1][3] + only_links = item[1][4] item[1].pop(3) - response = media_scraper(session, site_name, *item[1]) + response = media_scraper(session, site_name, only_links, *item[1]) link_array[item[1][1].lower()] = response[0] if not only_links: media_set = response[0] @@ -57,10 +65,8 @@ def start_datascraper(session, username, site_name, app_token=None): pool = ThreadPool(max_threads) else: pool = ThreadPool(1) - pool.starmap( - download_media, - product(media_set, [session], [directory], [username])) - + pool.starmap(download_media, product( + media_set["valid"], [session], [directory], [username])) # When profile is done scraping, this function will return True return [True, link_array] @@ -78,8 +84,11 @@ def link_check(session, username): else: temp_user_id2[0] = True temp_user_id2[1] = str(username) - temp_user_id2[2] = BeautifulSoup(r.text, 'html.parser').find("div", {"class": "profile-info-value"}).find("h3")\ - .get_text() + photo_count = int(BeautifulSoup(r.text, 'html.parser').findAll( + "div", {"class": "profile-info-value"})[2].find("h3").get_text()) + video_count = int(BeautifulSoup(r.text, 'html.parser').findAll( + "div", {"class": "profile-info-value"})[1].find("h3").get_text()) + temp_user_id2[2] = [photo_count, video_count] return temp_user_id2 @@ -91,22 +100,18 @@ def scrape_choice(username, post_count): print( 'Optional Arguments: -l = Only scrape links -()- Example: "a -l"') input_choice = input().strip() - image_api = "https://justfor.fans/" + username + "?tab=photos&PhotoTabPage=0&VideoTabPage=9999" - video_api = "https://justfor.fans/" + username + "?tab=videos&PhotoTabPage=9999&VideoTabPage=0" + image_api = "https://justfor.fans/" + username + "?tab=photos&PhotoTabPage=0" + video_api = "https://justfor.fans/" + username + "?tab=videos&VideoTabPage=0" # ARGUMENTS only_links = False if "-l" in input_choice: only_links = True input_choice = input_choice.replace(" -l", "") - mandatory = [j_directory, only_links, post_count] - i_array = [ - "You have chosen to scrape images", [image_api, 'Images', *mandatory], - 'Images Completed' - ] - v_array = [ - "You have chosen to scrape videos", [video_api, 'Videos', *mandatory], - 'Videos Completed' - ] + mandatory = [j_directory, only_links] + i_array = ["You have chosen to scrape images", [ + image_api, 'Images', "photo", *mandatory, post_count[0]], 'Images Completed'] + v_array = ["You have chosen to scrape videos", [ + video_api, 'Videos', "video", *mandatory, post_count[1]], 'Videos Completed'] array = [i_array] + [v_array] valid_input = False if input_choice == "a": @@ -124,128 +129,120 @@ def scrape_choice(username, post_count): return False -def scrape_array(link, session): - media_set = [] - utc_offset_timedelta = datetime.utcnow() - datetime.now() +def scrape_array(link, session, media_type): + media_set = [[], []] + master_date = "00-00-0000" r = session.get(link) - i_items = BeautifulSoup(r.text, - 'html.parser').find("ul", { - "class": "grid" - }).findAll("li", { - "class": None, - "style": None - }) - v_items = BeautifulSoup(r.text, 'html.parser').findAll( - "div", {"class": "variableVideoLI"}) - for x in i_items: - if x.find('figure').find('a') is not None: - img_src = x.find('figure').find('a').find('img')['src'] - check = img_src[:5] - if check == u"media": - img_url = 
"https://justfor.fans/" + img_src - try: - data_src = x.find('figure').find('a').find('img')['data-src'] - check = data_src[:5] + if r.status_code == 404: + return + utc_offset_timedelta = datetime.utcnow() - datetime.now() + if "photo" == media_type: + i_items = BeautifulSoup(r.text, + 'html.parser').find("ul", { + "class": "grid" + }).findAll("li", { + "class": None, + "style": None + }) + for x in i_items: + if x.find('figure').find('a') is not None: + img_src = x.find('figure').find('a').find('img')['src'] + check = img_src[:5] if check == u"media": - img_url = "https://justfor.fans/" + data_src - except KeyError: - pass - file = img_url - new_dict = dict() - new_dict["post_id"] = "https://justfor.fans/" + x.find( - 'figure').find('a')['href'] - new_dict["link"] = file - post_page = session.get(new_dict["post_id"]).text - new_dict["post_id"] = new_dict["post_id"].rsplit('=')[-1] - postdate = BeautifulSoup(post_page, 'html.parser').find("div", {"class": "timeline-item-header"}).\ - find('small').find('a').get_text().strip('\n') - local_datetime = datetime.strptime(postdate, "%B %d, %Y, %I:%M %p") - result_utc_datetime = local_datetime + utc_offset_timedelta - dt = result_utc_datetime.strftime("%d-%m-%Y %H:%M:%S") - post_text = BeautifulSoup(post_page, 'html.parser').find("div", {"class": "timeline-item-post"}).\ - find("div", {"class": "fr-view"}).get_text() - new_dict["text"] = re.sub(r'(\t[ ]+)', '', - post_text).replace('\n\t', '') - new_dict["postedAt"] = dt - media_set.append(new_dict) - for x in v_items: - if x.findAll('div') is not None: - file = x.find( - 'div', - id=lambda y: y and y.startswith('videopage')).find('a')['href'] - file = re.search(r"(https:\/\/autograph\.xvid\.com.+?)(?=')", - file)[0].replace('&', '&') - new_dict = dict() - new_dict["post_id"] = "https://justfor.fans/" + x.findAll( - 'a')[-1]['href'] - new_dict["link"] = file - post_page = session.get(new_dict["post_id"]).text - new_dict["post_id"] = new_dict["post_id"].rsplit('=')[-1] - postdate = BeautifulSoup(post_page, 'html.parser').find("div", {"class": "timeline-item-header"}).\ - find('small').find('a').get_text().strip('\n') - local_datetime = datetime.strptime(postdate, "%B %d, %Y, %I:%M %p") - result_utc_datetime = local_datetime + utc_offset_timedelta - dt = result_utc_datetime.strftime("%d-%m-%Y %H:%M:%S") - post_text = BeautifulSoup(post_page, 'html.parser').find( - "div", { - "class": "timeline-item-post" - }).find("div", { - "class": "fr-view" - }).get_text() - new_dict["text"] = re.sub(r'(\t[ ]*)', '', - post_text).replace('\n\t', '') - new_dict["postedAt"] = dt - media_set.append(new_dict) + img_url = "https://justfor.fans/" + img_src + try: + data_src = x.find('figure').find( + 'a').find('img')['data-src'] + check = data_src[:5] + if check == u"media": + img_url = "https://justfor.fans/" + data_src + except KeyError: + pass + file = img_url + new_dict = dict() + new_dict["post_id"] = "https://justfor.fans/" + x.find( + 'figure').find('a')['href'] + new_dict["link"] = file + post_page = session.get(new_dict["post_id"]).text + new_dict["post_id"] = new_dict["post_id"].rsplit('=')[-1] + postdate = BeautifulSoup(post_page, 'html.parser').find("div", {"class": "timeline-item-header"}).\ + find('small').find('a').get_text().strip('\n') + local_datetime = datetime.strptime( + postdate, "%B %d, %Y, %I:%M %p") + result_utc_datetime = local_datetime + utc_offset_timedelta + dt = result_utc_datetime.strftime("%d-%m-%Y %H:%M:%S") + post_text = BeautifulSoup(post_page, 'html.parser').find("div", {"class": 
"timeline-item-post"}).\ + find("div", {"class": "fr-view"}).get_text() + new_dict["text"] = re.sub(r'(\t[ ]+)', '', + post_text).replace('\n\t', '') + new_dict["postedAt"] = dt + media_set[0].append(new_dict) + elif "video" == media_type: + v_items = BeautifulSoup(r.text, 'html.parser').findAll( + "div", {"class": "variableVideoLI"}) + for x in v_items: + if x.findAll('div') is not None: + file = x.find( + 'div', + id=lambda y: y and y.startswith('videopage')).find('a')['href'] + file = re.search(r"(https:\/\/autograph\.xvid\.com.+?)(?=')", + file)[0].replace('&', '&') + new_dict = dict() + new_dict["post_id"] = "https://justfor.fans/" + x.findAll( + 'a')[-1]['href'] + new_dict["link"] = file + post_page = session.get(new_dict["post_id"]).text + new_dict["post_id"] = new_dict["post_id"].rsplit('=')[-1] + postdate = BeautifulSoup(post_page, 'html.parser').find("div", {"class": "timeline-item-header"}).\ + find('small').find('a').get_text().strip('\n') + local_datetime = datetime.strptime( + postdate, "%B %d, %Y, %I:%M %p") + result_utc_datetime = local_datetime + utc_offset_timedelta + dt = result_utc_datetime.strftime("%d-%m-%Y %H:%M:%S") + post_text = BeautifulSoup(post_page, 'html.parser').find( + "div", { + "class": "timeline-item-post" + }).find("div", { + "class": "fr-view" + }).get_text() + new_dict["text"] = re.sub(r'(\t[ ]*)', '', + post_text).replace('\n\t', '') + new_dict["postedAt"] = dt + media_set[0].append(new_dict) return media_set -def media_scraper(session, site_name, link, location, directory, post_count, username): +def media_scraper(session, site_name, only_links, link, location, media_type, directory, post_count, username): print("Scraping " + location + ". May take a few minutes.") pool = ThreadPool(max_threads) - i = 0 + ceil = math.ceil(post_count / 100) + a = list(range(ceil)) offset_array = [] - iter_link = link - page = session.get(iter_link) - items = BeautifulSoup(page.text, - 'html.parser').find("ul", { - "class": "grid" - }).findAll("li", { - "class": None, - "style": None - }) - items = items + BeautifulSoup(page.text, 'html.parser').findAll( - "div", {"class": "variableVideoLI"}) - while len(items) > 0: - offset_array.append(iter_link) - i += 1 - iter_link = link.replace("Page=0", "Page=" + str(i)) - page = session.get(iter_link) - items = BeautifulSoup(page.text, - 'html.parser').find("ul", { - "class": "grid" - }).findAll("li", { - "class": None, - "style": None - }) - items = items + BeautifulSoup(page.text, 'html.parser').findAll( - "div", {"class": "variableVideoLI"}) - media_set = pool.starmap(scrape_array, product(offset_array, [session])) - media_set = [x for x in media_set if x is not None] - media_set = list(chain.from_iterable(media_set)) + for b in a: + b = b * 100 + offset_array.append(link.replace("Page=0", "Page=" + str(b))) + media_set = format_media_set(pool.starmap(scrape_array, product( + offset_array, [session], [media_type]))) directory = j_directory - directory += "/"+site_name + "/"+username+"/"\ - + location+"/" - if "/sites/" == j_directory: - directory = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) + directory - else: - directory = directory + if post_count: + user_directory = directory+"/"+site_name + "/"+username+"/" + metadata_directory = user_directory+"/metadata/" + directory = user_directory + location+"/" + if "/sites/" == j_directory: + user_directory = os.path.dirname(os.path.dirname( + os.path.realpath(__file__))) + user_directory + metadata_directory = os.path.dirname(os.path.dirname( + 
os.path.realpath(__file__))) + metadata_directory + directory = os.path.dirname(os.path.dirname( + os.path.realpath(__file__))) + directory - print("DIRECTORY - " + directory) - if not os.path.exists(directory): - os.makedirs(directory) + if not only_links: + print("DIRECTORY - " + directory) + os.makedirs(directory, exist_ok=True) + os.makedirs(metadata_directory, exist_ok=True) - with open(directory + 'links.json', 'w') as outfile: - json.dump(media_set, outfile) + with open(metadata_directory+location+".json", 'w') as outfile: + json.dump(media_set, outfile) return [media_set, directory] @@ -280,7 +277,7 @@ def download_media(media, session, directory, username): if chunk: # filter out keep-alive new chunks f.write(chunk) os_name = platform.system() - if os_name != "macOS": + if os_name == "Windows": setctime(directory, timestamp) print(link) return True @@ -288,7 +285,8 @@ def download_media(media, session, directory, username): def reformat(directory2, file_name2, text, ext, date, username): path = format_path.replace("{username}", username) - text = BeautifulSoup(text, 'html.parser').get_text().replace("\n", " ").replace("\r", "").strip() + text = BeautifulSoup(text, 'html.parser').get_text().replace( + "\n", " ").replace("\r", "").strip() filtered_text = re.sub(r'[\\/*?:"<>|]', '', text) path = path.replace("{text}", filtered_text) date = date.strftime(date_format) diff --git a/modules/onlyfans.py b/modules/onlyfans.py index 4f194266f..e4bc6443b 100644 --- a/modules/onlyfans.py +++ b/modules/onlyfans.py @@ -1,6 +1,8 @@ import requests from bs4 import BeautifulSoup from win32_setctime import setctime +from modules.helpers import reformat +from modules.helpers import format_media_set import os import json @@ -26,13 +28,21 @@ format_path = json_settings['file_name_format'] overwrite_files = json_settings["overwrite_files"] date_format = json_settings["date_format"] +ignored_keywords = json_settings["ignored_keywords"] +maximum_length = 240 +text_length = int(json_settings["text_length"] + ) if json_settings["text_length"] else maximum_length +if text_length > maximum_length: + text_length = maximum_length max_threads = multiprocessing.cpu_count() def start_datascraper(session, username, site_name, app_token): - logging.basicConfig(filename='errors.log', level=logging.ERROR, - format='%(asctime)s %(levelname)s %(name)s %(message)s') + logging.basicConfig( + filename='errors.log', + level=logging.ERROR, + format='%(asctime)s %(levelname)s %(name)s %(message)s') user_id = link_check(session, app_token, username) if not user_id[0]: print(user_id[1]) @@ -45,9 +55,9 @@ def start_datascraper(session, username, site_name, app_token): link_array = {} for item in array: item[1].append(username) - only_links = item[1][3] + only_links = item[1][4] item[1].pop(3) - response = media_scraper(session, site_name, *item[1]) + response = media_scraper(session, site_name, only_links, *item[1]) link_array[item[1][1].lower()] = response[0] if not only_links: media_set = response[0] @@ -57,8 +67,7 @@ def start_datascraper(session, username, site_name, app_token): else: pool = ThreadPool(1) pool.starmap(download_media, product( - media_set, [session], [directory], [username])) - + media_set["valid"], [session], [directory], [username])) # When profile is done scraping, this function will return True return [True, link_array] @@ -86,7 +95,8 @@ def link_check(session, app_token, username): else: temp_user_id2[0] = True temp_user_id2[1] = str(y["id"]) - temp_user_id2[2] = y["postsCount"] + temp_user_id2[2] = 
[y["photosCount"], + y["videosCount"], y["audiosCount"]] return temp_user_id2 @@ -94,24 +104,28 @@ def scrape_choice(user_id, app_token, post_count): if auto_choice: input_choice = auto_choice else: - print('Scrape: a = Everything | b = Images | c = Videos') + print('Scrape: a = Everything | b = Images | c = Videos | d = Audios') print('Optional Arguments: -l = Only scrape links -()- Example: "a -l"') input_choice = input().strip() - image_api = "https://onlyfans.com/api2/v2/users/"+user_id+"/posts/photos?limit=100&offset=0&order=publish_date_" \ - "desc&app-token="+app_token+"" - video_api = "https://onlyfans.com/api2/v2/users/"+user_id+"/posts/videos?limit=100&offset=0&order=publish_date_" \ - "desc&app-token="+app_token+"" + image_api = "https://onlyfans.com/api2/v2/users/"+user_id + \ + "/posts/photos?limit=100&offset=0&order=publish_date_desc&app-token="+app_token+"" + video_api = "https://onlyfans.com/api2/v2/users/"+user_id + \ + "/posts/videos?limit=100&offset=0&order=publish_date_desc&app-token="+app_token+"" + audio_api = "https://onlyfans.com/api2/v2/users/"+user_id + \ + "/posts/audios?limit=100&offset=0&order=publish_date_desc&app-token="+app_token+"" # ARGUMENTS only_links = False if "-l" in input_choice: only_links = True input_choice = input_choice.replace(" -l", "") - mandatory = [j_directory, only_links, post_count] + mandatory = [j_directory, only_links] i_array = ["You have chosen to scrape images", [ - image_api, 'Images', *mandatory], 'Images Completed'] + image_api, 'Images', "photo", *mandatory, post_count[0]], 'Images Completed'] v_array = ["You have chosen to scrape videos", [ - video_api, 'Videos', *mandatory], 'Videos Completed'] - array = [i_array] + [v_array] + video_api, 'Videos', "video", *mandatory, post_count[1]], 'Videos Completed'] + a_array = ["You have chosen to scrape audio", [ + audio_api, 'Audios', "audio", *mandatory, post_count[2]], 'Audios Completed'] + array = [i_array] + [v_array] + [a_array] valid_input = False if input_choice == "a": valid_input = True @@ -121,6 +135,9 @@ def scrape_choice(user_id, app_token, post_count): if input_choice == "c": array = [array[1]] valid_input = True + if input_choice == "d": + array = [array[2]] + valid_input = True if valid_input: return array else: @@ -128,19 +145,32 @@ def scrape_choice(user_id, app_token, post_count): return False -def scrape_array(link, session): - media_set = [] +def scrape_array(link, session, media_type): + media_set = [[],[]] master_date = "00-00-0000" - r = session.get(link) - y = json.loads(r.text) - if not y: - return + count = 0 + found = False + while count < 10: + r = session.get(link) + y = json.loads(r.text) + if not y: + count += 1 + continue + found = True + break + if not found: + return media_set + x = 0 for media_api in y: for media in media_api["media"]: + if media["type"] != media_type: + x += 1 + continue if "source" in media: - file = media["source"]["source"] + source = media["source"] + file = source["source"] if not file: - return + continue if "ca2.convert" in file: file = media["preview"] new_dict = dict() @@ -154,38 +184,45 @@ def scrape_array(link, session): master_date = dt new_dict["text"] = media_api["text"] new_dict["postedAt"] = dt - media_set.append(new_dict) + + if source["size"] == 0: + media_set[1].append(new_dict) + continue + media_set[0].append(new_dict) return media_set -def media_scraper(session, site_name, link, location, directory, post_count, username): +def media_scraper(session, site_name, only_links, link, location, media_type, directory, 
post_count, username): print("Scraping "+location+". Should take less than a minute.") pool = ThreadPool(max_threads) - floor = math.floor(post_count / 100) - if floor == 0: - floor = 1 - a = list(range(floor)) + ceil = math.ceil(post_count / 100) + a = list(range(ceil)) offset_array = [] for b in a: b = b * 100 offset_array.append(link.replace("offset=0", "offset=" + str(b))) - media_set = pool.starmap(scrape_array, product(offset_array, [session])) - media_set = [x for x in media_set if x is not None] - media_set = list(chain.from_iterable(media_set)) + media_set = format_media_set(pool.starmap(scrape_array, product( + offset_array, [session], [media_type]))) directory = j_directory - directory += "/"+site_name + "/"+username+"/"\ - + location+"/" - if "/sites/" == j_directory: - directory = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) + directory - else: - directory = directory + if post_count: + user_directory = directory+"/"+site_name + "/"+username+"/" + metadata_directory = user_directory+"/metadata/" + directory = user_directory + location+"/" + if "/sites/" == j_directory: + user_directory = os.path.dirname(os.path.dirname( + os.path.realpath(__file__))) + user_directory + metadata_directory = os.path.dirname(os.path.dirname( + os.path.realpath(__file__))) + metadata_directory + directory = os.path.dirname(os.path.dirname( + os.path.realpath(__file__))) + directory - print("DIRECTORY - " + directory) - if not os.path.exists(directory): - os.makedirs(directory) + if not only_links: + print("DIRECTORY - " + directory) + os.makedirs(directory, exist_ok=True) + os.makedirs(metadata_directory, exist_ok=True) - with open(directory+'links.json', 'w') as outfile: - json.dump(media_set, outfile) + with open(metadata_directory+location+".json", 'w') as outfile: + json.dump(media_set, outfile) return [media_set, directory] @@ -193,8 +230,6 @@ def download_media(media, session, directory, username): while True: link = media["link"] r = session.head(link) - if r.status_code != 200: - return file_name = link.rsplit('/', 1)[-1] result = file_name.split("_", 1) if len(result) > 1: @@ -206,7 +241,7 @@ def download_media(media, session, directory, username): ext = ext.replace(".", "") date_object = datetime.strptime(media["postedAt"], "%d-%m-%Y %H:%M:%S") directory = reformat(directory, file_name, - media["text"], ext, date_object, username) + media["text"], ext, date_object, username, format_path, date_format, text_length, maximum_length) timestamp = date_object.timestamp() if not overwrite_files: if os.path.isfile(directory): @@ -219,32 +254,12 @@ def download_media(media, session, directory, username): if chunk: # filter out keep-alive new chunks f.write(chunk) os_name = platform.system() - if os_name != "macOS": + if os_name == "Windows": setctime(directory, timestamp) print(link) return True -def reformat(directory2, file_name2, text, ext, date, username): - path = format_path.replace("{username}", username) - text = BeautifulSoup(text, 'html.parser').get_text().replace( - "\n", " ").strip() - filtered_text = re.sub(r'[\\/*?:"<>|]', '', text) - path = path.replace("{text}", filtered_text) - date = date.strftime(date_format) - path = path.replace("{date}", date) - path = path.replace("{file_name}", file_name2) - path = path.replace("{ext}", ext) - directory2 += path - count_string = len(directory2) - if count_string > 259: - num_sum = count_string - 259 - directory2 = directory2.replace( - filtered_text, filtered_text[:-num_sum]) - - return directory2 - - def show_error(error): 
    print(error["error"]["message"])
    return
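For reference, a minimal sketch of the new metadata flow this patch introduces. format_media_set is the helper added to modules/helpers.py above; the sample posts and URL are invented for illustration.

# Each scrape worker now returns a [valid, invalid] pair; format_media_set
# merges those pairs into one dict, which media_scraper writes to the new
# metadata folder as <location>.json (this is what replaces links.json).
# Only the "valid" list is handed to download_media().
from modules.helpers import format_media_set

results = [
    [[{"post_id": "1", "link": "https://example.com/a.jpg"}], []],  # worker 1: one valid item
    [[], [{"post_id": "2", "link": ""}]],                           # worker 2: one invalid item (e.g. size 0)
]
media_set = format_media_set(results)
print(media_set["valid"])    # [{'post_id': '1', 'link': 'https://example.com/a.jpg'}]
print(media_set["invalid"])  # [{'post_id': '2', 'link': ''}]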