From 69d83b4db57be311d97a4c89b29fb8bc0ccb93f5 Mon Sep 17 00:00:00 2001
From: CRIMINAL
Date: Sun, 20 Oct 2019 19:23:39 +0100
Subject: [PATCH] Fixes and Features update

Fixes:
If your OS is not Windows, the script no longer calls any Windows-only functions.
The script now grabs all media unless it is locked behind a paywall or duplicated.
Any corrupt media is exported to archive.json.
All media now downloads to its proper folder. (No more images in the videos folder and vice versa.)

Features:
metadata/archive.json now contains valid and invalid posts. (This replaces links.json.)

Config Update:
ignored_keywords - Any post containing one of these words will be ignored.
text_length - Sets a maximum length for {text} when it is used in file_name_format.
boards - Enter any boards you'd like to scrape automatically.
---
 README.md | 22 ++++
 Start Datascraper.py | 18 ++-
 config.json | 12 +-
 modules/four_chan.py | 156 +++++++++++++++++++-----
 modules/helpers.py | 46 +++++--
 modules/justforfans.py | 266 ++++++++++++++++++++---------------------
 modules/onlyfans.py | 151 ++++++++++++-----------
 7 files changed, 417 insertions(+), 254 deletions(-)

diff --git a/README.md b/README.md
index 60b5897e6..aeeb78d03 100644
--- a/README.md
+++ b/README.md
@@ -59,6 +59,14 @@ file_name_format:
     Example: {date}/{text}-{file_name}.{ext}
     Warning: It's important to keep a unique identifier next to .{ext}. By default it's {file_name}, but it can be {date}-{text}.ext
 
+text_length:
+
+    Default = ""
+    Ideal = "50"
+    Max = "259"
+
+    When you use {text} in file_name_format, enter a number here to limit how many characters of the post text are used.
+
 auto_site_choice:
 
     Default = ""
@@ -101,6 +109,20 @@ multithreading:
 
     If set to false, you will download files 1 by 1. (If you don't have fast internet, may god help you.) I'd recommend leaving it set to true.
 
+boards:
+
+    Default = []
+    Example = ["s", "gif"]
+
+    Enter the names of any boards you want to scrape automatically.
+
+ignored_keywords:
+
+    Default = []
+    Example = ["ignore", "me"]
+
+    The script will ignore any content that contains one of these words.
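To make these settings concrete, here is a small illustrative sketch of how ignored_keywords and text_length are meant to behave. This is not code from the modules; the helper names below are made up, and the values are the example values shown above.

    # Example values from the README entries above.
    ignored_keywords = ["ignore", "me"]
    text_length = 50

    def should_skip(post_text):
        # A post is skipped when its lower-cased text contains any ignored keyword.
        lowered = post_text.lower()
        return any(keyword in lowered for keyword in ignored_keywords)

    def shorten_text(post_text):
        # {text} is cut to at most text_length characters before it is placed
        # into file_name_format.
        return post_text[:text_length]

    print(should_skip("Please IGNORE this post"))   # True
    print(len(shorten_text("a" * 100)))             # 50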
+ # OPTIONAL ARGUMENTS diff --git a/Start Datascraper.py b/Start Datascraper.py index a0c8219ec..3aba539d7 100644 --- a/Start Datascraper.py +++ b/Start Datascraper.py @@ -34,6 +34,7 @@ x = int(input()) site_name = site_names[x] json_auth = json_sites[site_name]["auth"] + json_site_settings = json_sites[site_name]["settings"] session = "" x = "" app_token = "" @@ -43,22 +44,27 @@ auth_hash = json_auth['auth_hash'] x = onlyfans session = x.create_session(user_agent, auth_id, auth_hash, app_token) + array = [] elif site_name == "justforfans": auth_id = json_auth['phpsessid'] auth_hash = json_auth['user_hash2'] x = justforfans session = x.create_session(user_agent, auth_id, auth_hash) + array = [] elif site_name == "4chan": x = four_chan session = x.create_session() + array = json_site_settings["boards"] if not session[0]: continue print('Input a '+site_name+' '+session[1]) - input_link = input().strip() - username = helpers.parse_links(site_name, input_link) - start_time = timeit.default_timer() session = session[0] - result = x.start_datascraper(session, username, site_name, app_token) - stop_time = str(int(timeit.default_timer() - start_time) / 60) - print('Task Completed in ' + stop_time + ' Minutes') + if not array: + array = [input().strip()] + for input_link in array: + username = helpers.parse_links(site_name, input_link) + start_time = timeit.default_timer() + result = x.start_datascraper(session, username, site_name, app_token) + stop_time = str(int(timeit.default_timer() - start_time) / 60) + print('Task Completed in ' + stop_time + ' Minutes') diff --git a/config.json b/config.json index b650f39bb..66fc95522 100644 --- a/config.json +++ b/config.json @@ -15,8 +15,10 @@ "settings": { "directory": "", "file_name_format": "{file_name}.{ext}", + "text_length": "", "overwrite_files": true, - "date_format": "%d-%m-%Y" + "date_format": "%d-%m-%Y", + "ignored_keywords": [] } }, "justforfans": { @@ -27,8 +29,10 @@ "settings": { "directory": "", "file_name_format": "{file_name}.{ext}", + "text_length": "", "overwrite_files": true, - "date_format": "%d-%m-%Y" + "date_format": "%d-%m-%Y", + "ignored_keywords": [] } }, "4chan": { @@ -36,9 +40,11 @@ "settings": { "directory": "", "file_name_format": "{file_name}.{ext}", + "text_length": "", "overwrite_files": false, "date_format": "%d-%m-%Y", - "ignore_thread_titles": [""] + "boards": [], + "ignored_keywords": [] } } diff --git a/modules/four_chan.py b/modules/four_chan.py index becf579ae..016f8c351 100644 --- a/modules/four_chan.py +++ b/modules/four_chan.py @@ -27,6 +27,12 @@ format_path = json_settings['file_name_format'] overwrite_files = json_settings["overwrite_files"] date_format = json_settings["date_format"] +ignored_keywords = json_settings["ignored_keywords"] +maximum_length = 240 +text_length = int(json_settings["text_length"] + ) if json_settings["text_length"] else maximum_length +if text_length > maximum_length: + text_length = maximum_length max_threads = multiprocessing.cpu_count() @@ -41,6 +47,7 @@ def start_datascraper(session, board_name, site_name, link_type=None): print(user_id[1]) print("First time? 
Did you forget to edit your config.json file?") return [False] + print("Board: " + board_name) array = scrape_choice(board_name) link_array = {} if multithreading: @@ -50,9 +57,7 @@ def start_datascraper(session, board_name, site_name, link_type=None): threads = board_scraper(session, array[0], "") archive_threads = board_scraper(session, array[1], "archive") threads = threads + archive_threads - print("Scraping Threads") - threads = pool.starmap(thread_scraper, - product(threads, [board_name], [session])) + print("Original Count: "+str(len(threads))) directory = j_directory directory += "/"+site_name + "/" + board_name + "/" if "/sites/" == j_directory: @@ -62,10 +67,16 @@ def start_datascraper(session, board_name, site_name, link_type=None): else: directory = directory + print("Scraping Threads") + threads = pool.starmap(thread_scraper, + product(threads, [board_name], [session], [directory])) + threads = [x for x in threads if x is not None] + print("Filtered Count: "+str(len(threads))) print("Downloading Media") - pool.starmap(download_media, - product(threads, [session], [directory], [board_name])) - + results = pool.starmap(download_media, + product(threads, [session], [directory], [board_name])) + count_results = str(len([x for x in threads if x is None])) + print("Valid Count: "+count_results) # When profile is done scraping, this function will return True return [True, link_array] @@ -103,49 +114,128 @@ def board_scraper(session, link, category): return threads -def thread_scraper(thread_id, board_name, session): - link = "http://a.4cdn.org/" + board_name + "/thread/" + str( - thread_id) + ".json" +def thread_scraper(thread_id, board_name, session, directory): + thread_id = str(thread_id) + link = "http://a.4cdn.org/" + board_name + "/thread/" + thread_id + ".json" r = session.get(link) - y = json.loads(r.text) - return y - + if r.status_code == 404: + return + try: + thread = json.loads(r.text) + thread_master = thread["posts"][0] + except Exception as e: + print(e, link) + return + if "archived" in thread_master: + location = "Archive" + else: + location = "Catalog" -def download_media(thread, session, directory, board_name): - thread_master = thread["posts"][0] - thread_id = str(thread_master["no"]) + if "sub" in thread_master: + title = thread_master["sub"].lower() + if any(ignored_keyword in title for ignored_keyword in ignored_keywords): + print("Removed From "+location+": ", title) + return + + if "com" in thread_master: + title = thread_master["com"].lower() + if any(ignored_keyword in title for ignored_keyword in ignored_keywords): + print("Removed From "+location+": ", title) + return text = "" if "sub" in thread_master: - text = thread_master["sub"] + text = thread_master["sub"][:text_length] else: - if "com" in thread_master: - text = thread_master["com"] + text = thread_master["com"][:text_length] text = BeautifulSoup(text, 'html.parser').get_text().replace( "\n", " ").strip() text = re.sub(r'[\\/*?:"<>|]', '', text) + thread["download_path"] = "" for post in thread["posts"]: + if "name" not in post: + post["name"] = "Anonymous" if "filename" in post: filename = str(post["tim"]) ext = post["ext"].replace(".", "") - link = "http://i.4cdn.org/" + board_name + "/" + filename+"."+ext + filename = post["filename"] new_directory = directory+"/"+text+" - "+thread_id+"/" if not text: new_directory = new_directory.replace(" - ", "") + date_object = datetime.fromtimestamp(post["time"]) - new_directory = reformat(new_directory, filename, text, ext, date_object, post["name"], 
format_path, - date_format) - if not overwrite_files: - if os.path.isfile(new_directory): - continue - r = session.get(link, stream=True) - if r.status_code != 404: - if not os.path.exists(os.path.dirname(new_directory)): - os.makedirs(os.path.dirname(new_directory)) - with open(new_directory, 'wb') as f: - for chunk in r.iter_content(chunk_size=1024): - if chunk: # filter out keep-alive new chunks - f.write(chunk) - print(link, new_directory) + og_filename = filename + download_path = os.path.dirname(reformat( + new_directory, filename, text, ext, date_object, post["name"], format_path, date_format, text_length, maximum_length)) + size = len(download_path) + size2 = len(thread["download_path"]) + if thread["download_path"]: + if len(download_path) < len(thread["download_path"]): + thread["download_path"] = download_path + else: + thread["download_path"] = download_path + return thread + + +def download_media(thread, session, directory, board_name): + try: + directory = thread["download_path"]+"/" + valid = False + for post in thread["posts"]: + if "filename" in post: + post["filename"] = re.sub( + r'[\\/*?:"<>|]', '', post["filename"]) + ext = post["ext"].replace(".", "") + filename = str(post["tim"])+"."+ext + link = "http://i.4cdn.org/" + board_name + "/" + filename + filename = post["filename"]+"."+ext + download_path = directory+filename + count_string = len(download_path) + if count_string > 259: + num_sum = count_string - 259 + post["filename"] = post["filename"][:50] + download_path = directory+post["filename"]+"."+ext + + if not overwrite_files: + count = 1 + found = False + og_filename = post["filename"] + while True: + if os.path.isfile(download_path): + remote_size = post["fsize"] + local_size = os.path.getsize(download_path) + if remote_size == local_size: + found = True + break + else: + download_path = directory+og_filename + \ + " ("+str(count)+")."+ext + count += 1 + continue + else: + found = False + break + if found: + continue + r = session.get(link, stream=True) + if r.status_code != 404: + if not os.path.exists(os.path.dirname(download_path)): + os.makedirs(os.path.dirname(download_path)) + with open(download_path, 'wb') as f: + for chunk in r.iter_content(chunk_size=1024): + if chunk: # filter out keep-alive new chunks + f.write(chunk) + print(download_path) + valid = True + if valid: + os.makedirs(directory, exist_ok=True) + with open(directory+'archive.json', 'w') as outfile: + json.dump(thread, outfile) + return thread + else: + return + except Exception as e: + print("ERROR", e, directory) + return def create_session(): diff --git a/modules/helpers.py b/modules/helpers.py index 4630303b1..280421f65 100644 --- a/modules/helpers.py +++ b/modules/helpers.py @@ -1,5 +1,6 @@ -from bs4 import BeautifulSoup import re +import os +from bs4 import BeautifulSoup def parse_links(site_name, input_link): @@ -19,21 +20,46 @@ def parse_links(site_name, input_link): return input_link -def reformat(directory, file_name, text, ext, date, username, format_path, date_format): +def reformat(directory, file_name, text, ext, date, username, format_path, date_format, text_length, maximum_length): path = format_path.replace("{username}", username) - text = BeautifulSoup(text, 'html.parser').get_text().replace("\n", " ").strip() + text = BeautifulSoup(text, 'html.parser').get_text().replace( + "\n", " ").strip() filtered_text = re.sub(r'[\\/*?:"<>|]', '', text) path = path.replace("{text}", filtered_text) date = date.strftime(date_format) path = path.replace("{date}", date) path = 
path.replace("{file_name}", file_name) path = path.replace("{ext}", ext) - directory += path - count_string = len(directory) - if count_string > 259: - num_sum = count_string - 259 - directory = directory.replace(filtered_text, filtered_text[:-num_sum]) - return directory - + directory2 = directory + path + count_string = len(directory2) + if count_string > maximum_length: + num_sum = count_string - maximum_length + directory2 = directory2.replace( + filtered_text, filtered_text[:text_length]) + count_string = len(directory2) + if count_string > maximum_length: + num_sum = count_string - maximum_length + directory2 = directory2.replace( + filtered_text, filtered_text[:-num_sum]) + count_string = len(directory2) + if count_string > maximum_length: + directory2 = directory + count_string = len(directory2) + if count_string > maximum_length: + num_sum = count_string - maximum_length + directory2 = directory2.replace( + filtered_text, filtered_text[:50]) + count_string = len(directory2) + if count_string > maximum_length: + directory2 = directory + return directory2 +def format_media_set(media_set): + x = {} + x["valid"] = [] + x["invalid"] = [] + for y in media_set: + x["valid"].extend(y[0]) + x["invalid"].extend(y[1]) + return x diff --git a/modules/justforfans.py b/modules/justforfans.py index c63d3e41f..506dd65ae 100644 --- a/modules/justforfans.py +++ b/modules/justforfans.py @@ -1,6 +1,8 @@ import requests from bs4 import BeautifulSoup from win32_setctime import setctime +from modules.helpers import reformat +from modules.helpers import format_media_set import os import json @@ -21,11 +23,17 @@ json_global_settings = json_config["settings"] auto_choice = json_global_settings["auto_choice"] multithreading = json_global_settings["multithreading"] -json_settings = json_config["supported"]["justforfans"]["settings"] +json_settings = json_config["supported"]["onlyfans"]["settings"] j_directory = json_settings['directory'] + "/sites/" format_path = json_settings['file_name_format'] overwrite_files = json_settings["overwrite_files"] date_format = json_settings["date_format"] +ignored_keywords = json_settings["ignored_keywords"] +maximum_length = 240 +text_length = int(json_settings["text_length"] + ) if json_settings["text_length"] else maximum_length +if text_length > maximum_length: + text_length = maximum_length max_threads = multiprocessing.cpu_count() @@ -41,14 +49,14 @@ def start_datascraper(session, username, site_name, app_token=None): print("First time? 
Did you forget to edit your config.json file?") return [False] - post_count = int(user_id[2]) + post_count = user_id[2] array = scrape_choice(username, post_count) link_array = {} for item in array: item[1].append(username) - only_links = item[1][3] + only_links = item[1][4] item[1].pop(3) - response = media_scraper(session, site_name, *item[1]) + response = media_scraper(session, site_name, only_links, *item[1]) link_array[item[1][1].lower()] = response[0] if not only_links: media_set = response[0] @@ -57,10 +65,8 @@ def start_datascraper(session, username, site_name, app_token=None): pool = ThreadPool(max_threads) else: pool = ThreadPool(1) - pool.starmap( - download_media, - product(media_set, [session], [directory], [username])) - + pool.starmap(download_media, product( + media_set["valid"], [session], [directory], [username])) # When profile is done scraping, this function will return True return [True, link_array] @@ -78,8 +84,11 @@ def link_check(session, username): else: temp_user_id2[0] = True temp_user_id2[1] = str(username) - temp_user_id2[2] = BeautifulSoup(r.text, 'html.parser').find("div", {"class": "profile-info-value"}).find("h3")\ - .get_text() + photo_count = int(BeautifulSoup(r.text, 'html.parser').findAll( + "div", {"class": "profile-info-value"})[2].find("h3").get_text()) + video_count = int(BeautifulSoup(r.text, 'html.parser').findAll( + "div", {"class": "profile-info-value"})[1].find("h3").get_text()) + temp_user_id2[2] = [photo_count, video_count] return temp_user_id2 @@ -91,22 +100,18 @@ def scrape_choice(username, post_count): print( 'Optional Arguments: -l = Only scrape links -()- Example: "a -l"') input_choice = input().strip() - image_api = "https://justfor.fans/" + username + "?tab=photos&PhotoTabPage=0&VideoTabPage=9999" - video_api = "https://justfor.fans/" + username + "?tab=videos&PhotoTabPage=9999&VideoTabPage=0" + image_api = "https://justfor.fans/" + username + "?tab=photos&PhotoTabPage=0" + video_api = "https://justfor.fans/" + username + "?tab=videos&VideoTabPage=0" # ARGUMENTS only_links = False if "-l" in input_choice: only_links = True input_choice = input_choice.replace(" -l", "") - mandatory = [j_directory, only_links, post_count] - i_array = [ - "You have chosen to scrape images", [image_api, 'Images', *mandatory], - 'Images Completed' - ] - v_array = [ - "You have chosen to scrape videos", [video_api, 'Videos', *mandatory], - 'Videos Completed' - ] + mandatory = [j_directory, only_links] + i_array = ["You have chosen to scrape images", [ + image_api, 'Images', "photo", *mandatory, post_count[0]], 'Images Completed'] + v_array = ["You have chosen to scrape videos", [ + video_api, 'Videos', "video", *mandatory, post_count[1]], 'Videos Completed'] array = [i_array] + [v_array] valid_input = False if input_choice == "a": @@ -124,128 +129,120 @@ def scrape_choice(username, post_count): return False -def scrape_array(link, session): - media_set = [] - utc_offset_timedelta = datetime.utcnow() - datetime.now() +def scrape_array(link, session, media_type): + media_set = [[], []] + master_date = "00-00-0000" r = session.get(link) - i_items = BeautifulSoup(r.text, - 'html.parser').find("ul", { - "class": "grid" - }).findAll("li", { - "class": None, - "style": None - }) - v_items = BeautifulSoup(r.text, 'html.parser').findAll( - "div", {"class": "variableVideoLI"}) - for x in i_items: - if x.find('figure').find('a') is not None: - img_src = x.find('figure').find('a').find('img')['src'] - check = img_src[:5] - if check == u"media": - img_url = 
"https://justfor.fans/" + img_src - try: - data_src = x.find('figure').find('a').find('img')['data-src'] - check = data_src[:5] + if r.status_code == 404: + return + utc_offset_timedelta = datetime.utcnow() - datetime.now() + if "photo" == media_type: + i_items = BeautifulSoup(r.text, + 'html.parser').find("ul", { + "class": "grid" + }).findAll("li", { + "class": None, + "style": None + }) + for x in i_items: + if x.find('figure').find('a') is not None: + img_src = x.find('figure').find('a').find('img')['src'] + check = img_src[:5] if check == u"media": - img_url = "https://justfor.fans/" + data_src - except KeyError: - pass - file = img_url - new_dict = dict() - new_dict["post_id"] = "https://justfor.fans/" + x.find( - 'figure').find('a')['href'] - new_dict["link"] = file - post_page = session.get(new_dict["post_id"]).text - new_dict["post_id"] = new_dict["post_id"].rsplit('=')[-1] - postdate = BeautifulSoup(post_page, 'html.parser').find("div", {"class": "timeline-item-header"}).\ - find('small').find('a').get_text().strip('\n') - local_datetime = datetime.strptime(postdate, "%B %d, %Y, %I:%M %p") - result_utc_datetime = local_datetime + utc_offset_timedelta - dt = result_utc_datetime.strftime("%d-%m-%Y %H:%M:%S") - post_text = BeautifulSoup(post_page, 'html.parser').find("div", {"class": "timeline-item-post"}).\ - find("div", {"class": "fr-view"}).get_text() - new_dict["text"] = re.sub(r'(\t[ ]+)', '', - post_text).replace('\n\t', '') - new_dict["postedAt"] = dt - media_set.append(new_dict) - for x in v_items: - if x.findAll('div') is not None: - file = x.find( - 'div', - id=lambda y: y and y.startswith('videopage')).find('a')['href'] - file = re.search(r"(https:\/\/autograph\.xvid\.com.+?)(?=')", - file)[0].replace('&', '&') - new_dict = dict() - new_dict["post_id"] = "https://justfor.fans/" + x.findAll( - 'a')[-1]['href'] - new_dict["link"] = file - post_page = session.get(new_dict["post_id"]).text - new_dict["post_id"] = new_dict["post_id"].rsplit('=')[-1] - postdate = BeautifulSoup(post_page, 'html.parser').find("div", {"class": "timeline-item-header"}).\ - find('small').find('a').get_text().strip('\n') - local_datetime = datetime.strptime(postdate, "%B %d, %Y, %I:%M %p") - result_utc_datetime = local_datetime + utc_offset_timedelta - dt = result_utc_datetime.strftime("%d-%m-%Y %H:%M:%S") - post_text = BeautifulSoup(post_page, 'html.parser').find( - "div", { - "class": "timeline-item-post" - }).find("div", { - "class": "fr-view" - }).get_text() - new_dict["text"] = re.sub(r'(\t[ ]*)', '', - post_text).replace('\n\t', '') - new_dict["postedAt"] = dt - media_set.append(new_dict) + img_url = "https://justfor.fans/" + img_src + try: + data_src = x.find('figure').find( + 'a').find('img')['data-src'] + check = data_src[:5] + if check == u"media": + img_url = "https://justfor.fans/" + data_src + except KeyError: + pass + file = img_url + new_dict = dict() + new_dict["post_id"] = "https://justfor.fans/" + x.find( + 'figure').find('a')['href'] + new_dict["link"] = file + post_page = session.get(new_dict["post_id"]).text + new_dict["post_id"] = new_dict["post_id"].rsplit('=')[-1] + postdate = BeautifulSoup(post_page, 'html.parser').find("div", {"class": "timeline-item-header"}).\ + find('small').find('a').get_text().strip('\n') + local_datetime = datetime.strptime( + postdate, "%B %d, %Y, %I:%M %p") + result_utc_datetime = local_datetime + utc_offset_timedelta + dt = result_utc_datetime.strftime("%d-%m-%Y %H:%M:%S") + post_text = BeautifulSoup(post_page, 'html.parser').find("div", {"class": 
"timeline-item-post"}).\ + find("div", {"class": "fr-view"}).get_text() + new_dict["text"] = re.sub(r'(\t[ ]+)', '', + post_text).replace('\n\t', '') + new_dict["postedAt"] = dt + media_set[0].append(new_dict) + elif "video" == media_type: + v_items = BeautifulSoup(r.text, 'html.parser').findAll( + "div", {"class": "variableVideoLI"}) + for x in v_items: + if x.findAll('div') is not None: + file = x.find( + 'div', + id=lambda y: y and y.startswith('videopage')).find('a')['href'] + file = re.search(r"(https:\/\/autograph\.xvid\.com.+?)(?=')", + file)[0].replace('&', '&') + new_dict = dict() + new_dict["post_id"] = "https://justfor.fans/" + x.findAll( + 'a')[-1]['href'] + new_dict["link"] = file + post_page = session.get(new_dict["post_id"]).text + new_dict["post_id"] = new_dict["post_id"].rsplit('=')[-1] + postdate = BeautifulSoup(post_page, 'html.parser').find("div", {"class": "timeline-item-header"}).\ + find('small').find('a').get_text().strip('\n') + local_datetime = datetime.strptime( + postdate, "%B %d, %Y, %I:%M %p") + result_utc_datetime = local_datetime + utc_offset_timedelta + dt = result_utc_datetime.strftime("%d-%m-%Y %H:%M:%S") + post_text = BeautifulSoup(post_page, 'html.parser').find( + "div", { + "class": "timeline-item-post" + }).find("div", { + "class": "fr-view" + }).get_text() + new_dict["text"] = re.sub(r'(\t[ ]*)', '', + post_text).replace('\n\t', '') + new_dict["postedAt"] = dt + media_set[0].append(new_dict) return media_set -def media_scraper(session, site_name, link, location, directory, post_count, username): +def media_scraper(session, site_name, only_links, link, location, media_type, directory, post_count, username): print("Scraping " + location + ". May take a few minutes.") pool = ThreadPool(max_threads) - i = 0 + ceil = math.ceil(post_count / 100) + a = list(range(ceil)) offset_array = [] - iter_link = link - page = session.get(iter_link) - items = BeautifulSoup(page.text, - 'html.parser').find("ul", { - "class": "grid" - }).findAll("li", { - "class": None, - "style": None - }) - items = items + BeautifulSoup(page.text, 'html.parser').findAll( - "div", {"class": "variableVideoLI"}) - while len(items) > 0: - offset_array.append(iter_link) - i += 1 - iter_link = link.replace("Page=0", "Page=" + str(i)) - page = session.get(iter_link) - items = BeautifulSoup(page.text, - 'html.parser').find("ul", { - "class": "grid" - }).findAll("li", { - "class": None, - "style": None - }) - items = items + BeautifulSoup(page.text, 'html.parser').findAll( - "div", {"class": "variableVideoLI"}) - media_set = pool.starmap(scrape_array, product(offset_array, [session])) - media_set = [x for x in media_set if x is not None] - media_set = list(chain.from_iterable(media_set)) + for b in a: + b = b * 100 + offset_array.append(link.replace("Page=0", "Page=" + str(b))) + media_set = format_media_set(pool.starmap(scrape_array, product( + offset_array, [session], [media_type]))) directory = j_directory - directory += "/"+site_name + "/"+username+"/"\ - + location+"/" - if "/sites/" == j_directory: - directory = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) + directory - else: - directory = directory + if post_count: + user_directory = directory+"/"+site_name + "/"+username+"/" + metadata_directory = user_directory+"/metadata/" + directory = user_directory + location+"/" + if "/sites/" == j_directory: + user_directory = os.path.dirname(os.path.dirname( + os.path.realpath(__file__))) + user_directory + metadata_directory = os.path.dirname(os.path.dirname( + 
os.path.realpath(__file__))) + metadata_directory + directory = os.path.dirname(os.path.dirname( + os.path.realpath(__file__))) + directory - print("DIRECTORY - " + directory) - if not os.path.exists(directory): - os.makedirs(directory) + if not only_links: + print("DIRECTORY - " + directory) + os.makedirs(directory, exist_ok=True) + os.makedirs(metadata_directory, exist_ok=True) - with open(directory + 'links.json', 'w') as outfile: - json.dump(media_set, outfile) + with open(metadata_directory+location+".json", 'w') as outfile: + json.dump(media_set, outfile) return [media_set, directory] @@ -280,7 +277,7 @@ def download_media(media, session, directory, username): if chunk: # filter out keep-alive new chunks f.write(chunk) os_name = platform.system() - if os_name != "macOS": + if os_name == "Windows": setctime(directory, timestamp) print(link) return True @@ -288,7 +285,8 @@ def download_media(media, session, directory, username): def reformat(directory2, file_name2, text, ext, date, username): path = format_path.replace("{username}", username) - text = BeautifulSoup(text, 'html.parser').get_text().replace("\n", " ").replace("\r", "").strip() + text = BeautifulSoup(text, 'html.parser').get_text().replace( + "\n", " ").replace("\r", "").strip() filtered_text = re.sub(r'[\\/*?:"<>|]', '', text) path = path.replace("{text}", filtered_text) date = date.strftime(date_format) diff --git a/modules/onlyfans.py b/modules/onlyfans.py index 4f194266f..e4bc6443b 100644 --- a/modules/onlyfans.py +++ b/modules/onlyfans.py @@ -1,6 +1,8 @@ import requests from bs4 import BeautifulSoup from win32_setctime import setctime +from modules.helpers import reformat +from modules.helpers import format_media_set import os import json @@ -26,13 +28,21 @@ format_path = json_settings['file_name_format'] overwrite_files = json_settings["overwrite_files"] date_format = json_settings["date_format"] +ignored_keywords = json_settings["ignored_keywords"] +maximum_length = 240 +text_length = int(json_settings["text_length"] + ) if json_settings["text_length"] else maximum_length +if text_length > maximum_length: + text_length = maximum_length max_threads = multiprocessing.cpu_count() def start_datascraper(session, username, site_name, app_token): - logging.basicConfig(filename='errors.log', level=logging.ERROR, - format='%(asctime)s %(levelname)s %(name)s %(message)s') + logging.basicConfig( + filename='errors.log', + level=logging.ERROR, + format='%(asctime)s %(levelname)s %(name)s %(message)s') user_id = link_check(session, app_token, username) if not user_id[0]: print(user_id[1]) @@ -45,9 +55,9 @@ def start_datascraper(session, username, site_name, app_token): link_array = {} for item in array: item[1].append(username) - only_links = item[1][3] + only_links = item[1][4] item[1].pop(3) - response = media_scraper(session, site_name, *item[1]) + response = media_scraper(session, site_name, only_links, *item[1]) link_array[item[1][1].lower()] = response[0] if not only_links: media_set = response[0] @@ -57,8 +67,7 @@ def start_datascraper(session, username, site_name, app_token): else: pool = ThreadPool(1) pool.starmap(download_media, product( - media_set, [session], [directory], [username])) - + media_set["valid"], [session], [directory], [username])) # When profile is done scraping, this function will return True return [True, link_array] @@ -86,7 +95,8 @@ def link_check(session, app_token, username): else: temp_user_id2[0] = True temp_user_id2[1] = str(y["id"]) - temp_user_id2[2] = y["postsCount"] + temp_user_id2[2] = 
[y["photosCount"], + y["videosCount"], y["audiosCount"]] return temp_user_id2 @@ -94,24 +104,28 @@ def scrape_choice(user_id, app_token, post_count): if auto_choice: input_choice = auto_choice else: - print('Scrape: a = Everything | b = Images | c = Videos') + print('Scrape: a = Everything | b = Images | c = Videos | d = Audios') print('Optional Arguments: -l = Only scrape links -()- Example: "a -l"') input_choice = input().strip() - image_api = "https://onlyfans.com/api2/v2/users/"+user_id+"/posts/photos?limit=100&offset=0&order=publish_date_" \ - "desc&app-token="+app_token+"" - video_api = "https://onlyfans.com/api2/v2/users/"+user_id+"/posts/videos?limit=100&offset=0&order=publish_date_" \ - "desc&app-token="+app_token+"" + image_api = "https://onlyfans.com/api2/v2/users/"+user_id + \ + "/posts/photos?limit=100&offset=0&order=publish_date_desc&app-token="+app_token+"" + video_api = "https://onlyfans.com/api2/v2/users/"+user_id + \ + "/posts/videos?limit=100&offset=0&order=publish_date_desc&app-token="+app_token+"" + audio_api = "https://onlyfans.com/api2/v2/users/"+user_id + \ + "/posts/audios?limit=100&offset=0&order=publish_date_desc&app-token="+app_token+"" # ARGUMENTS only_links = False if "-l" in input_choice: only_links = True input_choice = input_choice.replace(" -l", "") - mandatory = [j_directory, only_links, post_count] + mandatory = [j_directory, only_links] i_array = ["You have chosen to scrape images", [ - image_api, 'Images', *mandatory], 'Images Completed'] + image_api, 'Images', "photo", *mandatory, post_count[0]], 'Images Completed'] v_array = ["You have chosen to scrape videos", [ - video_api, 'Videos', *mandatory], 'Videos Completed'] - array = [i_array] + [v_array] + video_api, 'Videos', "video", *mandatory, post_count[1]], 'Videos Completed'] + a_array = ["You have chosen to scrape audio", [ + audio_api, 'Audios', "audio", *mandatory, post_count[2]], 'Audios Completed'] + array = [i_array] + [v_array] + [a_array] valid_input = False if input_choice == "a": valid_input = True @@ -121,6 +135,9 @@ def scrape_choice(user_id, app_token, post_count): if input_choice == "c": array = [array[1]] valid_input = True + if input_choice == "d": + array = [array[2]] + valid_input = True if valid_input: return array else: @@ -128,19 +145,32 @@ def scrape_choice(user_id, app_token, post_count): return False -def scrape_array(link, session): - media_set = [] +def scrape_array(link, session, media_type): + media_set = [[],[]] master_date = "00-00-0000" - r = session.get(link) - y = json.loads(r.text) - if not y: - return + count = 0 + found = False + while count < 10: + r = session.get(link) + y = json.loads(r.text) + if not y: + count += 1 + continue + found = True + break + if not found: + return media_set + x = 0 for media_api in y: for media in media_api["media"]: + if media["type"] != media_type: + x += 1 + continue if "source" in media: - file = media["source"]["source"] + source = media["source"] + file = source["source"] if not file: - return + continue if "ca2.convert" in file: file = media["preview"] new_dict = dict() @@ -154,38 +184,45 @@ def scrape_array(link, session): master_date = dt new_dict["text"] = media_api["text"] new_dict["postedAt"] = dt - media_set.append(new_dict) + + if source["size"] == 0: + media_set[1].append(new_dict) + continue + media_set[0].append(new_dict) return media_set -def media_scraper(session, site_name, link, location, directory, post_count, username): +def media_scraper(session, site_name, only_links, link, location, media_type, directory, 
post_count, username): print("Scraping "+location+". Should take less than a minute.") pool = ThreadPool(max_threads) - floor = math.floor(post_count / 100) - if floor == 0: - floor = 1 - a = list(range(floor)) + ceil = math.ceil(post_count / 100) + a = list(range(ceil)) offset_array = [] for b in a: b = b * 100 offset_array.append(link.replace("offset=0", "offset=" + str(b))) - media_set = pool.starmap(scrape_array, product(offset_array, [session])) - media_set = [x for x in media_set if x is not None] - media_set = list(chain.from_iterable(media_set)) + media_set = format_media_set(pool.starmap(scrape_array, product( + offset_array, [session], [media_type]))) directory = j_directory - directory += "/"+site_name + "/"+username+"/"\ - + location+"/" - if "/sites/" == j_directory: - directory = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) + directory - else: - directory = directory + if post_count: + user_directory = directory+"/"+site_name + "/"+username+"/" + metadata_directory = user_directory+"/metadata/" + directory = user_directory + location+"/" + if "/sites/" == j_directory: + user_directory = os.path.dirname(os.path.dirname( + os.path.realpath(__file__))) + user_directory + metadata_directory = os.path.dirname(os.path.dirname( + os.path.realpath(__file__))) + metadata_directory + directory = os.path.dirname(os.path.dirname( + os.path.realpath(__file__))) + directory - print("DIRECTORY - " + directory) - if not os.path.exists(directory): - os.makedirs(directory) + if not only_links: + print("DIRECTORY - " + directory) + os.makedirs(directory, exist_ok=True) + os.makedirs(metadata_directory, exist_ok=True) - with open(directory+'links.json', 'w') as outfile: - json.dump(media_set, outfile) + with open(metadata_directory+location+".json", 'w') as outfile: + json.dump(media_set, outfile) return [media_set, directory] @@ -193,8 +230,6 @@ def download_media(media, session, directory, username): while True: link = media["link"] r = session.head(link) - if r.status_code != 200: - return file_name = link.rsplit('/', 1)[-1] result = file_name.split("_", 1) if len(result) > 1: @@ -206,7 +241,7 @@ def download_media(media, session, directory, username): ext = ext.replace(".", "") date_object = datetime.strptime(media["postedAt"], "%d-%m-%Y %H:%M:%S") directory = reformat(directory, file_name, - media["text"], ext, date_object, username) + media["text"], ext, date_object, username, format_path, date_format, text_length, maximum_length) timestamp = date_object.timestamp() if not overwrite_files: if os.path.isfile(directory): @@ -219,32 +254,12 @@ def download_media(media, session, directory, username): if chunk: # filter out keep-alive new chunks f.write(chunk) os_name = platform.system() - if os_name != "macOS": + if os_name == "Windows": setctime(directory, timestamp) print(link) return True -def reformat(directory2, file_name2, text, ext, date, username): - path = format_path.replace("{username}", username) - text = BeautifulSoup(text, 'html.parser').get_text().replace( - "\n", " ").strip() - filtered_text = re.sub(r'[\\/*?:"<>|]', '', text) - path = path.replace("{text}", filtered_text) - date = date.strftime(date_format) - path = path.replace("{date}", date) - path = path.replace("{file_name}", file_name2) - path = path.replace("{ext}", ext) - directory2 += path - count_string = len(directory2) - if count_string > 259: - num_sum = count_string - 259 - directory2 = directory2.replace( - filtered_text, filtered_text[:-num_sum]) - - return directory2 - - def show_error(error): 
    print(error["error"]["message"])
    return
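For reference, a minimal sketch of the new metadata flow this patch introduces. format_media_set is the helper added to modules/helpers.py above; the sample posts and URL are invented for illustration.

# Each scrape worker now returns a [valid, invalid] pair; format_media_set
# merges those pairs into one dict, which media_scraper writes to the new
# metadata folder as <location>.json (this is what replaces links.json).
# Only the "valid" list is handed to download_media().
from modules.helpers import format_media_set

results = [
    [[{"post_id": "1", "link": "https://example.com/a.jpg"}], []],  # worker 1: one valid item
    [[], [{"post_id": "2", "link": ""}]],                           # worker 2: one invalid item (e.g. size 0)
]
media_set = format_media_set(results)
print(media_set["valid"])    # [{'post_id': '1', 'link': 'https://example.com/a.jpg'}]
print(media_set["invalid"])  # [{'post_id': '2', 'link': ''}]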