From f24c063b3511f3d5befacf95eded9540188c07ca Mon Sep 17 00:00:00 2001
From: CRIMINAL
Date: Mon, 23 Nov 2020 09:32:35 +0000
Subject: [PATCH] OFRenamer fix and increased speed

Basically, the script was moving files to their new location but then
deleting them straight after. Some cloud services would get confused and
put the remote file back in the remote root folder, because we deleted
the local file before the cloud service could register the new location.
I'd had this bug for a year; I finally found the cause and fixed it, and
now I get to delete thousands of duplicate images and folders whilst
lagging :)

You'll only notice the speed increase if scrape_names is False and
scrape_paid_content is True: we now skip fetching the subscriber list
when you're only scraping paid content.

Metadata also kept duplicates of posts that had been deleted by the
model. That's fixed as well, so some metadata files should shrink.
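For anyone skimming the diff: OFRenamer's update() body isn't shown in
this patch, so here's a rough, hypothetical sketch of the pattern that
changed (not the repo's exact code). The point is a single move with no
trailing delete:

    import os
    import shutil

    def update(old_filepath, new_filepath):
        # A move already removes the source; the bug was an extra delete
        # firing right after it, which a cloud-sync client could observe
        # before it registered the file's new location.
        new_directory = os.path.dirname(new_filepath)
        if new_directory:
            os.makedirs(new_directory, exist_ok=True)
        shutil.move(old_filepath, new_filepath)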
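On the metadata duplicates: entries without a media_id are matched by
link, and raw CDN links change between scrapes, so the comparison in
modules/onlyfans.py's test() now normalizes links first. A standalone
sketch of that normalization, covering the same two cases the diff
handles:

    import os
    from urllib.parse import urlparse

    def normalize_link(link):
        if "?" in link:
            # Drop the volatile query string (signed CDN parameters).
            return link.split("?", 1)[0]
        if ";ip=" in link:
            # Keep only the filename from ";ip="-style links.
            return os.path.basename(urlparse(link).path)
        return link

Two items then count as the same media when a normalized old link is a
substring of any of the new item's links.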
---
 apis/onlyfans/onlyfans.py       |   7 +-
 datascraper/main_datascraper.py |  12 +--
 extras/OFRenamer/start.py       |  11 +--
 helpers/main_helper.py          |  25 +++++--
 modules/onlyfans.py             | 125 ++++++++++++++++----------
 5 files changed, 95 insertions(+), 85 deletions(-)

diff --git a/apis/onlyfans/onlyfans.py b/apis/onlyfans/onlyfans.py
index 8c48a2a02..1defba59a 100644
--- a/apis/onlyfans/onlyfans.py
+++ b/apis/onlyfans/onlyfans.py
@@ -208,7 +208,7 @@ class start():
     def __init__(self, sessions=[], custom_request=callable) -> None:
         sessions = api_helper.copy_sessions(sessions)
         self.sessions = sessions
-        self.auth = None
+        self.auth = {}
         self.custom_request = custom_request
         self.auth_details = None
         self.max_threads = -1
@@ -381,7 +381,7 @@ def get_subscriptions(self, refresh=True, extra_info=True, limit=20, offset=0):
         if not refresh:
             subscriptions = authed.get(
                 "subscriptions")
-            if subscriptions:
+            if subscriptions is not None:
                 return subscriptions
         link = links(global_limit=limit, global_offset=offset).subscriptions
         session = self.sessions[0]
@@ -433,6 +433,7 @@ def multi(item, session=None):
             return valid_subscriptions
         pool = api_helper.multiprocessing()
         # offset_array = api_helper.assign_session(offset_array, self.sessions,key_two="session",show_item=True)
+        # offset_array= offset_array[:16]
         results += pool.starmap(multi, product(
             offset_array, [session]))
@@ -566,7 +567,7 @@ def get_mass_messages(self, resume=None, refresh=True, limit=10, offset=0):
         link = links(global_limit=limit,
                      global_offset=offset).mass_messages_api
         results = self.request(link=link)
-        items = results.get("list",[])
+        items = results.get("list", [])
         if not items:
             return items
         if resume:
diff --git a/datascraper/main_datascraper.py b/datascraper/main_datascraper.py
index 621587d9a..4fe656c24 100644
--- a/datascraper/main_datascraper.py
+++ b/datascraper/main_datascraper.py
@@ -93,6 +93,7 @@ def start_datascraper():
         site_name = "OnlyFans"
         subscription_array = []
         auth_count = -1
+        jobs = json_site_settings["jobs"]
         for json_auth in json_auth_array:
             api = OnlyFans.start(
                 original_sessions)
@@ -108,18 +109,17 @@ def start_datascraper():
             setup = module.account_setup(api)
             if not setup:
                 continue
-            jobs = json_site_settings["jobs"]
             if jobs["scrape_names"]:
                 array = module.manage_subscriptions(api, auth_count)
                 subscription_array += array
-            if jobs["scrape_paid_content"]:
-                paid_contents = api.get_paid_content()
-                paid_content = module.paid_content_scraper(api)
             apis.append(api)
         subscription_list = module.format_options(
             subscription_array, "usernames")
-        x = main_helper.process_names(
-            module, subscription_list, auto_scrape_names, json_auth_array, apis, json_config, site_name_lower, site_name)
+        if jobs["scrape_paid_content"]:
+            paid_content = module.paid_content_scraper(apis)
+        if jobs["scrape_names"]:
+            x = main_helper.process_names(
+                module, subscription_list, auto_scrape_names, json_auth_array, apis, json_config, site_name_lower, site_name)
         x = main_helper.process_downloads(apis, module)
         print
     elif site_name_lower == "starsavn":
diff --git a/extras/OFRenamer/start.py b/extras/OFRenamer/start.py
index 970ffcb7d..ecd5873bd 100644
--- a/extras/OFRenamer/start.py
+++ b/extras/OFRenamer/start.py
@@ -85,7 +85,7 @@ def update(old_filepath, new_filepath):
 def start(subscription, api_type, api_path, site_name, json_settings):
     metadata = getattr(subscription.scraped, api_type)
     download_info = subscription.download_info
-    base_directory = download_info["directory"]
+    root_directory = download_info["directory"]
     date_format = json_settings["date_format"]
     text_length = json_settings["text_length"]
     reformats = {}
@@ -99,7 +99,7 @@ def start(subscription, api_type, api_path, site_name, json_settings):
         option["username"] = username
         option["date_format"] = date_format
         option["maximum_length"] = text_length
-        option["directory"] = base_directory
+        option["directory"] = root_directory
         formatted = format_types(reformats).check_unique()
         unique = formatted["unique"]
         for key, value in reformats.items():
@@ -107,10 +107,11 @@ def start(subscription, api_type, api_path, site_name, json_settings):
             reformats[key] = value.split(key2, 1)[0]+key2
         print
         print
-    a, b, c = prepare_reformat(option, keep_vars=True).reformat(reformats)
+    a, base_directory, c = prepare_reformat(option, keep_vars=True).reformat(reformats)
+    download_info["base_directory"] = base_directory
     print
     all_files = []
-    for root, subdirs, files in os.walk(b):
+    for root, subdirs, files in os.walk(base_directory):
         x = [os.path.join(root, x) for x in files]
         all_files.extend(x)
     for media_type, value in metadata:
@@ -118,7 +119,7 @@ def start(subscription, api_type, api_path, site_name, json_settings):
             continue
         for status, value2 in value:
             fixed, new_directories = fix_directories(
-                value2, base_directory, site_name, api_path, media_type, username, all_files, json_settings)
+                value2, root_directory, site_name, api_path, media_type, username, all_files, json_settings)
             for new_directory in new_directories:
                 directory = os.path.abspath(new_directory)
                 os.makedirs(directory, exist_ok=True)
diff --git a/helpers/main_helper.py b/helpers/main_helper.py
index a357933bd..d19ea1f64 100644
--- a/helpers/main_helper.py
+++ b/helpers/main_helper.py
@@ -167,8 +167,9 @@ def format_image(filepath, timestamp):
             if os_name == "Windows":
                 from win32_setctime import setctime
                 setctime(filepath, timestamp)
-            print(filepath)
+            print(f"Updated Creation Time {filepath}")
             os.utime(filepath, (timestamp, timestamp))
+            print(f"Updated Modification Time {filepath}")
         except Exception as e:
             continue
         break
@@ -398,9 +399,10 @@ def update_config(json_config, file_name="config.json"):


 def choose_auth(array):
-    string = ""
     names = []
     array = [{"auth_count": -1, "username": "All"}]+array
+    string = ""
+    separator = " | "
     name_count = len(array)

     if name_count > 1:
@@ -410,7 +412,7 @@ def choose_auth(array):
             string += str(count)+" = "+name
             names.append(x)
             if count+1 != name_count:
-                string += " | "
+                string += separator
             count += 1


@@ -426,7 +428,8 @@ def choose_option(subscription_list, auto_scrape_names):
 def choose_option(subscription_list, auto_scrape_names):
     names = subscription_list[0]
     if names:
-        print("Names: Username = username | "+subscription_list[1])
+        separator = " | "
" | " + print(f"Names: Username = username {seperator} {subscription_list[1]}") if not auto_scrape_names: value = "1" value = input().strip() @@ -469,7 +472,7 @@ def process_downloads(apis, module): if download_info: module.download_media(api, subscription) delete_empty_directories( - download_info["model_directory"]) + download_info["base_directory"]) send_webhook(subscription) @@ -558,8 +561,13 @@ def start(directory): for root, dirnames, files in os.walk(directory, topdown=False): for dirname in dirnames: full_path = os.path.realpath(os.path.join(root, dirname)) - if not os.listdir(full_path): - os.rmdir(full_path) + contents = os.listdir(full_path) + if not contents: + shutil.rmtree(full_path, ignore_errors=True) + else: + content_count = len(contents) + if content_count ==1 and "desktop.ini" in contents: + shutil.rmtree(full_path, ignore_errors=True) x = start(directory) if os.path.exists(directory): if not os.listdir(directory): @@ -576,6 +584,7 @@ def multiprocessing(): def module_chooser(domain, json_sites): string = "Site: " + seperator = " | " site_names = [] wl = ["onlyfans"] bl = ["patreon"] @@ -590,7 +599,7 @@ def module_chooser(domain, json_sites): string += str(count)+" = "+x site_names.append(x) if count+1 != site_count: - string += " | " + string += seperator count += 1 string += "x = Exit" diff --git a/modules/onlyfans.py b/modules/onlyfans.py index 9b0a634db..7cd3b318f 100644 --- a/modules/onlyfans.py +++ b/modules/onlyfans.py @@ -76,6 +76,7 @@ def account_setup(api): status = False auth = api.login() if auth: + jobs = json_settings["jobs"] profile_directory = json_global_settings["profile_directories"][0] profile_directory = os.path.abspath(profile_directory) profile_directory = os.path.join(profile_directory, auth["username"]) @@ -90,7 +91,8 @@ def account_setup(api): export_archive(mass_messages, metadata_filepath, json_settings) # chats = api.get_chats() - subscriptions = api.get_subscriptions() + if jobs["scrape_names"]: + subscriptions = api.get_subscriptions() status = True return status @@ -279,41 +281,45 @@ def profile_scraper(api, site_name, api_type, username, text_length, base_direct break -def paid_content_scraper(api): - paid_contents = api.get_paid_content(refresh=False) - results = [] - for paid_content in paid_contents: - metadata_locations = {} - author = paid_content.get("author") - author = paid_content.get("fromUser", author) - subscription = create_subscription(author) - subscription.sessions = api.sessions - subscription.download_info["directory"] = j_directory - username = subscription.username - model_directory = os.path.join(j_directory, username) - api_type = paid_content["responseType"].capitalize()+"s" - subscription.download_info["metadata_locations"] = j_directory - subscription.download_info["metadata_locations"] = metadata_locations - site_name = "OnlyFans" - media_type = format_media_types() - formatted_directories = format_directories( - j_directory, site_name, username, metadata_directory_format, media_type, api_type) - metadata_directory = formatted_directories["metadata_directory"] - metadata_path = os.path.join( - metadata_directory, api_type+".json") - metadata_locations[api_type] = metadata_path - new_metadata = media_scraper([paid_content], api, - formatted_directories, username, api_type) - for directory in new_metadata["directories"]: - os.makedirs(directory, exist_ok=True) - api_path = os.path.join(api_type, "") - new_metadata_object = process_metadata( - api, new_metadata, formatted_directories, subscription, api_type, 
+                    new_metadata_set = new_metadata_object.convert()
+                    if export_metadata:
+                        export_archive(new_metadata_set,
+                                       metadata_path, json_settings)


 def format_media_types():
@@ -477,12 +483,10 @@ def process_metadata(api, new_metadata, formatted_directories, subscription, api
         if legacy_metadata_object:
             new_metadata_object = compare_metadata(
                 new_metadata_object, legacy_metadata_object)
-        if not subscription.download_info:
-            subscription.download_info["directory"] = j_directory
-            subscription.download_info["model_directory"] = os.path.join(
-                j_directory, subscription.username)
-            subscription.download_info["webhook"] = webhook
-            subscription.download_info["metadata_locations"] = {}
+        if not subscription.download_info:
+            subscription.download_info["metadata_locations"] = {}
+        subscription.download_info["directory"] = j_directory
+        subscription.download_info["webhook"] = webhook
         subscription.download_info["metadata_locations"][api_type] = archive_path
         subscription.set_scraped(api_type, new_metadata_object)
         new_metadata_object = ofrenamer.start(
@@ -670,8 +674,17 @@ def test(new_item, old_item):
     new_found = None
     if old_item.media_id == None:
         for link in old_item.links:
-            link = link.split("?")[0]
-            if any(link in new_link for new_link in new_item.links):
+            # Handle Links
+            if "?" in link:
+                link2 = link.split("?")[0]
+            elif ";ip=" in link:
+                a = urlparse(link)
+                link2 = os.path.basename(a.path)
+            else:
+                link2 = link
+                input(
+                    f"NEW LINK DETECTED, PLEASE OPEN AN ISSUE ON GITHUB AND PASTE THE NEW LINK THERE SO I CAN HANDLE THE LINK, THANKS.\nLINK: {link}")
+            if any(link2 in new_link for new_link in new_item.links):
                 new_found = new_item
                 break
     print
@@ -719,8 +732,8 @@ def compare_metadata(new_metadata: media_types, old_metadata: media_types) -> me
             if not old_items:
                 for a in old_status:
                     new_found = test(new_item, a)
-                    print
-                    break
+                    if new_found:
+                        break
             if not new_found:
                 old_status.append(new_item)
             print
@@ -865,21 +878,6 @@ def media_scraper(results, api, formatted_directories, username, api_type, paren
                 filename, ext = os.path.splitext(filename)
                 ext = ext.__str__().replace(".", "").split('?')[0]
                 price = new_dict["price"]
-                # media_directory = os.path.join(
-                #     model_directory, sorted_directories["unsorted"])
-                # new_dict["paid"] = False
-                # if new_dict["price"]:
-                #     if api_type in ["Messages", "Mass Messages"]:
-                #         new_dict["paid"] = True
-                #     else:
-                #         if media["id"] not in media_api["preview"] and media["canView"]:
-                #             new_dict["paid"] = True
-                # if sort_free_paid_posts:
-                #     media_directory = os.path.join(
-                #         model_directory, sorted_directories["free"])
-                #     if new_dict["paid"]:
-                #         media_directory = os.path.join(
-                #             model_directory, sorted_directories["paid"])
                 new_dict["text"] = text

                 option = {}
@@ -1072,6 +1070,7 @@ def format_options(f_list, choice_type):
     count = 0
     names = []
     string = ""
+    separator = " | "
     if name_count > 1:
         if "usernames" == choice_type:
             for x in f_list:
@@ -1079,8 +1078,8 @@ def format_options(f_list, choice_type):
                 name = x.username
                 string += str(count)+" = "+name
                 names.append([x.auth_count, name])
                 if count+1 != name_count:
-                    string += " | "
+                    string += separator
                 count += 1
         if "apis" == choice_type:
             names = f_list
@@ -1090,6 +1089,6 @@ def format_options(f_list, choice_type):
             name = api["api_type"]
             string += str(count)+" = "+name
             if count+1 != name_count:
-                string += " | "
+                string += separator
             count += 1
     return [names, string]