From 433128f778554abc3ce1d1d4633bbffda64a5974 Mon Sep 17 00:00:00 2001 From: CRIMINAL Date: Mon, 21 Oct 2019 16:36:00 +0100 Subject: [PATCH] Export to CSV Added the option to export to CSV. Check the readme for the "export_type" options Update config :^) --- README.md | 9 +++++++++ config.json | 1 + modules/four_chan.py | 2 +- modules/helpers.py | 30 +++++++++++++++++++++++++++++- modules/justforfans.py | 8 +++----- modules/onlyfans.py | 9 +++------ 6 files changed, 46 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index aeeb78d03..21843e8f7 100644 --- a/README.md +++ b/README.md @@ -87,6 +87,15 @@ auto_choice: c = Videos You can automatically choose what you want to scrape if you add it in the config file. + +|**NEW**| export_type: + + Default = "json" + + a = "json" + b = "csv" + + You can export an archive to different formats. overwrite_files: diff --git a/config.json b/config.json index 66fc95522..5751149d4 100644 --- a/config.json +++ b/config.json @@ -2,6 +2,7 @@ "settings": { "auto_site_choice": "", "auto_choice": "", + "export_type": "json", "multithreading": true, "user-agent": "" }, diff --git a/modules/four_chan.py b/modules/four_chan.py index 302378d47..b18dc1af8 100644 --- a/modules/four_chan.py +++ b/modules/four_chan.py @@ -1,6 +1,6 @@ import requests from bs4 import BeautifulSoup -from modules.helpers import reformat +from modules.helpers import * import os import json diff --git a/modules/helpers.py b/modules/helpers.py index 0ed4672b2..2b65dfffb 100644 --- a/modules/helpers.py +++ b/modules/helpers.py @@ -2,8 +2,14 @@ import os from bs4 import BeautifulSoup import platform +import csv +import itertools +import json - +# Open config.json and fill in OPTIONAL information +json_config = json.load(open('config.json')) +json_global_settings = json_config["settings"] +export_type = json_global_settings["export_type"] def parse_links(site_name, input_link): if site_name in {"onlyfans", "justforfans"}: username = input_link.rsplit('/', 
def format_image(directory, timestamp):
    """Pass ``directory`` and ``timestamp`` to ``win32_setctime.setctime`` on Windows.

    On any other platform (as reported by ``platform.system()``) this does
    nothing.
    """
    os_name = platform.system()
    if os_name == "Windows":
        # Imported lazily so the package is only required when running on Windows.
        from win32_setctime import setctime
        setctime(directory, timestamp)


def _collect_fieldnames(rows):
    """Return every key used by ``rows`` in first-seen order, without duplicates."""
    return list(dict.fromkeys(key for row in rows for key in row))


def export_archive(data, archive_directory, export_format=None):
    """Write ``data`` next to ``archive_directory`` as a JSON or CSV archive.

    Args:
        data: Mapping with the keys ``"valid"`` and ``"invalid"``, each holding
            a list of dicts (one dict per archived item).
        archive_directory: Output path *without* extension; ``".json"`` or
            ``".csv"`` is appended depending on the format.
        export_format: ``"json"`` or ``"csv"``. When ``None`` (the default) the
            module-level ``export_type`` read from ``config.json`` is used.

    Raises:
        ValueError: If the requested format is neither ``"json"`` nor ``"csv"``.
            Previously such a value silently produced no archive at all.

    CSV layout: the first column has an empty header and holds ``"valid"`` or
    ``"invalid"``; the remaining columns are the union of all item keys in
    first-seen order. Keys missing from an item are written as empty cells.
    If there are no keys at all, an empty file is created.
    """
    if export_format is None:
        # Resolved at call time so an explicit argument never needs the global.
        export_format = export_type

    if export_format == "json":
        with open(archive_directory + ".json", "w", encoding="utf-8") as outfile:
            json.dump(data, outfile)
    elif export_format == "csv":
        labelled_rows = [
            (label, item)
            for label in ("valid", "invalid")
            for item in data[label]
        ]
        # Use the union of keys across *all* items: taking only the first
        # item's keys made DictWriter raise on any item with an extra key.
        fieldnames = _collect_fieldnames(item for _, item in labelled_rows)
        with open(archive_directory + ".csv", mode="w", newline="",
                  encoding="utf-8") as csv_file:
            # A single data column is valid too (the old ``> 1`` check
            # dropped such archives entirely).
            if fieldnames:
                writer = csv.DictWriter(
                    csv_file, fieldnames=[""] + fieldnames, restval="")
                writer.writeheader()
                for label, item in labelled_rows:
                    writer.writerow({"": label, **item})
    else:
        raise ValueError(
            "Unsupported export_type {!r}; expected \"json\" or \"csv\"".format(
                export_format))
reformat -from modules.helpers import format_media_set -from modules.helpers import format_image +from modules.helpers import * import os import json @@ -219,9 +217,8 @@ def media_scraper(session, site_name, only_links, link, location, media_type, di print("DIRECTORY - " + directory) os.makedirs(directory, exist_ok=True) os.makedirs(metadata_directory, exist_ok=True) - - with open(metadata_directory+location+".json", 'w') as outfile: - json.dump(media_set, outfile) + archive_directory = metadata_directory+location + export_archive(media_set, archive_directory) return [media_set, directory]