Features and QoL
If you choose to scrape all users, the script logic now does the following:

1. Scrape ALL users' links
2. Download ALL the users' links

[config]
Added auto_scrape_all.
Set it to true if you'd like to scrape all the names.

The script can now run automatically if you fill in auto_site_choice, auto_choice, and auto_scrape_all.

A 5-second delay has been implemented between each task so you don't get a 404.
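For readers who want the shape of the change without digging through the diff, here is a condensed, hedged sketch of the new per-site flow (the real loop in StartDatascraper.py below also handles name selection and timing; the exact placement of the 5-second pause is illustrative):

```python
import time

def run_site(scraper, session, names, site_name, app_token):
    """Condensed sketch of the two-phase flow introduced by this commit.

    `scraper` stands in for the imported site module (called `x` in
    StartDatascraper.py)."""
    download_list = []
    # Phase 1: scrape ALL users' links first.
    for name in names:
        result = scraper.start_datascraper(session, name, site_name, app_token)
        download_list.append(result)
    # Phase 2: download ALL the collected links.
    for result in download_list:
        for args in result[1]:  # result is [ok, prep_download]
            scraper.download_media(*args)
        time.sleep(5)  # pause between tasks so the site doesn't start returning 404s
```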
SecretShell committed Nov 7, 2019
1 parent ebfa317 commit 37f8c46
Showing 6 changed files with 180 additions and 130 deletions.
8 changes: 7 additions & 1 deletion README.md
@@ -89,7 +89,13 @@ auto_choice:

You can automatically choose what you want to scrape if you add it in the config file.

|**NEW**| export_type:
|**NEW**| auto_scrape_all:

Default = false

If set to true, the script will scrape all the names.

export_type:

Default = "json"

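The new flag defaults to false in every site's settings block (see the config.json diff below). Here is a hedged sketch of flipping it on for every site at once; since the full top-level layout of config.json is not shown in this commit, the blocks are located by their "settings" key rather than by hard-coded site names:

```python
import json

CONFIG_PATH = "config.json"  # assumed to sit next to the script

def iter_site_settings(node):
    """Yield every per-site 'settings' dict found anywhere in the config."""
    if isinstance(node, dict):
        if isinstance(node.get("settings"), dict):
            yield node["settings"]
        for value in node.values():
            yield from iter_site_settings(value)

with open(CONFIG_PATH, encoding="utf-8") as f:
    config = json.load(f)

for settings in iter_site_settings(config):
    settings["auto_scrape_all"] = True  # the flag added by this commit

with open(CONFIG_PATH, "w", encoding="utf-8") as f:
    json.dump(config, f, indent=4)
```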
20 changes: 16 additions & 4 deletions StartDatascraper.py
@@ -8,6 +8,7 @@
import timeit
import json
import logging
import time

# Configure logging to the console and file system at INFO level and above
logging.basicConfig(handlers=[logging.FileHandler('application.log', 'w', 'utf-8')], level=logging.INFO,
@@ -48,6 +49,8 @@
site_name = site_names[x]
json_auth = json_sites[site_name]["auth"]
json_site_settings = json_sites[site_name]["settings"]
auto_scrape_all = json_site_settings["auto_scrape_all"]
only_links = json_site_settings["auto_scrape_all"]
session = ""
x = ""
app_token = ""
@@ -77,20 +80,29 @@
names = array[0]
if names:
print("Names: "+array[1])
value = int(input().strip())
if not auto_scrape_all:
value = int(input().strip())
else:
value = 0
if value:
names = [names[value]]
else:
names.pop(0)
else:
print('Input a '+site_name+' '+session[1])
names = [input().strip()]
start_time = timeit.default_timer()
download_list = []
for name in names:
username = helpers.parse_links(site_name, name)
start_time = timeit.default_timer()
result = x.start_datascraper(
session[0], username, site_name, app_token)
stop_time = str(int(timeit.default_timer() - start_time) / 60)
print('Task Completed in ' + stop_time + ' Minutes')
download_list.append(result)
for y in download_list:
for arg in y[1]:
x.download_media(*arg)
stop_time = str(int(timeit.default_timer() - start_time) / 60)
print('Task Completed in ' + stop_time + ' Minutes')
time.sleep(5)
except KeyboardInterrupt as e:
print("Exiting Script")
3 changes: 3 additions & 0 deletions config.json
@@ -14,6 +14,7 @@
},
"settings": {
"auto_choice": "",
"auto_scrape_all": false,
"directory": "",
"file_name_format": "{file_name}.{ext}",
"text_length": "",
@@ -29,6 +30,7 @@
},
"settings": {
"auto_choice": "",
"auto_scrape_all": false,
"directory": "",
"file_name_format": "{file_name}.{ext}",
"text_length": "",
@@ -41,6 +43,7 @@
"auth": {},
"settings": {
"auto_choice": "",
"auto_scrape_all": false,
"directory": "",
"file_name_format": "{file_name}.{ext}",
"text_length": "",
141 changes: 76 additions & 65 deletions modules/four_chan.py
@@ -40,6 +40,7 @@


def start_datascraper(session, board_name, site_name, link_type=None):
print("Scrape Processing")
user_id = link_check(session, board_name)
if not user_id[0]:
print(user_id[1])
@@ -69,14 +70,14 @@ def start_datascraper(session, board_name, site_name, link_type=None):
threads = pool.starmap(thread_scraper,
product(threads, [board_name], [session], [directory]))
threads = [x for x in threads if x is not None]
print("Filtered Count: "+str(len(threads)))
post_count = len(threads)
print("Valid Count: "+str(post_count))
print("Downloading Media")
results = pool.starmap(download_media,
product(threads, [session], [directory], [board_name]))
count_results = str(len([x for x in threads if x is None]))
print("Valid Count: "+count_results)
print("Invalid Count: "+count_results)
prep_download = [[threads, session, directory, board_name]]
# When profile is done scraping, this function will return True
return [True, link_array]
return [True, prep_download]


def link_check(session, username):
Expand Down Expand Up @@ -173,69 +174,79 @@ def thread_scraper(thread_id, board_name, session, directory):
return thread


def download_media(thread, session, directory, board_name):
try:
directory = thread["download_path"]+"/"
valid = False
name_key = "filename"
for post in thread["posts"]:
if name_key in post:
post["tim"] = str(post["tim"])
post[name_key] = re.sub(
r'[\\/*?:"<>|]', '', post[name_key])
ext = post["ext"].replace(".", "")
filename = post["tim"]+"."+ext
link = "http://i.4cdn.org/" + board_name + "/" + filename
filename = post[name_key]+"."+ext
download_path = directory+filename
count_string = len(download_path)
if count_string > maximum_length:
num_sum = count_string - maximum_length
name_key = "tim"
download_path = directory+post[name_key]+"."+ext
def download_media(media_set, session, directory, board_name):
def download(thread, session, directory):
try:
directory = thread["download_path"]+"/"
valid = False
name_key = "filename"
for post in thread["posts"]:
if name_key in post:
post["tim"] = str(post["tim"])
post[name_key] = re.sub(
r'[\\/*?:"<>|]', '', post[name_key])
ext = post["ext"].replace(".", "")
filename = post["tim"]+"."+ext
link = "http://i.4cdn.org/" + board_name + "/" + filename
filename = post[name_key]+"."+ext
download_path = directory+filename
count_string = len(download_path)
if count_string > maximum_length:
num_sum = count_string - maximum_length
name_key = "tim"
download_path = directory+post[name_key]+"."+ext

if not overwrite_files:
count = 1
found = False
og_filename = post[name_key]
while True:
if os.path.isfile(download_path):
remote_size = post["fsize"]
local_size = os.path.getsize(download_path)
if remote_size == local_size:
found = True
break
if not overwrite_files:
count = 1
found = False
og_filename = post[name_key]
while True:
if os.path.isfile(download_path):
remote_size = post["fsize"]
local_size = os.path.getsize(download_path)
if remote_size == local_size:
found = True
break
else:
download_path = directory+og_filename + \
" ("+str(count)+")."+ext
count += 1
continue
else:
download_path = directory+og_filename + \
" ("+str(count)+")."+ext
count += 1
continue
else:
found = False
break
if found:
continue
r = session.get(link, stream=True)
if r.status_code != 404:
if not os.path.exists(os.path.dirname(download_path)):
os.makedirs(os.path.dirname(download_path))
with open(download_path, 'wb') as f:
for chunk in r.iter_content(chunk_size=1024):
if chunk: # filter out keep-alive new chunks
f.write(chunk)
logger.info("Link: {}".format(link))
logger.info("Path: {}".format(download_path))
valid = True
if valid:
os.makedirs(directory, exist_ok=True)
with open(directory+'archive.json', 'w') as outfile:
json.dump(thread, outfile)
return thread
else:
found = False
break
if found:
continue
r = session.get(link, stream=True)
if r.status_code != 404:
if not os.path.exists(os.path.dirname(download_path)):
os.makedirs(os.path.dirname(download_path))
with open(download_path, 'wb') as f:
for chunk in r.iter_content(chunk_size=1024):
if chunk: # filter out keep-alive new chunks
f.write(chunk)
logger.info("Link: {}".format(link))
logger.info("Path: {}".format(download_path))
valid = True
if valid:
os.makedirs(directory, exist_ok=True)
with open(directory+'archive.json', 'w') as outfile:
json.dump(thread, outfile)
return thread
else:
return
except Exception as e:
print("ERROR", e, directory)
return
except Exception as e:
print("ERROR", e, directory)
return
print("Download Processing")
print("Name: "+board_name)
print("Directory: " + directory)
# print("Downloading "+post_count+" "+location)
if multithreading:
pool = ThreadPool(max_threads)
else:
pool = ThreadPool(1)
pool.starmap(download, product(media_set, [session], [directory]))


def create_session():
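The restructured four_chan.py download loop keeps its existing skip-or-rename behavior, which is easy to lose in the diff: a file with the same size as the remote fsize is skipped, name collisions get a numbered suffix, and over-long paths fall back to the post's numeric tim id. A hedged, self-contained sketch of that logic (255 is only a placeholder for the module's maximum_length setting):

```python
import os
from typing import Optional

MAXIMUM_LENGTH = 255  # placeholder for the module's maximum_length setting

def resolve_download_path(directory: str, filename: str, tim: str, ext: str,
                          remote_size: int) -> Optional[str]:
    """Return a path to write to, or None if an identical file already exists."""
    path = directory + filename + "." + ext
    if len(path) > MAXIMUM_LENGTH:
        # Over-long name: fall back to the post's numeric `tim` id.
        path = directory + tim + "." + ext
    base = path[: -(len(ext) + 1)]  # strip the "." + ext suffix
    count = 1
    while os.path.isfile(path):
        if os.path.getsize(path) == remote_size:  # same size as the remote fsize
            return None                           # already downloaded, skip it
        path = base + " (" + str(count) + ")." + ext
        count += 1
    return path
```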
67 changes: 39 additions & 28 deletions modules/justforfans.py
@@ -38,6 +38,7 @@


def start_datascraper(session, username, site_name, app_token=None):
print("Scrape Processing")
user_id = link_check(session, username)
if not user_id[0]:
print(user_id[1])
@@ -47,23 +48,23 @@ def start_datascraper(session, username, site_name, app_token=None):
post_count = user_id[2]
array = scrape_choice(username, post_count)
link_array = {}
prep_download = []
for item in array:
item[1].append(username)
only_links = item[1][4]
post_count = str(item[1][5])
item[1].pop(3)
response = media_scraper(session, site_name, only_links, *item[1])
link_array[item[1][1].lower()] = response[0]
if not only_links:
media_set = response[0]
if not media_set["valid"]:
continue
directory = response[1]
if multithreading:
pool = ThreadPool(max_threads)
else:
pool = ThreadPool(1)
pool.starmap(download_media, product(
media_set["valid"], [session], [directory], [username]))
location = item[1][1]
prep_download.append([media_set["valid"], session, directory, username, post_count, location])
# When profile is done scraping, this function will return True
return [True, link_array]
return [True, prep_download]


def link_check(session, username):
@@ -259,28 +260,38 @@ def media_scraper(session, site_name, only_links, link, location, media_type, di
return [media_set, directory]


def download_media(media, session, directory, username):
while True:
link = media["link"]
r = session.head(link)
def download_media(media_set, session, directory, username, post_count, location):
def download(media, session, directory, username):
while True:
link = media["link"]
r = session.head(link)

date_object = datetime.strptime(media["postedAt"], "%d-%m-%Y %H:%M:%S")
directory = media["directory"]+media["filename"]
timestamp = date_object.timestamp()
if not overwrite_files:
if os.path.isfile(directory):
return
if not os.path.exists(os.path.dirname(directory)):
os.makedirs(os.path.dirname(directory))
r = session.get(link, allow_redirects=True, stream=True)
with open(directory, 'wb') as f:
for chunk in r.iter_content(chunk_size=1024):
if chunk: # filter out keep-alive new chunks
f.write(chunk)
format_image(directory, timestamp)
logger.info("Link: {}".format(link))
logger.info("Path: {}".format(directory))
return True
date_object = datetime.strptime(media["postedAt"], "%d-%m-%Y %H:%M:%S")
directory = media["directory"]+media["filename"]
timestamp = date_object.timestamp()
if not overwrite_files:
if os.path.isfile(directory):
return
if not os.path.exists(os.path.dirname(directory)):
os.makedirs(os.path.dirname(directory))
r = session.get(link, allow_redirects=True, stream=True)
with open(directory, 'wb') as f:
for chunk in r.iter_content(chunk_size=1024):
if chunk: # filter out keep-alive new chunks
f.write(chunk)
format_image(directory, timestamp)
logger.info("Link: {}".format(link))
logger.info("Path: {}".format(directory))
return True
print("Download Processing")
print("Name: "+username)
print("Directory: " + directory)
print("Downloading "+post_count+" "+location)
if multithreading:
pool = ThreadPool(max_threads)
else:
pool = ThreadPool(1)
pool.starmap(download, product(media_set, [session], [directory], [username]))


def create_session(user_agent, phpsessid, user_hash2):
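justforfans.py keeps stamping each downloaded file with its postedAt date via the project's format_image() helper. Assuming that helper (at least) sets the file's modification time, which is not shown in this diff, the equivalent minimal step would look like this; the date format string is the one used above, and the example value is made up:

```python
import os
from datetime import datetime

def stamp_posted_at(path: str, posted_at: str) -> None:
    """Set a downloaded file's timestamps from its 'postedAt' string."""
    ts = datetime.strptime(posted_at, "%d-%m-%Y %H:%M:%S").timestamp()
    os.utime(path, (ts, ts))  # access time and modification time

# e.g. stamp_posted_at("photo.jpg", "07-11-2019 18:30:00")
```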