Even Faster Than Before
The script will now make multiple requests at once to find all the media. You'll notice the improvement for accounts that have 200+ posts.
I was scraping an account with 300+ posts and it used to take 20+ seconds. With this update, it takes 10 seconds.
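Roughly, the change works like the sketch below (a simplified illustration, not the script's exact code: fetch_page, fetch_all, and the thread count are placeholder names and values I'm using to show the idea):

# Hypothetical sketch of the approach: build one URL per 100-post page up front,
# fetch every page concurrently with a thread pool, then flatten the results.
import math
from itertools import chain
from multiprocessing.dummy import Pool as ThreadPool

import requests


def fetch_page(url):
    # One request per page of up to 100 posts; returns that page's JSON, or None on failure.
    r = requests.get(url)
    return r.json() if r.ok else None


def fetch_all(base_url, post_count, page_size=100, threads=8):
    pages = max(1, math.floor(post_count / page_size))
    urls = [base_url.replace("offset=0", "offset=" + str(i * page_size))
            for i in range(pages)]
    with ThreadPool(threads) as pool:
        results = pool.map(fetch_page, urls)
    # Drop failed pages and merge the rest into a single flat list of posts.
    return list(chain.from_iterable(p for p in results if p))

In the diff below, scrape_array plays the role of the per-page fetch and media_scraper builds the offset URLs and runs them through the thread pool.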
SecretShell committed Sep 14, 2019
1 parent 93633ad commit aab45e3
Showing 1 changed file with 55 additions and 43 deletions.
98 changes: 55 additions & 43 deletions OnlyFans.py
@@ -1,18 +1,22 @@
import requests
from bs4 import BeautifulSoup
from urllib.request import urlretrieve
from win32_setctime import setctime

import os
import json
from itertools import product
from itertools import chain
import multiprocessing
from multiprocessing import current_process, Pool
from multiprocessing.dummy import Pool as ThreadPool
import requests
from bs4 import BeautifulSoup
from urllib.request import urlretrieve
from datetime import datetime
import re
import logging
import inspect
import time
from win32_setctime import setctime
import math


# Open config.json and fill in mandatory information for the script to work
json_config = json.load(open('config.json'))
@@ -42,7 +46,7 @@
session.cookies.set(**auth_cookie)
session.headers = {
    'User-Agent': user_agent, 'Referer': 'https://onlyfans.com/'}
logging.basicConfig(filename='errors.log', level=logging.DEBUG,
logging.basicConfig(filename='errors.log', level=logging.ERROR,
                    format='%(asctime)s %(levelname)s %(name)s %(message)s')


@@ -69,6 +73,7 @@ def link_check():
    else:
        temp_user_id2[0] = True
        temp_user_id2[1] = str(y["id"])
        temp_user_id2[2] = y["postsCount"]
    return temp_user_id2


@@ -127,38 +132,48 @@ def reformat(directory2, file_name2, text, ext, date):
    return directory2


def media_scraper(link, location, directory, only_links):
    next_page = True
    next_offset = 0
    media_set = dict([])
    media_count = 0
def scrape_array(link):
    media_set = []
    master_date = "00-00-0000"
    while next_page:
        offset = next_offset
        r = session.get(link)
        y = json.loads(r.text)
        if not y:
            break
        for media_api in y:
            for media in media_api["media"]:
                if "source" in media:
                    file = media["source"]["source"]
                    if "ca2.convert" in file:
                        file = media["preview"]
                    media_set[media_count] = {}
                    media_set[media_count]["link"] = file
                    if media_api["postedAt"] == "-001-11-30T00:00:00+00:00":
                        dt = master_date
                    else:
                        dt = datetime.fromisoformat(media_api["postedAt"]).replace(tzinfo=None).strftime(
                            "%d-%m-%Y %H:%M:%S")
                    master_date = dt
                    media_set[media_count]["text"] = media_api["text"]
                    media_set[media_count]["postedAt"] = dt
                    media_count += 1
        next_offset = offset + 100
        link = link.replace("offset="+str(offset), "offset="+str(next_offset))
    # path = reformat(media_set)
    r = session.get(link)
    y = json.loads(r.text)
    if not y:
        return
    for media_api in y:
        for media in media_api["media"]:
            if "source" in media:
                file = media["source"]["source"]
                if "ca2.convert" in file:
                    file = media["preview"]
                new_dict = dict()
                new_dict["post_id"] = media_api["id"]
                new_dict["link"] = file
                if media_api["postedAt"] == "-001-11-30T00:00:00+00:00":
                    dt = master_date
                else:
                    dt = datetime.fromisoformat(media_api["postedAt"]).replace(tzinfo=None).strftime(
                        "%d-%m-%Y %H:%M:%S")
                master_date = dt
                new_dict["text"] = media_api["text"]
                new_dict["postedAt"] = dt
                media_set.append(new_dict)
    return media_set


def media_scraper(link, location, directory, only_links):
    max_threads = multiprocessing.cpu_count()
    pool = ThreadPool(max_threads)
    floor = math.floor(post_count / 100)
    if floor == 0:
        floor = 1
    a = list(range(floor))
    offset_array = []
    for b in a:
        b = b * 100
        offset_array.append(link.replace("offset=0", "offset=" + str(b)))
    media_set = pool.starmap(scrape_array, product(offset_array))
    media_set = [x for x in media_set if x is not None]
    media_set = list(chain.from_iterable(media_set))
    if "/Users/" == directory:
        directory = os.path.dirname(os.path.realpath(__file__))+"/Users/"+username+"/"+location+"/"
    else:
@@ -171,15 +186,11 @@ def media_scraper(link, location, directory, only_links):
    with open(directory+'links.json', 'w') as outfile:
        json.dump(media_set, outfile)
    if not only_links:
        max_threads = multiprocessing.cpu_count()
        pool = ThreadPool(max_threads)
        pool.starmap(download_media, product(media_set.items(), [directory]))
        pool.starmap(download_media, product(media_set, [directory]))


def download_media(media_set, directory):
def download_media(media, directory):
    while True:
        function_name = inspect.stack()[0][3]
        media = media_set[1]
        link = media["link"]
        r = session.head(link)
        if r.status_code != 200:
@@ -202,7 +213,6 @@ def download_media(media_set, directory):
            urlretrieve(link, directory)
            setctime(directory, timestamp)
            print(link)
            time.sleep(10)
        return


@@ -216,6 +226,8 @@ def download_media(media_set, directory):
        print(user_id[1])
        print("First time? Did you forget to edit your config.json file?")
        continue

    post_count = user_id[2]
    user_id = user_id[1]
    scrape_choice()
    print('Task Completed!')
