Crawler.py
"""
Collect webm from 4chan and broadcast them to FM radio using raspberry pi
I know, this sounds like a bad idea and it is,
but at this point there is no way back now
"""
import hashlib
import html
import json
import logging
import pickle
import re
import time
import _thread
import requests
import os
import argparse
import ffmpeg
from urllib.request import urlretrieve

# Default search parameters; they can be overridden by the command-line arguments parsed below.
search_word = "ygyl"
search_type = ["webm"]
search_board = ["wsg", "gif"]


class Piradio4Chan:
    storage_file = "storage.pkl"

    def __init__(self, keyword, input_file_types, input_boards, input_folder=None):
        self.headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) '
                                      'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
        self.keyword = keyword
        self.file_hashes = []
        self.file_types = input_file_types
        self.titles = {}
        self.reposts = []
        self.boards = input_boards
        self.playlist = []
        self.index = 0
        self.download_count = 0
        self.refresh_rate = 600
        self.base_dir = os.getcwd()
        self.folder = input_folder
        self.sleep_max = 10
        self.sleep_count = 0
        logging.basicConfig(level=logging.WARNING,
                            format='%(asctime)s %(levelname)s %(message)s',
                            datefmt='%a, %d %b %Y %H:%M:%S',
                            filename='links.log',
                            filemode='a')

    def store_to_file(self):
        '''Persist the collected file hashes, titles and known reposts to the storage file.'''
        data_to_store = {}
        data_to_store['file_hashes'] = self.file_hashes
        data_to_store['titles'] = self.titles
        data_to_store['reposts'] = self.reposts
        with open(self.storage_file, 'wb') as f:
            pickle.dump(data_to_store, f)

    def start(self):
        if not self.folder:
            # title-case the keyword, then strip whitespace and special characters
            folder = ''.join(x for x in self.keyword.title() if not x.isspace())
            self.folder = re.sub(r'[\W_]+', '', folder)[:15]
        self.folder_dir = os.path.join(self.base_dir, self.folder)
        self.storage_file = os.path.join(self.folder_dir, self.storage_file)
        if os.path.isfile(self.storage_file):
            with open(self.storage_file, 'rb') as f:
                data_to_load = pickle.load(f)
            # old save files stored only the hash list; new ones store a dict
            if isinstance(data_to_load, dict):
                self.file_hashes = data_to_load['file_hashes']
                self.titles = data_to_load['titles']
                self.reposts = data_to_load['reposts']
            else:
                self.file_hashes = data_to_load
                self.store_to_file()
                print("Updated the save file to the new format. Please rerun the process.")
                return
        print("Searching for {} in boards {} with file types {}".format(self.keyword, self.boards, self.file_types))
        if not os.path.exists(self.folder_dir):
            print("Creating folder: {}".format(self.folder_dir))
            os.mkdir(self.folder_dir)
        # start the downloader in a background thread
        _thread.start_new_thread(self.download, ("Downloader", 1))
        # run the collector in the main thread
        while True:
            self.collect()

    def download(self, name, thread_num):
        while True:
            if self.index < len(self.playlist):
                self.sleep_count = 0
                selected = self.playlist[self.index]
                self.index += 1
                board, thread_id, post_id, file_url = selected.split("@")
                link = "boards.4chan.org/{}/thread/{}".format(board, thread_id)
                filename = file_url.rsplit("/", 1)[-1]
                local_filename = os.path.join(self.folder_dir, filename)
                if os.path.isfile(local_filename):
                    print(f"Skipping {filename}: local file already exists.")
                    continue
                if filename in self.reposts:
                    print(f"Skipping {filename}: known repost.")
                    continue
                print("Downloading {} : No.{}. Downloaded {} files so far.".format(link, post_id, self.download_count))
                try:
                    time.sleep(1)
                    urlretrieve(file_url, local_filename)
                    # hash the downloaded file to detect exact reposts
                    sha1 = hashlib.sha1()
                    with open(local_filename, 'rb') as f:
                        while True:
                            data = f.read(65536)
                            if not data:
                                break
                            sha1.update(data)
                    new_hash = sha1.hexdigest()
                    if new_hash in self.file_hashes:
                        print("{} is a repost REEEEE!!".format(filename))
                        self.reposts.append(filename)
                        self.store_to_file()
                        os.remove(local_filename)
                        continue
                    if filename.endswith('.webm'):
                        # compare the embedded title and duration to catch re-encoded reposts
                        metadata = ffmpeg.probe(local_filename)
                        tags = metadata['format'].get('tags', {})
                        if 'title' in tags:
                            duration = float(metadata['format']['duration'])
                            song_title = tags['title']
                            if song_title in self.titles:
                                duration_diff = abs(self.titles[song_title] - duration)
                                if duration_diff < 1:
                                    print("{} is an edited repost REEEEE!!".format(filename))
                                    self.reposts.append(filename)
                                    self.store_to_file()
                                    os.remove(local_filename)
                                    continue
                                else:
                                    self.titles[song_title] = duration
                            else:
                                self.titles[song_title] = duration
                    self.file_hashes.append(new_hash)
                    # done processing, save the result
                    self.store_to_file()
                    self.download_count += 1
                except OSError:
                    logging.warning("Unable to download {}.".format(file_url))
                else:
                    logging.info("Downloaded {}.".format(file_url))
            # if the playlist is exhausted, back off exponentially
            else:
                sleep_time = pow(2, self.sleep_count)
                print("Empty job list, retrying after {} seconds...".format(sleep_time))
                time.sleep(sleep_time)
                if self.sleep_count < self.sleep_max:
                    self.sleep_count += 1

    # scan the boards and queue every new match for the downloader
    def collect(self):
        for board in self.boards:
            for page_num in range(1, 20):
                try:
                    page_data = requests.get('https://a.4cdn.org/{}/{}.json'.format(board, page_num),
                                             headers=self.headers)
                    page_data = json.loads(page_data.content.decode('utf-8'))
                except json.decoder.JSONDecodeError:  # past the last page, move on to the next board
                    break
                threads = page_data["threads"]
                print('>>Searching https://a.4cdn.org/{}/{}.json'.format(board, page_num))
                for thread in threads:
                    # flag marking whether the thread matches the keyword
                    qualified = False
                    # original post of the thread
                    op = thread["posts"][0]
                    # the OP's post number is also the thread id
                    op_id = op["no"]
                    # check the semantic URL, poster name and subject for the keyword
                    if self.keyword in op['semantic_url']:
                        qualified = True
                    if "name" in op:
                        if self.keyword.upper() in html.unescape(op['name']).upper():
                            qualified = True
                    if "sub" in op:
                        if self.keyword.upper() in html.unescape(op['sub']).upper():
                            qualified = True
                    # check the comment body if there is one
                    if "com" in op:
                        if self.keyword.upper() in html.unescape(op["com"]).upper():
                            qualified = True
                    if qualified:
                        thread_data = requests.get("https://a.4cdn.org/{}/thread/{}.json"
                                                   .format(board, op_id),
                                                   headers=self.headers)
                        thread_response = json.loads(thread_data.content.decode('utf-8'))
                        # posts is a list of dicts, one per post in the thread
                        posts = thread_response["posts"]
                        for post in posts:
                            # queue every attachment whose extension matches the requested file types
                            if "ext" in post and post["ext"][1:] in self.file_types:
                                download_link = "https://i.4cdn.org/{}/{}{}".format(board, post["tim"], post["ext"])
                                pending = board + "@" + str(op_id) + "@" + str(post["no"]) + "@" + download_link
                                if pending not in self.playlist:
                                    self.playlist.append(pending)
                time.sleep(1)  # sleep between pages
            time.sleep(10)  # sleep between boards
        time.sleep(300)  # sleep before the next full rescan
        return


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("-k", help="keyword to search for")
    parser.add_argument("-b", "--board", nargs='*', help="boards to crawl files from")
    parser.add_argument("-t", "--type", nargs="*", help="file types to crawl")
    parser.add_argument("-f", help="target folder")
    args = parser.parse_args()
    x = Piradio4Chan(search_word, search_type, search_board)
    if args.k:
        x.keyword = args.k
    if args.board:
        x.boards = args.board
    if args.type:
        x.file_types = args.type
    if args.f:
        x.folder = args.f
    x.start()
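
# Example invocation (a sketch; the keyword, boards and folder values below are only
# illustrative, while the flags themselves come from the argparse setup above):
#   python Crawler.py -k ygyl -b wsg gif -t webm -f Music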