-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathScraperV2.py
346 lines (271 loc) · 12.5 KB
/
ScraperV2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
#############################
##Project Site "Cyclone" Scraper v2
##This thing scrapes everything from the CSE 232 site: Videos, Worksheets, Lecture Notes, and Project Files
##You need to supply your CSE username and password
#############################
################################################################################################################
###Checking dependencies
import os
import subprocess
import sys

# Each third-party import is probed first so a missing package produces a
# readable message and an install attempt instead of a bare traceback.
# Only ImportError is caught: any other failure is a real error and must surface.
try:
    import requests
    print ("Request library is good!! ")
    print()
except ImportError:
    print("Request Library not found. Trying to install it. If not successfull then try it manually")
    # Run pip through the interpreter executing this script so the package is
    # installed into the same environment (a bare `pip` on PATH may not be).
    subprocess.call([sys.executable, '-m', 'pip', 'install', 'requests'])
try:
    print("Importing other libraries")
    from requests.auth import HTTPBasicAuth
    from urllib.parse import urljoin
    print()
except ImportError:
    print("Just get those libraries man!!!")
try:
    from bs4 import BeautifulSoup
    print("BeautifulSoup library is good!! ")
    print()
except ImportError:
    print("BeautifulSoup Library not found. Trying to install it. If not successfull then try it manually")
    subprocess.call([sys.executable, '-m', 'pip', 'install', 'beautifulsoup4'])

# Definitive imports: these run after the install attempts above, so a package
# that was just installed is picked up here (and a still-missing one fails loudly).
import re
import requests
from bs4 import BeautifulSoup  # getting the BeautifulSoup library
from requests.auth import HTTPBasicAuth
from urllib.parse import urljoin
import posixpath
import urllib.parse
try:
    import msvcrt as m
except ImportError:
    # msvcrt exists only on Windows. Nothing visible in this file uses the
    # alias, so its absence must not stop the script on other platforms.
    m = None
################################################################################################################
################################################################################################################
# Credentials sent as HTTP basic auth with the page and file requests below.
# Both are empty in the source; fill them in before running.
UNAME=""
PASSWORD=""
################################################################################################################
################################################################################################################
###Actual Code
# Course site root: fetched by main() and used as the base when resolving
# links found on that page.
baseURL='http://www.cse.msu.edu/~cse232/'
# Directory containing this script, captured at import time; main() changes
# back into it after each download batch.
basePath=os.path.dirname(os.path.abspath(__file__))
def returnBeautifiedObject(link,USERNAME='',PASS=''):
    """Fetch ``link`` and return its body parsed as HTML.

    link: URL passed to ``requests.get``.
    USERNAME, PASS: credentials wrapped in ``HTTPBasicAuth``; both default
        to the empty string.

    Returns a ``BeautifulSoup`` object built from the response text with the
    "html.parser" backend.
    """
    credentials = HTTPBasicAuth(USERNAME, PASS)
    pageText = requests.get(link, auth=credentials).text
    return BeautifulSoup(pageText, "html.parser")
def downloadFiles(URL,PATH,book_name=''):
    """Download ``URL`` into the directory ``PATH``, streaming the body to disk.

    URL: address of the file; requested with the module-level UNAME/PASSWORD
        as HTTP basic auth.
    PATH: existing directory to save into. As before, the process working
        directory is changed to PATH and is not restored here.
    book_name: name of the file to write; when empty, the last
        '/'-separated segment of URL is used.

    Returns None. A response with an error status is reported and skipped
    instead of being written to disk under the requested file name.
    """
    if not book_name:
        book_name = URL.split('/')[-1]
    os.chdir(PATH)
    print("Downloading: "+book_name+" ...")
    # stream=True lets the body be read chunk by chunk, so large files
    # (the menu warns the videos total several gigabytes) are never held
    # in memory in one piece.
    with requests.get(URL, auth=HTTPBasicAuth(UNAME, PASSWORD), stream=True) as response:
        if not response.ok:
            # Do not create the file at all: an HTML error page saved as
            # "something.pdf" looks like a successful download.
            print("Skipping "+book_name+": server answered HTTP "+str(response.status_code))
            return
        # The file is opened only after the request succeeded, so a failed
        # request no longer leaves an empty file behind.
        with open(book_name, 'wb') as book:
            for block in response.iter_content(64 * 1024):
                if block:
                    book.write(block)
def subFolderFileDownloader(URL,folderPath):
    """Download the files linked from the page at ``URL`` into ``folderPath``.

    URL: address of a page whose anchors point at the files; each matching
        href is appended directly to URL to form the download address.
    folderPath: destination directory, created if it does not exist.

    An anchor is downloaded when its href contains any of ".txt", ".pdf",
    ".cpp" or ".h".
    """
    # NOTE(review): this is a substring test, so ".h" also matches hrefs such
    # as "index.html" - confirm whether a strict extension match is wanted.
    wantedMarkers = (".txt", ".pdf", ".cpp", ".h")
    listingPage = returnBeautifiedObject(URL, UNAME, PASSWORD)
    # Created once up front instead of re-checking on every anchor.
    os.makedirs(folderPath, exist_ok=True)
    # href=True skips anchors that carry no href attribute; indexing those
    # with ['href'] raises KeyError.
    for link in listingPage.find_all('a', href=True):
        href = link['href']
        if any(marker in href for marker in wantedMarkers):
            downloadFiles(URL+href, folderPath)
def getWorksheets(beautifiedBaseObject):
    """Download every worksheet file into a "Worksheets" folder next to this script.

    beautifiedBaseObject: parsed course home page; its anchors are searched
        for a link whose resolved URL contains "Worksheets".

    Each href on the worksheets page containing ".pdf" or ".cpp" is
    downloaded into its own sub-folder named after the file (without
    extension). Returns None; prints a notice and returns early when the
    home page has no Worksheets link.
    """
    scriptDir = os.path.dirname(os.path.abspath(__file__))
    # os.path.join replaces the literal '\Worksheets', whose "\W" is an
    # invalid escape sequence.
    worksheetFolderPath = os.path.join(scriptDir, 'Worksheets')
    os.makedirs(worksheetFolderPath, exist_ok=True)

    worksheetURL = ''
    # href=True skips anchors without an href (indexing them raises KeyError).
    for link in beautifiedBaseObject.find_all('a', href=True):
        candidate = urljoin(baseURL, link['href'])
        if 'Worksheets' in candidate:
            # When several links match, the last one wins.
            worksheetURL = candidate
    if not worksheetURL:
        # requests.get('') would raise; report the situation instead.
        print("No Worksheets link found on the course page; nothing to download.")
        return

    # The worksheets page itself is requested without credentials.
    worksheetPage = BeautifulSoup(requests.get(worksheetURL).text, "html.parser")
    for link in worksheetPage.find_all('a', href=True):
        href = link['href']
        if ".pdf" in href or ".cpp" in href:
            completedWorksheetURL = worksheetURL + '/' + href
            book_name = completedWorksheetURL.split('/')[-1]
            # Insert 'orksheet' after the first character of the file name
            # (presumably turning e.g. "w01.pdf" into "worksheet01.pdf" -
            # confirm against the site's naming).
            book_name = book_name[:1] + 'orksheet' + book_name[1:]
            worksheetFilePath = os.path.join(worksheetFolderPath, book_name.split('.', 1)[0])
            os.makedirs(worksheetFilePath, exist_ok=True)
            downloadFiles(completedWorksheetURL, worksheetFilePath, book_name)
def getLabs(beautifiedBaseObject):
    """Download the lab files of every week into a "Labs" folder next to this script.

    beautifiedBaseObject: parsed course home page. Every anchor whose href
        contains "week" but not ".pdf" is treated as a week page, except the
        first such match, which is dropped.

    On each week page, anchors whose href contains both "lab" and "Weekly"
    are opened; files linked from there whose href contains ".pdf", ".cpp",
    ".h", "gdbinit" or ".txt" are saved under Labs/<week>, where <week> is
    the second-to-last '/'-separated segment of the week URL.
    """
    # NOTE(review): markers are matched as substrings, so ".h" also matches
    # hrefs such as "index.html" - confirm whether that is intended.
    labMarkers = (".pdf", ".cpp", ".h", "gdbinit", ".txt")
    scriptDir = os.path.dirname(os.path.abspath(__file__))
    # os.path.join replaces the literal '\Labs', whose "\L" is an invalid
    # escape sequence.
    labFolderpath = os.path.join(scriptDir, 'Labs')
    os.makedirs(labFolderpath, exist_ok=True)

    # href=True skips anchors without an href (indexing them raises KeyError).
    labBaseURLList = [
        urljoin(baseURL, link['href'])
        for link in beautifiedBaseObject.find_all('a', href=True)
        if 'week' in link['href'] and '.pdf' not in link['href']
    ]
    # The first match is the URL without a week number. Slicing drops it and,
    # unlike pop(0), does not raise when nothing matched at all.
    for URL in labBaseURLList[1:]:
        weekPage = returnBeautifiedObject(URL, UNAME, PASSWORD)
        labFilePath = os.path.join(labFolderpath, URL.split('/')[-2])
        for link in weekPage.find_all('a', href=True):
            labURL = link['href']
            if 'lab' in labURL and 'Weekly' in labURL:
                # labURL is requested as-is, so it is presumably an absolute
                # URL - TODO confirm against the site.
                labPage = returnBeautifiedObject(labURL, UNAME, PASSWORD)
                for labLink in labPage.find_all('a', href=True):
                    if any(marker in labLink['href'] for marker in labMarkers):
                        # The week folder is created only once there is
                        # something to put into it.
                        os.makedirs(labFilePath, exist_ok=True)
                        downloadFiles(labURL + "/" + labLink['href'], labFilePath)
def getReadings(beautifiedBaseObject):
    """Download the reading files of every week into a "Readings" folder next to this script.

    beautifiedBaseObject: parsed course home page. Every anchor whose href
        contains "week" but not ".pdf" is treated as a week page, except the
        first such match, which is dropped.

    On each week page, anchors whose href contains "reading" are opened;
    files linked from there whose href contains ".pdf", ".cpp", ".h" or
    ".txt" are saved under Readings/<week>, where <week> is the
    second-to-last '/'-separated segment of the week URL.
    """
    # NOTE(review): markers are matched as substrings, so ".h" also matches
    # hrefs such as "index.html" - confirm whether that is intended.
    readingMarkers = (".pdf", ".cpp", ".h", ".txt")
    scriptDir = os.path.dirname(os.path.abspath(__file__))
    readingFolderPath = os.path.join(scriptDir, 'Readings')
    os.makedirs(readingFolderPath, exist_ok=True)

    # href=True skips anchors without an href (indexing them raises KeyError).
    readingBaseURLList = [
        urljoin(baseURL, link['href'])
        for link in beautifiedBaseObject.find_all('a', href=True)
        if 'week' in link['href'] and '.pdf' not in link['href']
    ]
    # The first match is the URL without a week number. Slicing drops it and,
    # unlike pop(0), does not raise when nothing matched at all.
    for URL in readingBaseURLList[1:]:
        weekPage = returnBeautifiedObject(URL, UNAME, PASSWORD)
        readingFilePath = os.path.join(readingFolderPath, URL.split('/')[-2])
        for link in weekPage.find_all('a', href=True):
            readingURL = link['href']
            if 'reading' in readingURL:
                # readingURL is requested as-is, so it is presumably an
                # absolute URL - TODO confirm against the site.
                readingPage = returnBeautifiedObject(readingURL, UNAME, PASSWORD)
                for readingLink in readingPage.find_all('a', href=True):
                    if any(marker in readingLink['href'] for marker in readingMarkers):
                        # The week folder is created only once there is
                        # something to put into it.
                        os.makedirs(readingFilePath, exist_ok=True)
                        downloadFiles(readingURL + "/" + readingLink['href'], readingFilePath)
def getVideos(beautifiedBaseObject):
    """Download the lecture videos of every week into a "Videos" folder next to this script.

    beautifiedBaseObject: parsed course home page. Every anchor whose href
        contains "week" but not ".pdf" is treated as a week page, except the
        first such match, which is dropped.

    On each week page, anchors whose href contains both "video" and ".mp4"
    are downloaded into Videos/<week>, where <week> is the second-to-last
    '/'-separated segment of the week URL. The saved file name is derived
    from the anchor text, not from the URL.
    """
    scriptDir = os.path.dirname(os.path.abspath(__file__))
    videoFolderPath = os.path.join(scriptDir, 'Videos')
    os.makedirs(videoFolderPath, exist_ok=True)

    # href=True skips anchors without an href (indexing them raises KeyError).
    videoBaseURLList = [
        urljoin(baseURL, link['href'])
        for link in beautifiedBaseObject.find_all('a', href=True)
        if 'week' in link['href'] and '.pdf' not in link['href']
    ]
    # The first match is the URL without a week number. Slicing drops it and,
    # unlike pop(0), does not raise when nothing matched at all.
    for URL in videoBaseURLList[1:]:
        weekPage = returnBeautifiedObject(URL, UNAME, PASSWORD)
        for liLink in weekPage.find_all('a', href=True):
            href = liLink['href']
            if 'video' in href and '.mp4' in href:
                # Drops as many trailing characters as the index of the first
                # '/' in URL (5 for an "http://" URL); the result is then cut
                # back to its last '/' below.
                # NOTE(review): presumably meant to yield the parent of the
                # week directory; it depends on the scheme length - confirm.
                temp = URL[:-URL.index('/')]
                videoFilePath = os.path.join(videoFolderPath, URL.split('/')[-2])
                os.makedirs(videoFilePath, exist_ok=True)
                videoFileURL = temp[:temp.rfind('/')] + "/" + href
                # Build a file name from the link text: strip, remove
                # newlines/tabs, collapse runs of spaces, capitalize, then
                # drop every character other than letters, digits, space,
                # newline and '.'.
                bookName = liLink.text.strip() + ".mp4"
                bookName = bookName.replace('\n', '').replace('\t', '')
                bookName = re.sub(' +', ' ', bookName).capitalize()
                # Raw string: same pattern as before without the invalid
                # "\." escape of a regular string literal.
                bookName = re.sub(r'[^a-zA-Z0-9 \n.]', '', bookName)
                downloadFiles(videoFileURL, videoFilePath, bookName)
                print()
def getProjects(beautifiedBaseObject):
    """Download the project files of every week into a "Projects" folder next to this script.

    beautifiedBaseObject: parsed course home page. Every anchor whose href
        contains "week" but not ".pdf" is treated as a week page, except the
        first such match, which is dropped.

    On each week page, anchors whose href contains "project" but not "pdf"
    are opened. From there, hrefs containing ".txt", ".pdf", ".cpp" or ".h"
    are saved under Projects/<week>; hrefs containing "test" are handed to
    subFolderFileDownloader and saved under Projects/<week>/tests. <week> is
    the second-to-last '/'-separated segment of the week URL.
    """
    # NOTE(review): markers are matched as substrings, so ".h" also matches
    # hrefs such as "index.html" - confirm whether that is intended.
    projectMarkers = (".txt", ".pdf", ".cpp", ".h")
    scriptDir = os.path.dirname(os.path.abspath(__file__))
    projectFolderPath = os.path.join(scriptDir, 'Projects')
    os.makedirs(projectFolderPath, exist_ok=True)

    # href=True skips anchors without an href (indexing them raises KeyError).
    projectBaseURLList = [
        urljoin(baseURL, link['href'])
        for link in beautifiedBaseObject.find_all('a', href=True)
        if 'week' in link['href'] and '.pdf' not in link['href']
    ]
    # The first match is the URL without a week number. Slicing drops it and,
    # unlike pop(0), does not raise when nothing matched at all.
    for URL in projectBaseURLList[1:]:
        weekPage = returnBeautifiedObject(URL, UNAME, PASSWORD)
        projectFilePath = os.path.join(projectFolderPath, URL.split('/')[-2])
        for projectBaseLink in weekPage.find_all('a', href=True):
            projectURL = projectBaseLink['href']
            if 'project' in projectURL and 'pdf' not in projectURL:
                # projectURL is requested as-is, so it is presumably an
                # absolute URL - TODO confirm against the site.
                projectPage = returnBeautifiedObject(projectURL, UNAME, PASSWORD)
                for link in projectPage.find_all('a', href=True):
                    href = link['href']
                    if any(marker in href for marker in projectMarkers):
                        # The week folder is created only once there is
                        # something to put into it.
                        os.makedirs(projectFilePath, exist_ok=True)
                        downloadFiles(projectURL + "/" + href, projectFilePath)
                    if 'test' in href:  # for getting the test files
                        testFilePath = os.path.join(projectFilePath, "tests")
                        os.makedirs(testFilePath, exist_ok=True)
                        print()
                        subFolderFileDownloader(projectURL + "/" + href, testFilePath)
                        print()
def menu():
    """Print the introduction line and the six menu choices (1-5 and q)."""
    # Every entry already ends in "\n", so each one is followed by a blank
    # line when printed.
    entries = (
        "The list of choices are as follows. Choose the ones you like one by one. Don't worry you will get future chances (just as in life!!): \n",
        "1. All the worksheets!\n",
        "2. All the Lab files!\n",
        "3. All the Lecture slides!\n",
        "4. All the videos! (be carefull the total size is over 8 gigs!)\n",
        "5. All the Project Files!\n",
        "q. Nothing fam. Just get me out of here!\n",
    )
    for entry in entries:
        print(entry)
def main():
    """Run the interactive menu loop until the user enters 'q'.

    The course home page is fetched once, without credentials, and handed to
    whichever downloader the user picks. After each batch the working
    directory is reset to basePath. On 'q' the user may choose to delete
    this script file before the program exits.
    """
    # choice -> (announcement, downloader, completion message)
    downloads = {
        '1': ("Alright lets get those worksheets!!!\n",
              getWorksheets,
              'Alright the worsksheets are downloaded in the \"Worksheets\" folder. See if you want anything else!\n'),
        '2': ("Alright lets get those lab files!!!\n",
              getLabs,
              'Alright the Lab Files are downloaded in the \"Labs\" folder. See if you want anything else!\n'),
        '3': ("Alright lets get those lecture slides!!!\n",
              getReadings,
              'Alright the Leture Notes are downloaded in the \"Readings\" folder. See if you want anything else!\n'),
        '4': ("Alright lets get those videos!!!. Again this will take a lot of time. Professor Punch used a good camera lol\n",
              getVideos,
              'Alright the Videos are downloaded in the \"Videos\" folder. See if you want anything else!'),
        '5': ("Alright lets get those project files!!!\n",
              getProjects,
              'Alright Project Files are downloaded in the \"Projects\" folder. See if you want anything else!\n'),
    }

    print("Welcome to CSE232 Scraper!!!!!!\n")
    menu()
    userInput = input("So what do you want (not in life. What do you want right now!!!): ")
    print()
    beautifiedBaseObject = returnBeautifiedObject(baseURL)
    while True:
        if userInput == 'q':
            print('\n\n')
            print("Alright fam! Take care! Be happy! Live long and prosper!!!!!!!!!!!!!")
            print('\n')
            inp = input("BTW do you want to delete this scraper file so that your credentials don't get into wrong hands???? (y/n): ")
            if inp == 'y':
                # Anchor the script name to basePath: the bare basename only
                # resolved correctly when the working directory happened to
                # be the script's folder (e.g. not when quitting right away
                # after launching from elsewhere).
                os.remove(os.path.join(basePath, os.path.basename(sys.argv[0])))
            print('\n\n')
            break

        if userInput in downloads:
            startMessage, downloader, doneMessage = downloads[userInput]
            print(startMessage)
            downloader(beautifiedBaseObject)
            # downloadFiles changes the working directory; go back home.
            os.chdir(basePath)
            print()
            print(doneMessage)
            menu()
        else:
            print("Seriously fam, Seriously. Pick one of the choices will ya!!\n")
            os.chdir(basePath)
            menu()
            print("Let's try this again\n")
        userInput = input("Do you need anything else????: ")
        print('\n\n')
if __name__ == "__main__":
main()