version two of the scraper

yash1337 · Apr 2, 2017 · 65dfd69 · 65dfd69
1 parent 992cc5a
commit 65dfd69
Showing 1 changed file with 346 additions and 0 deletions.
diff --git a/src/ScraperV2.py b/src/ScraperV2.py
@@ -0,0 +1,346 @@
+#############################
+##Project Site 'Cyclone" Scraper v2
+##This thing Scrapes everthing from the CSE 232 site Videos, Worksheets, Lecture Notes, and Project Files
+##You need to supply your CSE user name and Password 
+#############################
+
+################################################################################################################
+
+###Checking dependensies
+
+import os
+try:
+	import requests 
+	print ("Request library is good!! ")
+	print()
+except:
+	print("Request Library not found. Trying to install it. If not successfull then try it manually")
+	os.system('pip install requests')
+
+try:
+	print("Importing other libraries")
+	from requests.auth import HTTPBasicAuth
+	from urllib.parse import urljoin
+	import msvcrt as m
+	print()
+except:
+	print("Just get those libraries man!!!")
+
+try:
+	from bs4 import BeautifulSoup
+	print("BeautifulSoup library is good!! ")
+	print()
+except:
+	print("BeautifulSoup Library not found. Trying to install it. If not successfull then try it manually")
+	os.system('pip install beautifulsoup4')
+
+import re
+import requests 
+from bs4 import BeautifulSoup #getting the beasutiful soup library
+from requests.auth import HTTPBasicAuth
+from urllib.parse import urljoin
+import sys
+import msvcrt as m
+import posixpath
+import urllib.parse
+
+
+################################################################################################################
+
+################################################################################################################
+UNAME=""
+PASSWORD=""
+################################################################################################################
+
+################################################################################################################
+
+###Actual Code
+
+baseURL='http://www.cse.msu.edu/~cse232/'
+basePath=os.path.dirname(os.path.abspath(__file__))
+
+def returnBeautifiedObject(link,USERNAME='',PASS=''):
+	requestObject=requests.get(link,auth=HTTPBasicAuth(USERNAME,PASS))
+	return BeautifulSoup(requestObject.text,"html.parser")
+
+def downloadFiles(URL,PATH,book_name=''):
+	if book_name=='':
+		book_name=URL.split('/')[-1]
+
+	os.chdir(PATH)
+
+	print("Downloading: "+book_name+" ...")
+	with open(book_name, 'wb') as book:
+		a = requests.get(URL,auth=HTTPBasicAuth(UNAME,PASSWORD))
+		for block in a.iter_content(512):
+			if not block:
+				break
+			book.write(block)
+
+def subFolderFileDownloader(URL,folderPath):	
+	tempBeautifiedObject=returnBeautifiedObject(URL,UNAME,PASSWORD)
+	for link in tempBeautifiedObject.find_all('a'):
+		if not os.path.exists(folderPath):
+			os.makedirs(folderPath)
+		if ".txt" in link['href'] or ".pdf" in link['href'] or ".cpp" in link['href'] or ".h" in link['href']:
+			downloadFiles(URL+link['href'],folderPath)	
+
+def getWorksheets(beautifiedBaseObject):
+	worksheetURL=''
+	worksheetFolderPath=os.path.dirname(os.path.abspath(__file__))+'\Worksheets'
+	if not os.path.exists(worksheetFolderPath):
+		os.makedirs(worksheetFolderPath)
+
+	for link in beautifiedBaseObject.find_all('a'):
+		if 'Worksheets' in urljoin(baseURL,link['href']):
+			worksheetURL=urljoin(baseURL,link['href'])
+	worksheetHTMLObject=requests.get(worksheetURL)
+	completedWorksheetURL=''
+	beautifiedWorksheetHTMLObject=BeautifulSoup(worksheetHTMLObject.text,"html.parser")	
+
+	for link in beautifiedWorksheetHTMLObject.find_all('a',href=True):
+		if (".pdf" in link['href'] or ".cpp" in link['href']):
+			completedWorksheetURL=worksheetURL+'/'+link['href']
+			book_name = completedWorksheetURL.split('/')[-1]
+			book_name=book_name[:1]+'orksheet'+book_name[1:]
+			worksheetName=book_name
+			worksheetFilePath=worksheetFolderPath+"\\"+book_name.split('.',1)[0]	
+			if not os.path.exists(worksheetFilePath):
+				os.makedirs(worksheetFilePath)
+			downloadFiles(completedWorksheetURL,worksheetFilePath,book_name)
+
+
+def getLabs(beautifiedBaseObject):
+	labBaseURLList=[]	
+	labFolderpath=os.path.dirname(os.path.abspath(__file__))+'\Labs'
+	if not os.path.exists(labFolderpath):
+		os.makedirs(labFolderpath)  
+
+	for link in beautifiedBaseObject.find_all('a'):
+		if 'week' in link['href'] and '.pdf' not in link['href']:
+			labBaseURLList.append(urljoin(baseURL,link['href']))
+	labBaseURLList.pop(0)	#deleting the URL without week number
+	tempBeautifiedObject=None
+	beautifiedLabObject=None
+
+	for URL in labBaseURLList:
+		tempBeautifiedObject=returnBeautifiedObject(URL,UNAME,PASSWORD)
+		for link in tempBeautifiedObject.find_all('a'):
+			if ('lab' in link['href'] and 'Weekly' in link['href']):
+				beautifiedLabObject=returnBeautifiedObject(link['href'],UNAME,PASSWORD)
+				for labLink in beautifiedLabObject.find_all('a'):
+					if (".pdf" in labLink['href'] or ".cpp" in labLink['href'] or ".h" in labLink['href'] or "gdbinit" in labLink['href'] or '.txt' in labLink['href']):
+						labFileURL=link['href']+"/"+ labLink['href']
+						if not os.path.exists(labFolderpath+"\\"+URL.split('/')[-2]):
+							os.makedirs(labFolderpath+"\\"+URL.split('/')[-2])
+						labFilePath=labFolderpath+"\\"+URL.split('/')[-2]
+						downloadFiles(labFileURL,labFilePath)
+
+
+def getReadings(beautifiedBaseObject):
+	readingBaseURLList=[]
+	readingFolderPath=os.path.dirname(os.path.abspath(__file__))+'\\Readings'
+	if not os.path.exists(readingFolderPath):
+		os.makedirs(readingFolderPath) 
+
+	for link in beautifiedBaseObject.find_all('a'):
+		if 'week' in link['href'] and '.pdf' not in link['href']:
+			readingBaseURLList.append(urljoin(baseURL,link['href']))
+	readingBaseURLList.pop(0)	#deleting the URL without week number
+	tempBeautifiedObject=None
+	beautifiedReadingObject=None
+
+	for URL in readingBaseURLList:
+		tempBeautifiedObject=returnBeautifiedObject(URL,UNAME,PASSWORD)
+		for link in tempBeautifiedObject.find_all('a'):
+			if ('reading' in link['href']):
+				beautifiedReadingObject=returnBeautifiedObject(link['href'],UNAME,PASSWORD)
+				for readingLink in beautifiedReadingObject.find_all('a'):
+					if (".pdf" in readingLink['href'] or ".cpp" in readingLink['href'] or ".h" in readingLink['href'] or '.txt' in readingLink['href']):
+						readingFileURL=link['href']+"/"+readingLink['href']
+						if not os.path.exists(readingFolderPath+"\\"+URL.split('/')[-2]):
+							os.makedirs(readingFolderPath+'\\'+URL.split('/')[-2])
+						readingFilePath=readingFolderPath+'\\'+URL.split('/')[-2]
+						downloadFiles(readingFileURL,readingFilePath)
+
+
+def getVideos(beautifiedBaseObject):
+	videoBaseURLList=[]
+	videoFolderPath=os.path.dirname(os.path.abspath(__file__))+'\\Videos'
+	if not os.path.exists(videoFolderPath):
+		os.makedirs(videoFolderPath)
+
+	for link in beautifiedBaseObject.find_all('a'):
+		if 'week' in link['href'] and '.pdf' not in link['href']:
+			videoBaseURLList.append(urljoin(baseURL,link['href']))
+	videoBaseURLList.pop(0)	#deleting the URL without week number
+	tempBeautifiedObject=None
+	beautifiedVideoObject=None
+
+	for URL in videoBaseURLList:
+		tempBeautifiedObject=returnBeautifiedObject(URL,UNAME,PASSWORD)
+		for liLink in tempBeautifiedObject.find_all('a'):
+			if ('video' in liLink['href'] and '.mp4' in liLink['href']):
+				temp=URL[:-URL.index('/')]
+				if not os.path.exists(videoFolderPath+"\\"+URL.split('/')[-2]):
+					os.makedirs(videoFolderPath+"\\"+URL.split('/')[-2])
+				videoFilePath=videoFolderPath+"\\"+URL.split('/')[-2]
+				videoFileURL=temp[:temp.rfind('/')]+"/"+liLink['href']
+				bookName=liLink.text.strip()+".mp4"
+				bookName=bookName.replace('\n','').replace('\t','')
+				bookName=re.sub(' +',' ',bookName).capitalize()
+				bookName=re.sub('[^a-zA-Z0-9 \n\.]', '', bookName)
+				downloadFiles(videoFileURL,videoFilePath,bookName)
+				print()
+
+
+def getProjects(beautifiedBaseObject):
+	projectBaseURLList=[]
+	projectFolderPath=os.path.dirname(os.path.abspath(__file__))+'\\Projects'
+	if not os.path.exists(projectFolderPath):
+		os.makedirs(projectFolderPath)
+
+	for link in beautifiedBaseObject.find_all('a'):
+		if 'week' in link['href'] and '.pdf' not in link['href']:
+			projectBaseURLList.append(urljoin(baseURL,link['href']))
+	projectBaseURLList.pop(0)	#deleting the URL without week number
+	tempBeautifiedObject=None
+	beautifiedProjectObject=None
+
+	for URL in projectBaseURLList:
+		tempBeautifiedObject=returnBeautifiedObject(URL,UNAME,PASSWORD)
+		for projectBaseLink in tempBeautifiedObject.find_all('a'):
+			if 'project' in projectBaseLink['href'] and 'pdf' not in projectBaseLink['href']:
+				projectBeautifiedObject=returnBeautifiedObject(projectBaseLink['href'],UNAME,PASSWORD)
+				for link in projectBeautifiedObject.find_all('a'):
+					if ".txt" in link['href'] or ".pdf" in link['href'] or ".cpp" in link['href'] or ".h" in link['href']:
+						if not os.path.exists(projectFolderPath+"\\"+URL.split('/')[-2]):
+							os.makedirs(projectFolderPath+"\\"+URL.split('/')[-2])
+						projectFilePath=projectFolderPath+"\\"+URL.split('/')[-2]
+						projectFileURL=projectBaseLink['href']+"/"+link['href']
+						downloadFiles(projectFileURL,projectFilePath)
+					if 'test' in link['href']:  #for getting the test files
+						if not os.path.exists(projectFolderPath+"\\"+URL.split('/')[-2]+"\\"+"tests"):
+							os.makedirs(projectFolderPath+"\\"+URL.split('/')[-2]+"\\"+"tests")
+						testFilePath=projectFolderPath+"\\"+URL.split('/')[-2]+"\\"+"tests"
+						testFileURL=projectBaseLink['href']+"/"+link['href']
+						print()
+						subFolderFileDownloader(testFileURL,testFilePath)
+				print()
+
+def menu():
+	print("The list of choices are as follows. Choose the ones you like one by one. Don't worry you will get future chances (just as in life!!): \n")
+	print("1. All the worksheets!\n")
+	print("2. All the Lab files!\n")
+	print("3. All the Lecture slides!\n")
+	print("4. All the videos! (be carefull the total size is over 8 gigs!)\n")
+	print("5. All the Project Files!\n")
+	print("q. Nothing fam. Just get me out of here!\n")
+
+def main():
+	print("Welcome to CSE232 Scraper!!!!!!\n")
+	menu()	
+	userInput=(input("So what do you want (not in life. What do you want right now!!!): "))
+	print()
+	beautifiedBaseObject=returnBeautifiedObject(baseURL)
+	while(True):
+		if(userInput=='1'):
+			print("Alright lets get those worksheets!!!\n")
+			getWorksheets(beautifiedBaseObject)
+			os.chdir(basePath)
+			print()
+			print('Alright the worsksheets are downloaded in the \"Worksheets\" folder. See if you want anything else!\n' )
+			menu()
+			userInput=(input("Do you need anything else????: "))
+			print('\n\n')
+		elif(userInput=='2'):
+			print("Alright lets get those lab files!!!\n")
+			getLabs(beautifiedBaseObject)
+			os.chdir(basePath)
+			print()
+			print('Alright the Lab Files are downloaded in the \"Labs\" folder. See if you want anything else!\n' )
+			menu()
+			userInput=(input("Do you need anything else????: "))
+			print('\n\n')
+		elif(userInput=='3'):
+			print("Alright lets get those lecture slides!!!\n")
+			getReadings(beautifiedBaseObject)
+			os.chdir(basePath)
+			print()
+			print('Alright the Leture Notes are downloaded in the \"Readings\" folder. See if you want anything else!\n' )
+			menu()
+			userInput=(input("Do you need anything else????: "))
+			print('\n\n')
+		elif(userInput=='4'):
+			print("Alright lets get those videos!!!. Again this will take a lot of time. Professor Punch used a good camera lol\n")
+			getVideos(beautifiedBaseObject)
+			os.chdir(basePath)
+			print()
+			print('Alright the Videos are downloaded in the \"Videos\" folder. See if you want anything else!' )
+			menu()
+			userInput=(input("Do you need anything else????: "))
+			print('\n\n')
+		elif(userInput=='5'):
+			print("Alright lets get those project files!!!\n")
+			getProjects(beautifiedBaseObject)
+			os.chdir(basePath)
+			print()
+			print('Alright Project Files are downloaded in the \"Projects\" folder. See if you want anything else!\n' )
+			menu()
+			userInput=(input("Do you need anything else????: "))
+			print('\n\n')
+		elif(userInput=='q'):
+			print('\n\n')
+			print("Alright fam! Take care! Be happy! Live long and prosper!!!!!!!!!!!!!")
+			print('\n')
+			inp=input("BTW do you want to delete this scraper file so that your credentials don't get into wrong hands???? (y/n): ")
+			if inp=='y':
+				os.remove(os.path.basename(sys.argv[0]))
+			print('\n\n')
+			break
+		else:
+			print("Seriously fam, Seriously. Pick one of the choices will ya!!\n")
+			os.chdir(basePath)
+			menu()
+			print("Let's try this again\n")
+			userInput=(input("Do you need anything else????: "))
+			print('\n\n')
+
+
+if __name__ == "__main__":
+    main()
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+