Version 1 for CSE 260 scraper

yash1337 · Jun 27, 2017 · 930567e · 930567e
1 parent 4847687
commit 930567e
Showing 1 changed file with 157 additions and 41 deletions.
diff --git a/src/CSE232/ScraperV1.py b/src/CSE232/ScraperV1.py
@@ -1,63 +1,179 @@
+#############################
+##Project Site "Cyclone" Scraper v2
+##This thing scrapes everything from the CSE 232 site: Videos, Worksheets, Lecture Notes, and Project Files
+##You need to supply your CSE user name and Password 
+#############################
+
+################################################################################################################
+
+###Checking dependencies
+
 import os
-#installing the BeautifulSoup and requests modules
-os.system('pip install beautifulsoup4')
-os.system('pip install requests')
-print()
+try:
+	import requests 
+	print ("Request library is good!! ")
+	print()
+except:
+	print("Request Library not found. Trying to install it. If not successfull then try it manually")
+	os.system('pip install requests')
+
+try:
+	print("Importing other libraries")
+	from requests.auth import HTTPBasicAuth
+	from urllib.parse import urljoin
+	import msvcrt as m
+	print()
+except:
+	print("Just get those libraries man!!!")
+
+try:
+	from bs4 import BeautifulSoup
+	print("BeautifulSoup library is good!! ")
+	print()
+except:
+	print("BeautifulSoup Library not found. Trying to install it. If not successfull then try it manually")
+	os.system('pip install beautifulsoup4')
+
+import re
 import requests 
 from bs4 import BeautifulSoup #getting the beasutiful soup library
 from requests.auth import HTTPBasicAuth
 from urllib.parse import urljoin
+import sys
 import msvcrt as m
+import posixpath
+import urllib.parse
+
 
+################################################################################################################
 
-################################################
-baseURL=""
-USERNAME=""
-PASSWORD=""
-################################################
-
-
-#getting the HTML file using the request object
-requestObject=requests.get(baseURL,auth=HTTPBasicAuth(USERNAME,PASSWORD)) 
+################################################################################################################
+UNAME="cse260"
+PASSWORD="dmig4me"
+################################################################################################################
 
-#"Beautifying" the HTML file using the html parser
-soup=BeautifulSoup(requestObject.text,"html.parser")	
+################################################################################################################
 
-#variables to hold the links
-links=[]
-small_link=[]
+###Actual Code
 
-#getting all the <a> HTML tags
-for link in soup.find_all('a'):
-	small_link.append(link['href'])			#getting the link without the base URL
-	links.append(urljoin(baseURL,link['href']))   #joining the base and the relative URL 
+baseURL='http://www.cse.msu.edu/~cse260/cse260cn/'
+basePath=os.path.dirname(os.path.abspath(__file__))
 
+def returnBeautifiedObject(link,USERNAME='',PASS=''):
+	requestObject=requests.get(link,auth=HTTPBasicAuth(USERNAME,PASS))
+	return BeautifulSoup(requestObject.text,"html.parser")
 
-#getting the links that we are intrested in i.e. .txt .pdf and .cpp
-correctRelativeURL=[]
-correctCompletedURL=[]
-for item in small_link:
-	if ".txt" in item or ".pdf" in item or ".cpp" in item:
-		correctRelativeURL.append(item)
+def downloadFiles(URL,PATH,book_name=''):
+	if book_name=='':
+		book_name=URL.split('/')[-1]
 
-for item in links:
-	if ".txt" in item or ".pdf" in item or ".cpp" in item or ".h" in item:
-		correctCompletedURL.append(item)
-
-#looping through the links, downloading files
-for link in correctCompletedURL:
-	book_name = link.split('/')[-1]
+	os.chdir(PATH)
+
 	print("Downloading: "+book_name+" ...")
 	with open(book_name, 'wb') as book:
-		a = requests.get(link,auth=HTTPBasicAuth(USERNAME,PASSWORD))
+		a = requests.get(URL,auth=HTTPBasicAuth(UNAME,PASSWORD))
 		for block in a.iter_content(512):
 			if not block:
 				break
 			book.write(block)
 
+def getLectures(beautifiedBaseObject):
+	LectureBaseURLList=[]
+	LectureFolderPath=os.path.dirname(os.path.abspath(__file__))+'\\Lecture'
+
+	if not os.path.exists(LectureFolderPath):
+		os.makedirs(LectureFolderPath) 
+
+	for link in beautifiedBaseObject.find_all('a'):
+		if 'lecture' in link['href'] and "Sample" not in link['href']:
+			LectureBaseURLList.append(urljoin(baseURL,link['href']))
+
+	for URL in LectureBaseURLList:
+		downloadFiles(URL,LectureFolderPath)
+
+def getSampleTests(beautifiedBaseObject):
+	SamTestsBaseURLList=[]
+	SamTestsFolderPath=os.path.dirname(os.path.abspath(__file__))+'\\Sample Tests'
+
+	if not os.path.exists(SamTestsFolderPath):
+		os.makedirs(SamTestsFolderPath) 
+
+	for link in beautifiedBaseObject.find_all('a'):
+		if 'lecture' in link['href'] and "Sample" in link['href']:
+			SamTestsBaseURLList.append(urljoin(baseURL,link['href']))
+
+	for URL in SamTestsBaseURLList:
+		downloadFiles(URL,SamTestsFolderPath)
+
+
+def getHomework(beautifiedBaseObject):
+	HomeworkBaseURLList=[]
+	HomeworkFolderPath=os.path.dirname(os.path.abspath(__file__))+'\\Homework Assignments'
+
+	if not os.path.exists(HomeworkFolderPath):
+		os.makedirs(HomeworkFolderPath) 
+
+	for link in beautifiedBaseObject.find_all('a'):
+		if 'homework' in link['href']:
+			HomeworkBaseURLList.append(urljoin(baseURL,link['href']))
+
+	for URL in HomeworkBaseURLList:
+		downloadFiles(URL,HomeworkFolderPath)
+
+def menu():
+	print("The list of choices are as follows. Choose the ones you like one by one. Don't worry you will get future chances (just as in life!! :-)): \n")
+	print("1. All the lecture notes!\n")
+	print("2. All the Sample test files!\n")
+	print("3. All the Homework files!\n")
+	print("q. Nothing fam. Just get me out of here!\n")
+
+def main():
+	print("Welcome to CSE232 Scraper!!!!!!\n")
+	menu()	
+	userInput=(input("So what do you want (not in life. What do you want right now!!!): "))
+	print()
+	beautifiedBaseObject=returnBeautifiedObject(baseURL,UNAME,PASSWORD)
+	while(True):
+		if(userInput=='1'):
+			print("Alright lets get those Lecture files!!!\n")
+			getLectures(beautifiedBaseObject)
+			os.chdir(basePath)
+			print()
+			print('Alright the lecture notes are downloaded in the \"Lecture\" folder. See if you want anything else!\n' )
+			menu()
+			userInput=(input("Do you need anything else????: "))
+			print('\n\n')
+		elif(userInput=='2'):
+			print("Alright lets get those Sample test files!!!\n")
+			getSampleTests(beautifiedBaseObject)
+			os.chdir(basePath)
+			print()
+			print('Alright the Sample test files are downloaded in the \"Sample Tests\" folder. See if you want anything else!\n' )
+			menu()
+			userInput=(input("Do you need anything else????: "))
+			print('\n\n')
+		elif(userInput=='3'):
+			print("Alright lets get those Homework files!!!\n")
+			getHomework(beautifiedBaseObject)
+			os.chdir(basePath)
+			print()
+			print('Alright the Homework files are downloaded in the \"Homework Assignments\" folder. See if you want anything else!\n' )
+			menu()
+			userInput=(input("Do you need anything else????: "))
+			print('\n\n')
+		elif(userInput=='q'):
+			print('\n\n')
+			print("Alright fam! Take care! Be happy! Live long and prosper!!!!!!!!!!!!!")		
+			print('\n\n')
+			break
+		else:
+			print("Seriously fam, Seriously. Pick one of the choices will ya!!\n")
+			os.chdir(basePath)
+			menu()
+			print("Let's try this again\n")
+			userInput=(input("Do you need anything else????: "))
+			print('\n\n')
+
 
-#These lines delete the scraper file. Uncomment them if you don't want the file to be deleted			
-#os.remove(sys.argv[0])
-#print("Scraper File Removed!!!!")
-print("\nPress a key to continue...")
-m.getch()
+if __name__ == "__main__":
+    main()