Skip to content

Commit

Permalink
version two of the scraper
Browse files Browse the repository at this point in the history
  • Loading branch information
yash1337 authored Apr 2, 2017
1 parent 992cc5a commit 65dfd69
Showing 1 changed file with 346 additions and 0 deletions.
346 changes: 346 additions & 0 deletions src/ScraperV2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,346 @@
#############################
##Project Site 'Cyclone" Scraper v2
##This thing Scrapes everthing from the CSE 232 site Videos, Worksheets, Lecture Notes, and Project Files
##You need to supply your CSE user name and Password
#############################

################################################################################################################

###Checking dependensies

import os
try:
import requests
print ("Request library is good!! ")
print()
except:
print("Request Library not found. Trying to install it. If not successfull then try it manually")
os.system('pip install requests')

try:
print("Importing other libraries")
from requests.auth import HTTPBasicAuth
from urllib.parse import urljoin
import msvcrt as m
print()
except:
print("Just get those libraries man!!!")

try:
from bs4 import BeautifulSoup
print("BeautifulSoup library is good!! ")
print()
except:
print("BeautifulSoup Library not found. Trying to install it. If not successfull then try it manually")
os.system('pip install beautifulsoup4')

import re
import requests
from bs4 import BeautifulSoup #getting the beasutiful soup library
from requests.auth import HTTPBasicAuth
from urllib.parse import urljoin
import sys
import msvcrt as m
import posixpath
import urllib.parse


################################################################################################################

################################################################################################################
UNAME=""
PASSWORD=""
################################################################################################################

################################################################################################################

###Actual Code

baseURL='http://www.cse.msu.edu/~cse232/'
basePath=os.path.dirname(os.path.abspath(__file__))

def returnBeautifiedObject(link,USERNAME='',PASS=''):
requestObject=requests.get(link,auth=HTTPBasicAuth(USERNAME,PASS))
return BeautifulSoup(requestObject.text,"html.parser")

def downloadFiles(URL,PATH,book_name=''):
if book_name=='':
book_name=URL.split('/')[-1]

os.chdir(PATH)

print("Downloading: "+book_name+" ...")
with open(book_name, 'wb') as book:
a = requests.get(URL,auth=HTTPBasicAuth(UNAME,PASSWORD))
for block in a.iter_content(512):
if not block:
break
book.write(block)

def subFolderFileDownloader(URL,folderPath):
tempBeautifiedObject=returnBeautifiedObject(URL,UNAME,PASSWORD)
for link in tempBeautifiedObject.find_all('a'):
if not os.path.exists(folderPath):
os.makedirs(folderPath)
if ".txt" in link['href'] or ".pdf" in link['href'] or ".cpp" in link['href'] or ".h" in link['href']:
downloadFiles(URL+link['href'],folderPath)

def getWorksheets(beautifiedBaseObject):
worksheetURL=''
worksheetFolderPath=os.path.dirname(os.path.abspath(__file__))+'\Worksheets'
if not os.path.exists(worksheetFolderPath):
os.makedirs(worksheetFolderPath)

for link in beautifiedBaseObject.find_all('a'):
if 'Worksheets' in urljoin(baseURL,link['href']):
worksheetURL=urljoin(baseURL,link['href'])
worksheetHTMLObject=requests.get(worksheetURL)
completedWorksheetURL=''
beautifiedWorksheetHTMLObject=BeautifulSoup(worksheetHTMLObject.text,"html.parser")

for link in beautifiedWorksheetHTMLObject.find_all('a',href=True):
if (".pdf" in link['href'] or ".cpp" in link['href']):
completedWorksheetURL=worksheetURL+'/'+link['href']
book_name = completedWorksheetURL.split('/')[-1]
book_name=book_name[:1]+'orksheet'+book_name[1:]
worksheetName=book_name
worksheetFilePath=worksheetFolderPath+"\\"+book_name.split('.',1)[0]
if not os.path.exists(worksheetFilePath):
os.makedirs(worksheetFilePath)
downloadFiles(completedWorksheetURL,worksheetFilePath,book_name)


def getLabs(beautifiedBaseObject):
labBaseURLList=[]
labFolderpath=os.path.dirname(os.path.abspath(__file__))+'\Labs'
if not os.path.exists(labFolderpath):
os.makedirs(labFolderpath)

for link in beautifiedBaseObject.find_all('a'):
if 'week' in link['href'] and '.pdf' not in link['href']:
labBaseURLList.append(urljoin(baseURL,link['href']))
labBaseURLList.pop(0) #deleting the URL without week number
tempBeautifiedObject=None
beautifiedLabObject=None

for URL in labBaseURLList:
tempBeautifiedObject=returnBeautifiedObject(URL,UNAME,PASSWORD)
for link in tempBeautifiedObject.find_all('a'):
if ('lab' in link['href'] and 'Weekly' in link['href']):
beautifiedLabObject=returnBeautifiedObject(link['href'],UNAME,PASSWORD)
for labLink in beautifiedLabObject.find_all('a'):
if (".pdf" in labLink['href'] or ".cpp" in labLink['href'] or ".h" in labLink['href'] or "gdbinit" in labLink['href'] or '.txt' in labLink['href']):
labFileURL=link['href']+"/"+ labLink['href']
if not os.path.exists(labFolderpath+"\\"+URL.split('/')[-2]):
os.makedirs(labFolderpath+"\\"+URL.split('/')[-2])
labFilePath=labFolderpath+"\\"+URL.split('/')[-2]
downloadFiles(labFileURL,labFilePath)


def getReadings(beautifiedBaseObject):
readingBaseURLList=[]
readingFolderPath=os.path.dirname(os.path.abspath(__file__))+'\\Readings'
if not os.path.exists(readingFolderPath):
os.makedirs(readingFolderPath)

for link in beautifiedBaseObject.find_all('a'):
if 'week' in link['href'] and '.pdf' not in link['href']:
readingBaseURLList.append(urljoin(baseURL,link['href']))
readingBaseURLList.pop(0) #deleting the URL without week number
tempBeautifiedObject=None
beautifiedReadingObject=None

for URL in readingBaseURLList:
tempBeautifiedObject=returnBeautifiedObject(URL,UNAME,PASSWORD)
for link in tempBeautifiedObject.find_all('a'):
if ('reading' in link['href']):
beautifiedReadingObject=returnBeautifiedObject(link['href'],UNAME,PASSWORD)
for readingLink in beautifiedReadingObject.find_all('a'):
if (".pdf" in readingLink['href'] or ".cpp" in readingLink['href'] or ".h" in readingLink['href'] or '.txt' in readingLink['href']):
readingFileURL=link['href']+"/"+readingLink['href']
if not os.path.exists(readingFolderPath+"\\"+URL.split('/')[-2]):
os.makedirs(readingFolderPath+'\\'+URL.split('/')[-2])
readingFilePath=readingFolderPath+'\\'+URL.split('/')[-2]
downloadFiles(readingFileURL,readingFilePath)


def getVideos(beautifiedBaseObject):
videoBaseURLList=[]
videoFolderPath=os.path.dirname(os.path.abspath(__file__))+'\\Videos'
if not os.path.exists(videoFolderPath):
os.makedirs(videoFolderPath)

for link in beautifiedBaseObject.find_all('a'):
if 'week' in link['href'] and '.pdf' not in link['href']:
videoBaseURLList.append(urljoin(baseURL,link['href']))
videoBaseURLList.pop(0) #deleting the URL without week number
tempBeautifiedObject=None
beautifiedVideoObject=None

for URL in videoBaseURLList:
tempBeautifiedObject=returnBeautifiedObject(URL,UNAME,PASSWORD)
for liLink in tempBeautifiedObject.find_all('a'):
if ('video' in liLink['href'] and '.mp4' in liLink['href']):
temp=URL[:-URL.index('/')]
if not os.path.exists(videoFolderPath+"\\"+URL.split('/')[-2]):
os.makedirs(videoFolderPath+"\\"+URL.split('/')[-2])
videoFilePath=videoFolderPath+"\\"+URL.split('/')[-2]
videoFileURL=temp[:temp.rfind('/')]+"/"+liLink['href']
bookName=liLink.text.strip()+".mp4"
bookName=bookName.replace('\n','').replace('\t','')
bookName=re.sub(' +',' ',bookName).capitalize()
bookName=re.sub('[^a-zA-Z0-9 \n\.]', '', bookName)
downloadFiles(videoFileURL,videoFilePath,bookName)
print()


def getProjects(beautifiedBaseObject):
projectBaseURLList=[]
projectFolderPath=os.path.dirname(os.path.abspath(__file__))+'\\Projects'
if not os.path.exists(projectFolderPath):
os.makedirs(projectFolderPath)

for link in beautifiedBaseObject.find_all('a'):
if 'week' in link['href'] and '.pdf' not in link['href']:
projectBaseURLList.append(urljoin(baseURL,link['href']))
projectBaseURLList.pop(0) #deleting the URL without week number
tempBeautifiedObject=None
beautifiedProjectObject=None

for URL in projectBaseURLList:
tempBeautifiedObject=returnBeautifiedObject(URL,UNAME,PASSWORD)
for projectBaseLink in tempBeautifiedObject.find_all('a'):
if 'project' in projectBaseLink['href'] and 'pdf' not in projectBaseLink['href']:
projectBeautifiedObject=returnBeautifiedObject(projectBaseLink['href'],UNAME,PASSWORD)
for link in projectBeautifiedObject.find_all('a'):
if ".txt" in link['href'] or ".pdf" in link['href'] or ".cpp" in link['href'] or ".h" in link['href']:
if not os.path.exists(projectFolderPath+"\\"+URL.split('/')[-2]):
os.makedirs(projectFolderPath+"\\"+URL.split('/')[-2])
projectFilePath=projectFolderPath+"\\"+URL.split('/')[-2]
projectFileURL=projectBaseLink['href']+"/"+link['href']
downloadFiles(projectFileURL,projectFilePath)
if 'test' in link['href']: #for getting the test files
if not os.path.exists(projectFolderPath+"\\"+URL.split('/')[-2]+"\\"+"tests"):
os.makedirs(projectFolderPath+"\\"+URL.split('/')[-2]+"\\"+"tests")
testFilePath=projectFolderPath+"\\"+URL.split('/')[-2]+"\\"+"tests"
testFileURL=projectBaseLink['href']+"/"+link['href']
print()
subFolderFileDownloader(testFileURL,testFilePath)
print()

def menu():
print("The list of choices are as follows. Choose the ones you like one by one. Don't worry you will get future chances (just as in life!!): \n")
print("1. All the worksheets!\n")
print("2. All the Lab files!\n")
print("3. All the Lecture slides!\n")
print("4. All the videos! (be carefull the total size is over 8 gigs!)\n")
print("5. All the Project Files!\n")
print("q. Nothing fam. Just get me out of here!\n")

def main():
print("Welcome to CSE232 Scraper!!!!!!\n")
menu()
userInput=(input("So what do you want (not in life. What do you want right now!!!): "))
print()
beautifiedBaseObject=returnBeautifiedObject(baseURL)
while(True):
if(userInput=='1'):
print("Alright lets get those worksheets!!!\n")
getWorksheets(beautifiedBaseObject)
os.chdir(basePath)
print()
print('Alright the worsksheets are downloaded in the \"Worksheets\" folder. See if you want anything else!\n' )
menu()
userInput=(input("Do you need anything else????: "))
print('\n\n')
elif(userInput=='2'):
print("Alright lets get those lab files!!!\n")
getLabs(beautifiedBaseObject)
os.chdir(basePath)
print()
print('Alright the Lab Files are downloaded in the \"Labs\" folder. See if you want anything else!\n' )
menu()
userInput=(input("Do you need anything else????: "))
print('\n\n')
elif(userInput=='3'):
print("Alright lets get those lecture slides!!!\n")
getReadings(beautifiedBaseObject)
os.chdir(basePath)
print()
print('Alright the Leture Notes are downloaded in the \"Readings\" folder. See if you want anything else!\n' )
menu()
userInput=(input("Do you need anything else????: "))
print('\n\n')
elif(userInput=='4'):
print("Alright lets get those videos!!!. Again this will take a lot of time. Professor Punch used a good camera lol\n")
getVideos(beautifiedBaseObject)
os.chdir(basePath)
print()
print('Alright the Videos are downloaded in the \"Videos\" folder. See if you want anything else!' )
menu()
userInput=(input("Do you need anything else????: "))
print('\n\n')
elif(userInput=='5'):
print("Alright lets get those project files!!!\n")
getProjects(beautifiedBaseObject)
os.chdir(basePath)
print()
print('Alright Project Files are downloaded in the \"Projects\" folder. See if you want anything else!\n' )
menu()
userInput=(input("Do you need anything else????: "))
print('\n\n')
elif(userInput=='q'):
print('\n\n')
print("Alright fam! Take care! Be happy! Live long and prosper!!!!!!!!!!!!!")
print('\n')
inp=input("BTW do you want to delete this scraper file so that your credentials don't get into wrong hands???? (y/n): ")
if inp=='y':
os.remove(os.path.basename(sys.argv[0]))
print('\n\n')
break
else:
print("Seriously fam, Seriously. Pick one of the choices will ya!!\n")
os.chdir(basePath)
menu()
print("Let's try this again\n")
userInput=(input("Do you need anything else????: "))
print('\n\n')


if __name__ == "__main__":
main()



































0 comments on commit 65dfd69

Please sign in to comment.