Skip to content

Commit

Permalink
version2 of the scraper.
Browse files Browse the repository at this point in the history
Increased scraping capabilities
  • Loading branch information
yash1337 authored Apr 1, 2017
1 parent 8e49015 commit ab368b9
Showing 1 changed file with 253 additions and 0 deletions.
253 changes: 253 additions & 0 deletions src/ScraperV2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,253 @@
#############################
##Project Site "Cyclone" Scraper v2
##Scrapes everything from the CSE 232 site: videos, example code, presentations, and project files
##
#############################

################################################################################################################

###Checking dependencies
import re
import requests
from bs4 import BeautifulSoup #getting the Beautiful Soup library
from requests.auth import HTTPBasicAuth
from urllib.parse import urljoin
import os
import msvcrt as m
# try:
# print("Importing other libraries")
# from requests.auth import HTTPBasicAuth
# from urllib.parse import urljoin
# import os
# import msvcrt as m
# except:
# print("Just get those libraries man!!!")

# try:
# import requests
# print ("Request library is good!! ")
# except:
# print("Request Library not found. Trying to install it. If not successfull then try it manually")
# os.system('pip install requests')

# try:
# from bs4 import BeautifulSoup
# print("BeautifulSoup library is good!! ")
# except:
# print("BeautifulSoup Library not found. Trying to install it. If not successfull then try it manually")
# os.system('pip install beautifulsoup4')




################################################################################################################

################################################################################################################
# HTTP Basic auth credentials sent with requests to the course pages.
# SECURITY NOTE(review): real credentials are committed in source here. They can
# now be overridden through the CSE232_USERNAME / CSE232_PASSWORD environment
# variables; the literals below remain only as backward-compatible defaults and
# should be removed (and the password rotated) once the environment is set up.
UNAME=os.environ.get("CSE232_USERNAME","sharmay4")
PASSWORD=os.environ.get("CSE232_PASSWORD","D@rkL0rd")
################################################################################################################

################################################################################################################

###Actual Code

# Root page of the course site; relative links found on it are resolved against this URL.
baseURL = "http://www.cse.msu.edu/~cse232/"

def returnBeautifiedObject(link,USERNAME='',PASS='',timeout=30):
    """Download ``link`` and return its body parsed as HTML.

    Parameters:
        link: URL of the page to fetch.
        USERNAME: user name for HTTP Basic auth (empty string by default).
        PASS: password for HTTP Basic auth (empty string by default).
        timeout: seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on an unresponsive host.

    Returns:
        A ``BeautifulSoup`` tree built from the response text with the
        ``html.parser`` backend. The HTTP status code is not checked, so an
        error page is parsed like any other page.

    Raises:
        requests.exceptions.RequestException: if the request cannot be
            completed (connection failure, timeout, invalid URL, ...).
    """
    requestObject=requests.get(link,auth=HTTPBasicAuth(USERNAME,PASS),timeout=timeout)
    return BeautifulSoup(requestObject.text,"html.parser")

def downloadFiles(URL,PATH,book_name=''):
    """Download ``URL`` into the directory ``PATH`` using the module credentials.

    Parameters:
        URL: address of the file to download; fetched with HTTP Basic auth
            built from the module-level ``UNAME`` / ``PASSWORD``.
        PATH: existing directory in which the file is written.
        book_name: file name to write. When empty, the last ``/``-separated
            segment of ``URL`` is used.

    Raises:
        requests.exceptions.RequestException: if the request fails.
        OSError: if the target file cannot be opened or written.
    """
    if book_name=='':
        book_name=URL.split('/')[-1]

    # Build the full target path instead of os.chdir(PATH): changing the
    # process-wide working directory is a hidden side effect on every caller.
    targetPath=os.path.join(PATH,book_name)

    print("Downloading: "+book_name+" ...")
    # Issue the request before creating the file so that a failed request does
    # not leave an empty file behind; stream=True keeps large downloads from
    # being buffered in memory as a whole.
    response=requests.get(URL,auth=HTTPBasicAuth(UNAME,PASSWORD),stream=True,timeout=30)
    try:
        with open(targetPath,'wb') as book:
            for block in response.iter_content(8192):
                if block:
                    book.write(block)
    finally:
        response.close()
def downloadVideos(URL,PATH,book_name=''):
    """Download ``URL`` (without authentication) into the directory ``PATH``.

    Parameters:
        URL: address of the file to download.
        PATH: directory in which the file is written. The previous version
            accepted this argument but ignored it and wrote relative to the
            current working directory.
        book_name: file name to write. When empty, the last ``/``-separated
            segment of ``URL`` is used (the previous version tried to open an
            empty file name in that case, which fails).

    Raises:
        requests.exceptions.RequestException: if the request fails.
        OSError: if the target file cannot be opened or written.
    """
    if book_name=='':
        book_name=URL.split('/')[-1]
    targetPath=os.path.join(PATH,book_name)

    # stream=True avoids holding a whole video in memory before writing it.
    r=requests.get(URL,stream=True,timeout=30)
    print("Downloading: "+book_name+" ...")
    try:
        # The with-block closes the file even if a chunk read or write raises.
        with open(targetPath,'wb') as f:
            print ("Donloading.....")
            for chunk in r.iter_content(chunk_size=8192):
                if chunk: # filter out keep-alive new chunks
                    f.write(chunk)
            print ("Done")
    finally:
        r.close()



def getWorksheets(beautifiedBaseObject):
    """Download every worksheet ``.pdf`` / ``.cpp`` linked from the worksheet page.

    The worksheet page is located by scanning the anchors of
    ``beautifiedBaseObject`` for one whose URL (resolved against ``baseURL``)
    contains ``'Worksheets'``; if several match, the last one is used. Each
    matching file is renamed by inserting ``'orksheet'`` after its first
    character (``'w1.cpp'`` becomes ``'worksheet1.cpp'``) and stored in
    ``<script dir>/worksheets/<name without extension>/``.

    Parameters:
        beautifiedBaseObject: parsed course base page (a BeautifulSoup tree).

    Raises:
        requests.exceptions.RequestException: if a page or file request fails.
    """
    # os.path.join replaces the literal '\worksheets', whose '\w' is an invalid
    # string escape sequence.
    worksheetFolderPath=os.path.join(os.path.dirname(os.path.abspath(__file__)),'worksheets')
    os.makedirs(worksheetFolderPath,exist_ok=True)

    # href=True skips anchors that carry no href attribute (link['href'] would
    # raise KeyError on those).
    worksheetURL=''
    for link in beautifiedBaseObject.find_all('a',href=True):
        candidateURL=urljoin(baseURL,link['href'])
        if 'Worksheets' in candidateURL:
            worksheetURL=candidateURL

    if not worksheetURL:
        # Nothing to fetch; requesting an empty URL would only raise.
        print("No worksheet link found on the base page.")
        return

    worksheetHTMLObject=requests.get(worksheetURL,timeout=30)
    beautifiedWorksheetHTMLObject=BeautifulSoup(worksheetHTMLObject.text,"html.parser")

    for link in beautifiedWorksheetHTMLObject.find_all('a',href=True):
        href=link['href']
        if ".pdf" in href or ".cpp" in href:
            completedWorksheetURL=worksheetURL+'/'+href
            book_name=completedWorksheetURL.split('/')[-1]
            book_name=book_name[:1]+'orksheet'+book_name[1:]
            # One sub-folder per worksheet, named after the file without its extension.
            worksheetFilePath=os.path.join(worksheetFolderPath,book_name.split('.',1)[0])
            os.makedirs(worksheetFilePath,exist_ok=True)
            downloadFiles(completedWorksheetURL,worksheetFilePath,book_name)



def getLabs(beautifiedBaseObject):
    """Walk the weekly pages and prepare a folder per week for lab files.

    Week pages are the anchors of ``beautifiedBaseObject`` whose href contains
    ``'week'`` but not ``'.pdf'``; the first such link is skipped (it is the
    one without a week number). On each week page, links whose href contains
    both ``'lab'`` and ``'Weekly'`` are opened and scanned for lab files
    (``.pdf``, ``.cpp``, ``.h``, ``gdbinit``, ``.txt``). For every file found,
    ``<script dir>/labs/<week>/`` is created, where ``<week>`` is the
    second-to-last ``/``-separated segment of the week URL.

    NOTE(review): the actual download call was commented out in the original
    and is still disabled here, so this function currently only creates
    folders. Confirm whether it should be re-enabled.

    Parameters:
        beautifiedBaseObject: parsed course base page (a BeautifulSoup tree).

    Raises:
        requests.exceptions.RequestException: if a page request fails.
    """
    labFileMarkers=(".pdf",".cpp",".h","gdbinit",".txt")

    # os.path.join replaces the literal '\labs', whose '\l' is an invalid
    # string escape sequence.
    labFolderpath=os.path.join(os.path.dirname(os.path.abspath(__file__)),'labs')
    os.makedirs(labFolderpath,exist_ok=True)

    # href=True skips anchors without an href attribute (KeyError otherwise).
    labBaseURLList=[
        urljoin(baseURL,link['href'])
        for link in beautifiedBaseObject.find_all('a',href=True)
        if 'week' in link['href'] and '.pdf' not in link['href']
    ]

    # [1:] drops the URL without a week number and, unlike pop(0), does not
    # raise when no week link was found at all.
    for URL in labBaseURLList[1:]:
        weekName=URL.split('/')[-2]
        weekPage=returnBeautifiedObject(URL,UNAME,PASSWORD)
        for link in weekPage.find_all('a',href=True):
            if 'lab' in link['href'] and 'Weekly' in link['href']:
                # Resolve against the week page so relative hrefs also work;
                # an absolute href is returned unchanged by urljoin.
                labPageURL=urljoin(URL,link['href'])
                labPage=returnBeautifiedObject(labPageURL,UNAME,PASSWORD)
                for labLink in labPage.find_all('a',href=True):
                    labHref=labLink['href']
                    if any(marker in labHref for marker in labFileMarkers):
                        labFilePath=os.path.join(labFolderpath,weekName)
                        os.makedirs(labFilePath,exist_ok=True)
                        # Disabled in the original; kept disabled (see docstring).
                        #downloadFiles(labPageURL+"/"+labHref,labFilePath)


def getReadings(beautifiedBaseObject):
    """Download the reading files linked from each weekly page.

    Week pages are the anchors of ``beautifiedBaseObject`` whose href contains
    ``'week'`` but not ``'.pdf'``; the first such link is skipped (it is the
    one without a week number). On each week page, every link whose href
    contains ``'reading'`` is opened and its ``.pdf`` / ``.cpp`` / ``.h`` /
    ``.txt`` links are downloaded into ``<script dir>/readings/<week>/``, where
    ``<week>`` is the second-to-last ``/``-separated segment of the week URL.

    Parameters:
        beautifiedBaseObject: parsed course base page (a BeautifulSoup tree).

    Raises:
        requests.exceptions.RequestException: if a page or file request fails.
    """
    readingFileMarkers=(".pdf",".cpp",".h",".txt")

    readingFolderPath=os.path.join(os.path.dirname(os.path.abspath(__file__)),'readings')
    os.makedirs(readingFolderPath,exist_ok=True)

    # href=True skips anchors without an href attribute (KeyError otherwise).
    readingBaseURLList=[
        urljoin(baseURL,link['href'])
        for link in beautifiedBaseObject.find_all('a',href=True)
        if 'week' in link['href'] and '.pdf' not in link['href']
    ]

    # [1:] drops the URL without a week number and, unlike pop(0), does not
    # raise when no week link was found at all.
    for URL in readingBaseURLList[1:]:
        weekName=URL.split('/')[-2]
        weekPage=returnBeautifiedObject(URL,UNAME,PASSWORD)
        for link in weekPage.find_all('a',href=True):
            if 'reading' in link['href']:
                # Resolve against the week page so relative hrefs also work;
                # an absolute href is returned unchanged by urljoin.
                readingPageURL=urljoin(URL,link['href'])
                readingPage=returnBeautifiedObject(readingPageURL,UNAME,PASSWORD)
                for readingLink in readingPage.find_all('a',href=True):
                    readingHref=readingLink['href']
                    if any(marker in readingHref for marker in readingFileMarkers):
                        readingFileURL=readingPageURL+"/"+readingHref
                        readingFilePath=os.path.join(readingFolderPath,weekName)
                        os.makedirs(readingFilePath,exist_ok=True)
                        downloadFiles(readingFileURL,readingFilePath)


def getVideos(beautifiedBaseObject):
    """Download the ``.mp4`` videos linked from each weekly page.

    Week pages are the anchors of ``beautifiedBaseObject`` whose href contains
    ``'week'`` but not ``'.pdf'``; the first such link is skipped (it is the
    one without a week number). Every link on a week page whose href contains
    both ``'video'`` and ``'.mp4'`` is downloaded into
    ``<script dir>/videos/<week>/``, where ``<week>`` is the second-to-last
    ``/``-separated segment of the week URL. The file name is derived from the
    link text: whitespace is collapsed, the name is capitalized, and every
    character other than letters, digits, spaces, newlines and dots is removed.

    Parameters:
        beautifiedBaseObject: parsed course base page (a BeautifulSoup tree).

    Raises:
        requests.exceptions.RequestException: if a page or file request fails.
    """
    videoFolderPath=os.path.join(os.path.dirname(os.path.abspath(__file__)),'videos')
    os.makedirs(videoFolderPath,exist_ok=True)

    # Compiled once; raw strings avoid the invalid '\.' escape of the original.
    multipleSpaces=re.compile(r' +')
    disallowedCharacters=re.compile(r'[^a-zA-Z0-9 \n.]')

    # href=True skips anchors without an href attribute (KeyError otherwise).
    videoBaseURLList=[
        urljoin(baseURL,link['href'])
        for link in beautifiedBaseObject.find_all('a',href=True)
        if 'week' in link['href'] and '.pdf' not in link['href']
    ]

    # [1:] drops the URL without a week number and, unlike pop(0), does not
    # raise when no week link was found at all.
    for URL in videoBaseURLList[1:]:
        weekName=URL.split('/')[-2]
        weekPage=returnBeautifiedObject(URL,UNAME,PASSWORD)
        for liLink in weekPage.find_all('a',href=True):
            videoHref=liLink['href']
            if 'video' in videoHref and '.mp4' in videoHref:
                # Behaviour preserved from the original: URL.index('/') is the
                # position of the first '/' (5 for an 'http://' URL), so this
                # drops that many trailing characters from URL; the rfind cut
                # then removes the last path segment.
                # NOTE(review): presumably week URLs end in '.html' and this
                # yields the page's directory - confirm before simplifying to
                # urljoin(URL, videoHref).
                temp=URL[:-URL.index('/')]
                videoFileURL=temp[:temp.rfind('/')]+"/"+videoHref

                videoFilePath=os.path.join(videoFolderPath,weekName)
                os.makedirs(videoFilePath,exist_ok=True)

                bookName=liLink.text.strip()+".mp4"
                bookName=bookName.replace('\n','').replace('\t','')
                bookName=multipleSpaces.sub(' ',bookName).capitalize()
                bookName=disallowedCharacters.sub('',bookName)
                downloadFiles(videoFileURL,videoFilePath,bookName)




# Script entry point. Guarded so that importing this module does not trigger
# network requests and downloads as a side effect.
if __name__ == "__main__":
    beautifiedBaseObject=returnBeautifiedObject(baseURL)
    # Uncomment the scrapers that should run in addition to the videos:
    #getWorksheets(beautifiedBaseObject)
    #getLabs(beautifiedBaseObject)
    #getReadings(beautifiedBaseObject)
    getVideos(beautifiedBaseObject)



































0 comments on commit ab368b9

Please sign in to comment.