"""CSE 260 course-site scraper.

Downloads the lecture notes, sample tests and homework files that are linked
from the course index page (``baseURL``) into sub-folders created next to
this script.  The site is protected with HTTP basic authentication; the
credentials are taken from ``UNAME`` / ``PASSWORD`` below.

Third-party requirements: ``requests`` and ``beautifulsoup4``.  If either is
missing the script tries to install it with pip before importing it.
"""

import importlib
import os
import posixpath
import re
import subprocess
import sys
import urllib.parse
from urllib.parse import urljoin

################################################################################################################

###Checking dependencies


def _ensureInstalled(moduleName, pipName, label):
    """Report whether *moduleName* is importable; try a pip install if not.

    moduleName -- name passed to the import machinery (e.g. ``bs4``)
    pipName    -- distribution name handed to pip (e.g. ``beautifulsoup4``)
    label      -- human readable name used in the console messages
    """
    try:
        importlib.import_module(moduleName)
    except ImportError:
        print(label + " Library not found. Trying to install it. If not successfull then try it manually")
        # Run pip through the interpreter that is executing this script so the
        # package is installed where this script can import it.
        subprocess.call([sys.executable, "-m", "pip", "install", pipName])
        # Make the freshly installed package visible to the import system.
        importlib.invalidate_caches()
    else:
        print(label + " library is good!! ")
        print()


_ensureInstalled("requests", "requests", "Request")

print("Importing other libraries")
try:
    # msvcrt only exists on Windows and nothing in this script needs it any
    # more; the name ``m`` is kept so existing references do not break.
    import msvcrt as m
except ImportError:
    m = None
print()

_ensureInstalled("bs4", "beautifulsoup4", "BeautifulSoup")

try:
    import requests
    from bs4 import BeautifulSoup
    from requests.auth import HTTPBasicAuth
except ImportError:
    # The automatic installation above did not work; nothing else can run.
    print("Just get those libraries man!!!")
    raise

################################################################################################################

################################################################################################################
# NOTE(review): credentials should not be committed to source control.  The
# original values are kept as defaults for backward compatibility, but they
# can (and should) be supplied through the environment instead.
UNAME = os.environ.get("CSE260_USERNAME", "cse260")
PASSWORD = os.environ.get("CSE260_PASSWORD", "dmig4me")
################################################################################################################

################################################################################################################

###Actual Code

baseURL = 'http://www.cse.msu.edu/~cse260/cse260cn/'
basePath = os.path.dirname(os.path.abspath(__file__))

# Seconds to wait for the server before giving up instead of hanging forever.
REQUEST_TIMEOUT = 30
# Number of bytes written to disk per iteration while streaming a download.
CHUNK_SIZE = 8192


def returnBeautifiedObject(link, USERNAME='', PASS=''):
    """Fetch *link* with basic auth and return it parsed as a BeautifulSoup tree.

    Raises ``requests.RequestException`` when the page cannot be fetched or
    the server answers with an HTTP error status (e.g. 401 for bad
    credentials), so an error page is never parsed as if it were the index.
    """
    response = requests.get(link, auth=HTTPBasicAuth(USERNAME, PASS), timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return BeautifulSoup(response.text, "html.parser")


def _fileNameFromURL(URL):
    """Return the decoded last path component of *URL* (query string ignored)."""
    urlPath = urllib.parse.urlparse(URL).path
    # Unquote before taking the basename so an escaped '/' cannot smuggle a
    # directory component into the file name.
    return posixpath.basename(urllib.parse.unquote(urlPath))


def downloadFiles(URL, PATH, book_name=''):
    """Download *URL* into the directory *PATH*.

    URL       -- absolute address of the file; fetched with UNAME/PASSWORD
    PATH      -- existing directory the file is written to
    book_name -- file name to use; derived from the URL when left empty

    Raises ``ValueError`` if no file name can be derived from the URL,
    ``requests.RequestException`` on network/HTTP errors and ``OSError`` if
    the file cannot be written.  The current working directory is left
    untouched.
    """
    if book_name == '':
        book_name = _fileNameFromURL(URL)
    if not book_name:
        raise ValueError("Cannot derive a file name from URL: " + URL)

    print("Downloading: " + book_name + " ...")
    # Request first and only then open the target, so a failed request does
    # not leave an empty or bogus file behind.
    response = requests.get(
        URL,
        auth=HTTPBasicAuth(UNAME, PASSWORD),
        stream=True,
        timeout=REQUEST_TIMEOUT,
    )
    try:
        response.raise_for_status()
        with open(os.path.join(PATH, book_name), 'wb') as book:
            for block in response.iter_content(CHUNK_SIZE):
                if block:  # skip keep-alive chunks
                    book.write(block)
    finally:
        response.close()


def _collectLinks(beautifiedBaseObject, wanted):
    """Return absolute URLs of all anchors whose href satisfies *wanted*."""
    # href=True skips anchors without an href attribute (e.g. named anchors),
    # which would otherwise raise KeyError.
    return [
        urljoin(baseURL, anchor['href'])
        for anchor in beautifiedBaseObject.find_all('a', href=True)
        if wanted(anchor['href'])
    ]


def _downloadCategory(beautifiedBaseObject, folderName, wanted):
    """Download every link accepted by *wanted* into basePath/folderName."""
    folderPath = os.path.join(basePath, folderName)
    os.makedirs(folderPath, exist_ok=True)

    for URL in _collectLinks(beautifiedBaseObject, wanted):
        try:
            downloadFiles(URL, folderPath)
        except (requests.RequestException, OSError, ValueError) as error:
            # One broken link should not abort the remaining downloads.
            print("Could not download " + URL + ": " + str(error))


def getLectures(beautifiedBaseObject):
    """Download links containing 'lecture' but not 'Sample' into "Lecture"."""
    _downloadCategory(
        beautifiedBaseObject,
        'Lecture',
        lambda href: 'lecture' in href and "Sample" not in href,
    )


def getSampleTests(beautifiedBaseObject):
    """Download links containing both 'lecture' and 'Sample' into "Sample Tests"."""
    _downloadCategory(
        beautifiedBaseObject,
        'Sample Tests',
        lambda href: 'lecture' in href and "Sample" in href,
    )


def getHomework(beautifiedBaseObject):
    """Download links containing 'homework' into "Homework Assignments"."""
    _downloadCategory(
        beautifiedBaseObject,
        'Homework Assignments',
        lambda href: 'homework' in href,
    )


def menu():
    """Print the list of available choices."""
    print("The list of choices are as follows. Choose the ones you like one by one. Don't worry you will get future chances (just as in life!! :-)): \n")
    print("1. All the lecture notes!\n")
    print("2. All the Sample test files!\n")
    print("3. All the Homework files!\n")
    print("q. Nothing fam. Just get me out of here!\n")


# Menu key -> (download function, message before, message after).
_MENU_ACTIONS = {
    '1': (
        getLectures,
        "Alright lets get those Lecture files!!!\n",
        'Alright the lecture notes are downloaded in the "Lecture" folder. See if you want anything else!\n',
    ),
    '2': (
        getSampleTests,
        "Alright lets get those Sample test files!!!\n",
        'Alright the Sample test files are downloaded in the "Sample Tests" folder. See if you want anything else!\n',
    ),
    '3': (
        getHomework,
        "Alright lets get those Homework files!!!\n",
        'Alright the Homework files are downloaded in the "Homework Assignments" folder. See if you want anything else!\n',
    ),
}


def main():
    """Run the interactive menu loop until the user enters 'q'."""
    print("Welcome to CSE260 Scraper!!!!!!\n")
    menu()
    userInput = input("So what do you want (not in life. What do you want right now!!!): ")
    print()

    # The index page is fetched lazily, on the first real download request,
    # so quitting straight away does not touch the network.
    beautifiedBaseObject = None

    while True:
        choice = userInput.strip().lower()

        if choice == 'q':
            print('\n\n')
            print("Alright fam! Take care! Be happy! Live long and prosper!!!!!!!!!!!!!")
            print('\n\n')
            break

        action = _MENU_ACTIONS.get(choice)
        if action is None:
            print("Seriously fam, Seriously. Pick one of the choices will ya!!\n")
            menu()
            print("Let's try this again\n")
        else:
            fetch, startMessage, doneMessage = action
            print(startMessage)
            try:
                if beautifiedBaseObject is None:
                    beautifiedBaseObject = returnBeautifiedObject(baseURL, UNAME, PASSWORD)
                fetch(beautifiedBaseObject)
            except requests.RequestException as error:
                print("Could not reach the course site: " + str(error) + "\n")
            else:
                print()
                print(doneMessage)
            menu()

        userInput = input("Do you need anything else????: ")
        print('\n\n')


if __name__ == "__main__":
    main()