-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
1 changed file
with
157 additions
and
41 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,63 +1,179 @@ | ||
#############################
## Project Site 'Cyclone' Scraper v2
## Scrapes everything from the CSE 232 site: videos, worksheets, lecture notes, and project files.
## You need to supply your CSE user name and password.
#############################

################################################################################################################

### Checking dependencies

import os

# Best-effort bootstrap of the third-party dependencies before importing them.
# NOTE(review): shelling out to pip from inside the script is fragile (wrong
# interpreter, no permissions, no network); a requirements.txt would be the
# usual fix.  Kept because the rest of the script relies on it.
os.system('pip install beautifulsoup4')
os.system('pip install requests')
print()

try:
    import requests
    print("Request library is good!! ")
    print()
except ImportError:  # only a missing module should trigger the retry
    print("Request Library not found. Trying to install it. If not successfull then try it manually")
    os.system('pip install requests')

try:
    print("Importing other libraries")
    from requests.auth import HTTPBasicAuth
    from urllib.parse import urljoin
    import msvcrt as m  # Windows-only: used for the "press a key" pause at the bottom of the file
    print()
except ImportError:
    print("Just get those libraries man!!!")

try:
    from bs4 import BeautifulSoup
    print("BeautifulSoup library is good!! ")
    print()
except ImportError:
    print("BeautifulSoup Library not found. Trying to install it. If not successfull then try it manually")
    os.system('pip install beautifulsoup4')

import re
import requests
from bs4 import BeautifulSoup
from requests.auth import HTTPBasicAuth
from urllib.parse import urljoin
import sys
import msvcrt as m
import posixpath
import urllib.parse

################################################################################################################
# Credentials for HTTP basic auth against the course site.
# NOTE(review): hard-coding credentials in source is a security risk — move
# them to environment variables or a prompt.
UNAME = "cse260"
PASSWORD = "dmig4me"
################################################################################################################

# Root of the course site and the directory this script lives in (used to
# chdir back after downloads, which change the working directory).
baseURL = 'http://www.cse.msu.edu/~cse260/cse260cn/'
basePath = os.path.dirname(os.path.abspath(__file__))
def returnBeautifiedObject(link, USERNAME='', PASS=''):
    """Fetch *link* over HTTP basic auth and return it parsed as BeautifulSoup.

    Credentials default to empty strings (anonymous access).
    """
    response = requests.get(link, auth=HTTPBasicAuth(USERNAME, PASS))
    return BeautifulSoup(response.text, "html.parser")
def downloadFiles(URL, PATH, book_name=''):
    """Download *URL* into directory *PATH* using the course credentials.

    book_name defaults to the last path component of URL.
    Side effect: leaves the current working directory set to PATH — the
    caller (main) chdirs back to basePath afterwards.
    """
    if book_name == '':
        book_name = URL.split('/')[-1]
    os.chdir(PATH)

    print("Downloading: " + book_name + " ...")
    with open(book_name, 'wb') as book:
        # stream=True so iter_content actually streams instead of buffering
        # the entire response in memory first.
        a = requests.get(URL, auth=HTTPBasicAuth(UNAME, PASSWORD), stream=True)
        for block in a.iter_content(512):
            if not block:
                break
            book.write(block)
def getLectures(beautifiedBaseObject):
    """Download every lecture file (excluding sample tests) linked from the index page."""
    lecture_urls = []
    # os.path.join instead of a hard-coded '\\' separator: identical result on
    # Windows, and the path also works on other platforms.
    LectureFolderPath = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'Lecture')

    if not os.path.exists(LectureFolderPath):
        os.makedirs(LectureFolderPath)

    for link in beautifiedBaseObject.find_all('a'):
        # Lecture links contain 'lecture'; sample tests are filtered out here
        # and handled by getSampleTests instead.
        if 'lecture' in link['href'] and "Sample" not in link['href']:
            lecture_urls.append(urljoin(baseURL, link['href']))

    for URL in lecture_urls:
        downloadFiles(URL, LectureFolderPath)
def getSampleTests(beautifiedBaseObject):
    """Download every sample-test file linked from the index page."""
    sample_urls = []
    # os.path.join instead of a hard-coded '\\' separator: identical result on
    # Windows, and the path also works on other platforms.
    SamTestsFolderPath = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'Sample Tests')

    if not os.path.exists(SamTestsFolderPath):
        os.makedirs(SamTestsFolderPath)

    for link in beautifiedBaseObject.find_all('a'):
        # Sample tests live under the lecture links and carry "Sample" in the href.
        if 'lecture' in link['href'] and "Sample" in link['href']:
            sample_urls.append(urljoin(baseURL, link['href']))

    for URL in sample_urls:
        downloadFiles(URL, SamTestsFolderPath)
def getHomework(beautifiedBaseObject):
    """Download every homework file linked from the index page."""
    homework_urls = []
    # os.path.join instead of a hard-coded '\\' separator: identical result on
    # Windows, and the path also works on other platforms.
    HomeworkFolderPath = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'Homework Assignments')

    if not os.path.exists(HomeworkFolderPath):
        os.makedirs(HomeworkFolderPath)

    for link in beautifiedBaseObject.find_all('a'):
        if 'homework' in link['href']:
            homework_urls.append(urljoin(baseURL, link['href']))

    for URL in homework_urls:
        downloadFiles(URL, HomeworkFolderPath)
def menu():
    """Print the scraper's option menu (one blank line after each entry)."""
    options = (
        "The list of choices are as follows. Choose the ones you like one by one. Don't worry you will get future chances (just as in life!! :-)): \n",
        "1. All the lecture notes!\n",
        "2. All the Sample test files!\n",
        "3. All the Homework files!\n",
        "q. Nothing fam. Just get me out of here!\n",
    )
    for entry in options:
        print(entry)
def main():
    """Interactive entry point: show the menu, dispatch downloads, repeat until 'q'."""
    print("Welcome to CSE232 Scraper!!!!!!\n")
    menu()
    userInput = input("So what do you want (not in life. What do you want right now!!!): ")
    print()
    beautifiedBaseObject = returnBeautifiedObject(baseURL, UNAME, PASSWORD)

    # choice -> (label used in the intro message, worker, completion message)
    actions = {
        '1': ("Lecture files", getLectures,
              'Alright the lecture notes are downloaded in the "Lecture" folder. See if you want anything else!\n'),
        '2': ("Sample test files", getSampleTests,
              'Alright the Sample test files are downloaded in the "Sample Tests" folder. See if you want anything else!\n'),
        '3': ("Homework files", getHomework,
              'Alright the Homework files are downloaded in the "Homework Assignments" folder. See if you want anything else!\n'),
    }

    while True:
        if userInput == 'q':
            print('\n\n')
            print("Alright fam! Take care! Be happy! Live long and prosper!!!!!!!!!!!!!")
            print('\n\n')
            break
        elif userInput in actions:
            label, worker, done_msg = actions[userInput]
            print("Alright lets get those " + label + "!!!\n")
            worker(beautifiedBaseObject)
            # downloadFiles chdirs into the target folder; return to the
            # script's own directory before the next round.
            os.chdir(basePath)
            print()
            print(done_msg)
            menu()
            userInput = input("Do you need anything else????: ")
            print('\n\n')
        else:
            print("Seriously fam, Seriously. Pick one of the choices will ya!!\n")
            os.chdir(basePath)
            menu()
            print("Let's try this again\n")
            userInput = input("Do you need anything else????: ")
            print('\n\n')
# These lines make the scraper delete itself. Uncomment them if you DO want
# the file to be deleted after it runs.
# os.remove(sys.argv[0])
# print("Scraper File Removed!!!!")

# NOTE(review): these two lines execute at import time, BEFORE main() is
# invoked below — so the "press a key" pause happens before any scraping.
# Confirm that ordering is intended.
print("\nPress a key to continue...")
m.getch()  # msvcrt (Windows-only): wait for a single keypress

if __name__ == "__main__":
    main()