Scrapper.py
import json
import logging
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as bs
from mongodb import mongodbconnection
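# BeautifulSoup comes from the third-party 'beautifulsoup4' package; 'mongodb'
# here refers to this project's own helper module that provides the
# mongodbconnection class, not an external driver.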

### Set up the logging file ###
# filemode='w' truncates the log on every run.
logging.basicConfig(filename="flask_logs.log",
                    format='%(asctime)s %(message)s', filemode='w', level=logging.DEBUG)

### Function to scrape all course titles from the iNeuron website ###
def all_course():
    """
    Scrapes the website and returns a list of all the courses available on it.

    Returns:
        A list of all the courses available on the website.
    """
    try:
        ineuron_url = 'https://courses.ineuron.ai/'
        uClient = uReq(ineuron_url)
        ineuron_page = uClient.read()
        uClient.close()
        ineuron_html = bs(ineuron_page, 'html.parser')
        # The site is a Next.js app, so its initial state is embedded as JSON
        # inside the <script id="__NEXT_DATA__"> tag.
        course_data = json.loads(ineuron_html.find(
            'script', {"id": "__NEXT_DATA__"}).get_text())
        all_courses = course_data['props']['pageProps']['initialState']['init']['courses']
        course_namelist = list(all_courses.keys())
        return course_namelist
    except Exception:
        logging.error('Error in scraping at all_course()')
        return []  # return an empty list so callers can still iterate
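
# Illustrative only: all_course() returns the course names keyed in the site's
# JSON, e.g. something like ['Full Stack Data Science', ...]; the actual titles
# depend on what the live site serves at scrape time.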

### Function to scrape one course's details from the iNeuron website ###
def get_course(coursename):
    """
    Takes a course name as input and returns a dictionary of all the details of the course.

    Args:
        coursename: The name of the course you want to scrape.

    Returns:
        A dictionary of course details.
    """
    ineuron_url = 'https://courses.ineuron.ai/'
    # Course pages live at /<course-name>, with spaces replaced by hyphens.
    uClient = uReq(ineuron_url + str(coursename).replace(" ", "-"))
    course_page = uClient.read()
    uClient.close()
    ineuron_html = bs(course_page, 'html.parser')
    course_data1 = json.loads(ineuron_html.find(
        'script', {"id": "__NEXT_DATA__"}).get_text())
    logging.info('Course page JSON extracted')
    all_dict = {}
    try:
        # Walk down the nested JSON, substituting placeholders for missing keys.
        try:
            all_data = course_data1["props"]["pageProps"]
        except Exception:
            all_data = 'No page'
        try:
            page_data = all_data['data']
        except Exception:
            page_data = 'No data'
        try:
            detailed_data = page_data['details']
        except Exception:
            detailed_data = 'No details'
        try:
            meta_data = page_data['meta']
        except Exception:
            meta_data = 'No meta_data'
        try:
            curriculum_data = meta_data['curriculum']
        except Exception:
            curriculum_data = 'No curriculum_data'
        try:
            overview_data = meta_data['overview']
        except Exception:
            overview_data = 'No overview_data'
        # Building a course dictionary
        try:
            pricing_inr = detailed_data['pricing']['IN']
        except Exception:
            pricing_inr = 'NULL'
        try:
            course_name = page_data['title']
        except Exception:
            course_name = 'Name NA'
        try:
            description = detailed_data['description']
        except Exception:
            description = "NULL"
        try:
            language = overview_data['language']
        except Exception:
            language = 'NULL'
        try:
            req = overview_data['requirements']
        except Exception:
            req = 'NULL'
        try:
            learn = overview_data['learn']
        except Exception:
            learn = 'NULL'
        curriculum = []
        try:
            for i in curriculum_data:
                curriculum.append(curriculum_data[i]["title"])
        except Exception:
            curriculum.append('NULL')
        ### Saving all the data in dictionary format ###
        # Build the dictionary outside the curriculum try/except so a missing
        # curriculum no longer discards the rest of the course details.
        all_dict = {"Course_title": course_name, "Description": description,
                    "Language": language, "Pricing": pricing_inr,
                    "Curriculum_data": curriculum, "Learn": learn,
                    "Requirements": req}
        logging.info('dict is created')
        return all_dict
    except Exception:
        logging.error('Error in scraping at get_course()')
        return all_dict  # may be empty if the page structure changed
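
# Sketch of the dictionary returned by get_course(); every value below is
# hypothetical, and any field that fails to parse falls back to a placeholder:
# {
#     "Course_title": "Some Course",
#     "Description": "...",
#     "Language": "English",
#     "Pricing": "...",
#     "Curriculum_data": ["Module 1", "Module 2"],
#     "Learn": ["..."],
#     "Requirements": ["..."],
# }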

### Function which gets all the course data and saves it in MongoDB ###
def scrap_all():
    """
    Checks whether the collection is already present in the database; if it is,
    it does nothing, otherwise it scrapes the data and inserts it into the database.
    """
    dbcon = mongodbconnection(username='mongodb', password='mongodb')
    db_collection = dbcon.getCollection(
        "iNeuron_scrapper", "course_collection")
    try:
        if dbcon.isCollectionPresent("iNeuron_scrapper", "course_collection"):
            pass
        else:
            final_list = []
            list_courses = all_course()
            for i in list_courses:
                final_list.append(get_course(i))
            db_collection.insert_many(final_list)
    except Exception as e:
        logging.error("Error in DB insertion: %s", e)