-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathbrainscrape.py
40 lines (33 loc) · 1.34 KB
/
brainscrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
# BrainyQuote Web Scraper (By Keyword)
# SPECIAL POMMUNISM EDITION
# Alaina Kafkes
import requests
from bs4 import BeautifulSoup
# Default keyword used when getQuotes() is called without arguments.
example_key = "communism"


def getQuotes(keyword=example_key, numpages=7, timeout=10):
    """
    Scrape (quote, author) pairs for a keyword from BrainyQuote.

    Uses Requests to download each keyword page and BeautifulSoup to pull
    the quote text (``span.bqQuoteLink``) and author text (``div.bq-aut``)
    out of the HTML.

    Parameters:
        keyword:  keyword whose quote pages are fetched.
        numpages: number of HTML pages to request. The first page is
                  "<keyword>.html" and page n (n >= 2) is
                  "<keyword>_<n>.html". The first page is always requested,
                  even when numpages is less than 1.
        timeout:  seconds passed to ``requests.get`` so a stalled server
                  cannot block the call forever.

    Returns:
        A 2-tuple ``(pairs, count)`` where ``pairs`` is a list of
        (quote, author) tuples and ``count`` is ``len(pairs)``.
    """
    base_url = "http://www.brainyquote.com/quotes/keywords/"

    # Build the page names: "<keyword>", "<keyword>_2", ..., "<keyword>_<numpages>"
    page_names = [keyword]
    page_names.extend(keyword + "_" + str(i) for i in range(2, numpages + 1))

    pairs = []

    # For every page pertaining to the keyword
    for page in page_names:
        # Obtain BrainyQuote page html (the URL must use the page name,
        # not the bare keyword, or the first page is fetched repeatedly)
        url = base_url + page + ".html"
        response_data = requests.get(url, timeout=timeout).text
        soup = BeautifulSoup(response_data, 'html.parser')

        # Quotes on this page, with trailing whitespace stripped
        page_quotes = [
            item.get_text().rstrip()
            for item in soup.find_all("span", class_="bqQuoteLink")
        ]
        # Authors on this page
        page_authors = [
            item.get_text()
            for item in soup.find_all("div", class_="bq-aut")
        ]

        # Pair quotes with authors page by page so that a page with
        # mismatched counts cannot shift the pairing on later pages.
        pairs.extend(zip(page_quotes, page_authors))

    # pairs is a real list, so len() works on both Python 2 and Python 3
    return pairs, len(pairs)