# scrape_gemibook.py
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd
import re
# helper function to get info from the main book page
# input: URL for the book page on gemibook.com
# output: book title, book author, and a list of URLs for every chapter/page
def get_book_info(URL):
    response = requests.get(URL)
    contents = response.text
    with open('Bookswebpage.html', 'w') as f:
        f.write(contents)
    doc = BeautifulSoup(contents, 'html.parser')
    title_tag = doc.find('h3', class_='title')
    title = title_tag.text
    author_tag = doc.find('a', itemprop='author')
    author = author_tag.text
    all_ul = doc.find_all('ul', class_='list-chapter')
    page_urls = []
    for ul in all_ul:
        page_tags = ul.find_all('li')
        for page_tag in page_tags:
            page_link = page_tag.find('a', href=True)
            page_url = 'https://gemibook.com' + page_link['href']
            page_urls.append(page_url)
    return title, author, page_urls
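# quick manual sanity checks; note these run at module import time and make live requests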
get_book_info('https://gemibook.com/231064-the-hobbit')
get_book_info('https://gemibook.com/2435427-the-martian')
# helper function that gets the page from url
# input: page url
# output: page document
def get_doc(url):
    response = requests.get(url)
    doc = BeautifulSoup(response.text, 'html.parser')
    if response.status_code != 200:
        print("url didn't work", url)
        return None
        # raise Exception('Failed to load page {}'.format(response))
    return doc
# helper function that finds all times using regex
# input: book content to search through
# output: list of all matched time strings
def find_times(content):
    # regexp = r'(24:00|2[0-3]:[0-5][0-9]|1[0-9]:[0-5][0-9]|[0-9]:[0-5][0-9])'
    regexp12 = r'1[0-9]:[0-5][0-9]\s?[AaPp]\.?[Mm]\.?|[1-9]:[0-5][0-9]\s?[AaPp]\.?[Mm]\.?'
    regexp24 = r'24:00|2[0-3]:[0-5][0-9]|1[0-9]:[0-5][0-9]|[0-9]:[0-5][0-9]'
    regexp = re.compile(regexp12 + '|' + regexp24)
    result = re.findall(regexp, content)
    return result
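# illustrative example (not executed by the scraper):
# find_times('We left at 7:45 p.m. and arrived by 22:10')
# -> ['7:45 p.m.', '22:10']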
# sub helper function that gets the index of the sentence containing the given time
# input: sentence array, match (time)
# output: index of the sentence containing the time, else -1
def get_sentence_index(sentences, match):
    for i in range(len(sentences)):
        index = sentences[i].find(match)
        if index != -1:
            return i
    return -1
# sub helper function that splits a paragraph into sentences
# input: paragraph string
# output: array of sentences
def split_text(text):
    alphabets = "([A-Za-z])"
    prefixes = "(Mr|St|Mrs|Ms|Dr)[.]"
    suffixes = "(Inc|Ltd|Jr|Sr|Co)"
    starters = r"(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
    acronyms = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
    websites = "[.](com|net|org|io|gov)"
    text = " " + text + " "
    text = text.replace("\n", " ")
    text = re.sub(prefixes, "\\1<prd>", text)
    text = re.sub(websites, "<prd>\\1", text)
    if "Ph.D" in text: text = text.replace("Ph.D.", "Ph<prd>D<prd>")
    text = re.sub(r"\s" + alphabets + "[.] ", " \\1<prd> ", text)
    text = re.sub(acronyms + " " + starters, "\\1<stop> \\2", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]", "\\1<prd>\\2<prd>\\3<prd>", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]", "\\1<prd>\\2<prd>", text)
    text = re.sub(" " + suffixes + "[.] " + starters, " \\1<stop> \\2", text)
    text = re.sub(" " + suffixes + "[.]", " \\1<prd>", text)
    text = re.sub(" " + alphabets + "[.]", " \\1<prd>", text)
    if "”" in text: text = text.replace(".”", "”.")
    if "\"" in text: text = text.replace(".\"", "\".")
    if "!" in text: text = text.replace("!\"", "\"!")
    if "?" in text: text = text.replace("?\"", "\"?")
    text = text.replace(".", ".<stop>")
    text = text.replace("?", "?<stop>")
    text = text.replace("!", "!<stop>")
    text = text.replace("<prd>", ".")
    sentences = text.split("<stop>")
    sentences = sentences[:-1]
    sentences = [s.strip() for s in sentences]
    return sentences
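# illustrative example: abbreviations such as "Mr." and "p.m." are protected via
# the <prd> placeholder, so they do not end a sentence:
# split_text("It was 7:45 p.m. already. The train left at 8:00. Mr. Smith missed it.")
# -> ['It was 7:45 p.m. already.', 'The train left at 8:00.', 'Mr. Smith missed it.']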
# helper function that shortens quotes if the text is too long
# input: text chunk, array of time matches
# output: array of shortened quotes with length = len(matches)
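# (each quote keeps up to 3 sentences on either side of the sentence that
# contains the matched time, so a quote spans at most 7 sentences)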
def handle_long_text(text_str, matches):
    quotes = []
    sentences = split_text(text_str)
    for match in matches:
        sentence_index = get_sentence_index(sentences, match)
        start = 0 if sentence_index - 3 < 0 else sentence_index - 3
        end = len(sentences) - 1 if sentence_index + 3 > len(sentences) else sentence_index + 3
        sentence_array = sentences[start:end+1]
        quote = ' '.join(sentence_array)
        quotes.append(quote)
    return quotes
# main function that scrapes a book for times
# input: url to main page of book
# output: DataFrame with times, book info, and quotes
def scrape_book(URL):
    title, author, urls = get_book_info(URL)
    num_pages = len(urls)
    times = []
    page_numbers = []
    quotes = []
    for i in range(num_pages):
        page_url = urls[i]
        doc = get_doc(page_url)
        if doc is None:
            print('error retrieving website', title, i, page_url)
            continue
        content = doc.find('div', class_='chapter-content')
        if content is None:
            print('no content', title, i, page_url)
            continue
        texts = content.find_all('p')
        for text in texts:
            text_str = text.text
            matches = find_times(text_str)
            if matches:
                times.extend(matches)
                if len(text_str) > 500:
                    quotes.extend(handle_long_text(text_str, matches))
                else:
                    quote = text_str
                    quotes.extend([quote] * len(matches))
                page_numbers.extend([i+1] * len(matches))
    titles = [title] * len(times)
    authors = [author] * len(times)
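    # TIME24 is normalized to a datetime.time downstream (test_func / scrape_gemi_main),
    # while TIMES keeps the raw matched string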
    book_dict1 = {
        'TIME24': times,
        'TITLE': titles,
        'AUTHOR': authors,
        'TIMES': times,
        'PAGE NUMBER': page_numbers,
        'QUOTE': quotes
    }
    return pd.DataFrame(book_dict1)
def test_func():
    df = scrape_book('https://gemibook.com/241033-foundation')
    df['TIME24'] = pd.to_datetime(df['TIME24'], infer_datetime_format=True).dt.time
    df = df.sort_values(by="TIME24")
    df = df.drop_duplicates(subset=['QUOTE'], keep=False)
    return df
# test_func()
# .to_csv('test-4.csv', index=None)
# main function that scrapes a number of books
# reads a CSV file containing the book URLs (one per line)
# returns a sorted DataFrame of all found time matches
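# gemibook_urls.csv is assumed to hold one book URL per line (no header), e.g.:
#   https://gemibook.com/231064-the-hobbit
#   https://gemibook.com/2435427-the-martian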
def scrape_gemi_main():
    with open('gemibook_urls.csv') as f:
        urls = [line.rstrip() for line in f]
    dfs = []
    for url in urls:
        if url:
            df = scrape_book(url)
            dfs.append(df)
        else:
            print('error at url ', url)
    dfs = pd.concat(dfs)
    dfs['TIME24'] = pd.to_datetime(dfs['TIME24'], infer_datetime_format=True).dt.time
    dfs = dfs.sort_values(by="TIME24")
    dfs = dfs.drop_duplicates(subset=['QUOTE'], keep=False)
    dfs['PAGE NUMBER'] = dfs['PAGE NUMBER'].astype(int)
    return dfs
scrape_gemi_main().to_csv('main-test-6.csv', index=None)