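"""Scraping helpers for LingBuzz (https://ling.auf.net/lingbuzz/): fetch
recent papers from the homepage, run search queries, and wrap the results
in Paper objects."""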
import random
import re

import requests
from bs4 import BeautifulSoup, NavigableString

from paper import Paper

def scrapeLingBuzzHomePage(number_of_paper):
    """Scrape the LingBuzz homepage for a recent paper and extract its title,
    link, authors, abstract, keywords, and date.

    Parameters
    ----------
    number_of_paper : int
        Index of the paper's row in the homepage listings table.

    Returns
    -------
    Paper
    """
    # Get LingBuzz homepage
    homepage = requests.get('https://ling.auf.net/lingbuzz/')
    soup = BeautifulSoup(homepage.content, 'html.parser')

    # Sequentially work down to the table that stores the first page of papers
    html = list(soup.children)[1]
    body = list(html.children)[1]
    main_table = list(body.children)[2]
    tbody = list(main_table.children)[0]
    tr = list(tbody.children)[0]
    td_1 = list(tr.children)[0]

    # Store the html table of the entire first page of papers in
    # recent_papers_table. Each element in this list is of class
    # 'bs4.element.Tag'. Each element (paper) is a <tr> comprised of 4 <td>
    # tags containing: Authors, Newness, PDF link, Title.
    recent_papers_table = list(td_1.children)

    n = number_of_paper  # index of the paper to find

    # Authors
    authors = []
    authors_td = list(list(recent_papers_table[n].children)[0].children)
    for tag in authors_td:
        if tag.name == 'a':
            authors.append(tag.get_text())

    # Newness / year
    newness_td = list(list(recent_papers_table[n].children)[1].children)[0]
    if isinstance(newness_td, NavigableString):
        date = str(newness_td)
    else:
        date = str(list(newness_td.children)[0])
    date = date.split('-')[0]

    # PDF link
    pdf_td = list(list(recent_papers_table[n].children)[2].children)[0]
    pdf_link = 'https://ling.auf.net' + pdf_td['href']

    # Link to summary
    summary_td = list(list(recent_papers_table[n].children)[3].children)[0]
    summary_link = 'https://ling.auf.net' + summary_td['href']

    # Title
    title = summary_td.get_text()

    # Abstract: use the summary link to get the paper's page
    page = requests.get(summary_link)
    soup = BeautifulSoup(page.content, 'html.parser')
    # Sequentially work down to the paper's abstract
    html = list(soup.children)[1]
    body = list(html.children)[1]
    # The abstract is at the 5th index of the body's children list
    abstract = str(list(body.children)[5])

    # Keywords: split the keyword string on commas or semicolons
    try:
        keywords_tr = list(list(body.children)[6].children)[3]
        keywords_list_td = list(keywords_tr.children)[1]
        keywords = keywords_list_td.get_text()
        keywords = re.split(r'[,;]', keywords)
        keywords = [k.strip() for k in keywords]
    except (IndexError, AttributeError):
        # Return a dummy paper when keyword list parsing doesn't behave
        return Paper('dummy', 'dummy', ['dummy'], 'dummy', ['dummy'], 'dummy')

    # Construct Paper object
    current_paper = Paper(title, pdf_link, authors, abstract, keywords, date)
    return current_paper
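
# A minimal usage sketch for scrapeLingBuzzHomePage. The row index is an
# assumption: it just picks an element of the homepage listings table, so
# which paper it maps to depends on LingBuzz's current markup.
#
#   paper = scrapeLingBuzzHomePage(1)
#   print(paper.title, paper.authors, paper.date)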

def queryLingBuzz(query):
    """Take a query and return a list of Paper objects resulting from that
    query on LingBuzz.

    Parameters
    ----------
    query : string
        The string to query LingBuzz with.

    Returns
    -------
    list
        List of Paper objects.
    """
    # Get the LingBuzz search results page for `query`
    page = requests.get(f'https://ling.auf.net/lingbuzz/_search?q={query}')
    soup = BeautifulSoup(page.content, 'html.parser')

    # Sequentially work down to the table that stores the first page of results
    html = list(soup.children)[1]
    body = list(html.children)[1]
    main_table = list(body.children)[0]

    # Check if the query returned 'nothing found' and return an empty list if so
    if str(list(list(main_table.children)[0].children)[0]) == 'nothing found':
        print('Results: nothing found')
        return []

    # main_table now holds the entire first page of results. Each element in
    # this list is of class 'bs4.element.Tag'. Each element (paper) is a <tr>
    # comprised of 4 <td> tags containing: NULL, Authors, Newness,
    # Title (link to summary).
    collected_papers = []

    # Iterate through the table of search results
    for n in range(len(list(main_table.children))):
        # Authors
        authors = []
        authors_td = list(list(list(main_table.children)[n].children)[0].children)[0]
        for tag in authors_td:
            if tag.name == 'a':
                authors.append(tag.get_text())

        # Year
        date = None
        date_td = list(list(list(main_table.children)[n].children)[0].children)[1]
        if not isinstance(date_td, NavigableString):
            date = list(date_td.children)[0].strip('(').strip(')')

        # Link to summary
        summary_td = list(list(list(list(main_table.children)[n].children)[0].children)[2].children)[0]
        summary_link = 'https://ling.auf.net' + summary_td['href']

        # Title
        title = summary_td.get_text()

        # Abstract: use the summary link to get the paper's page
        page = requests.get(summary_link)
        soup = BeautifulSoup(page.content, 'html.parser')
        # Sequentially work down to the paper's abstract
        html = list(soup.children)[1]
        body = list(html.children)[1]
        # The abstract is at the 5th index of the body's children list
        abstract = str(list(body.children)[5])

        # PDF link. Some summary pages (e.g. those hosted on the Semantics
        # Archive) are formatted differently, so skip papers whose pages
        # don't parse.
        try:
            pdf_tr = list(list(body.children)[6].children)[0]
        except (IndexError, AttributeError):
            continue
        # Catch a potentially nonexistent PDF link in the summary page
        # (and skip the current paper)
        try:
            link_a = list(list(pdf_tr.children)[1].children)[1]
        except AttributeError:
            continue
        pdf_link = 'https://ling.auf.net' + link_a['href']

        # Keywords: split the keyword string on commas or semicolons
        keywords_tr = list(list(body.children)[6].children)[3]
        keywords_list_td = list(keywords_tr.children)[1]
        keywords = keywords_list_td.get_text()
        keywords = re.split(r'[,;]', keywords)
        keywords = [k.strip() for k in keywords]

        # Construct Paper object
        current_paper = Paper(title, pdf_link, authors, abstract, keywords, date)
        collected_papers.append(current_paper)

    return collected_papers
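
# A minimal usage sketch for queryLingBuzz; the query term is illustrative.
# An empty list means LingBuzz reported 'nothing found'.
#
#   results = queryLingBuzz('ellipsis')
#   for paper in results:
#       print(paper.title, paper.date)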

def classifier(text):
    """Return a random (for now) binary classification value for a given text.

    Parameters
    ----------
    text : string
        The text to be classified.

    Returns
    -------
    bool
    """
    return random.choice([True, False])
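
# A hypothetical drop-in replacement sketching what a non-random classifier
# might look like; the keyword set below is purely illustrative and not part
# of the project.
#
#   INTERESTING_KEYWORDS = {'syntax', 'semantics', 'ellipsis'}
#
#   def keywordClassifier(text):
#       words = set(re.findall(r'\w+', text.lower()))
#       return bool(words & INTERESTING_KEYWORDS)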

# Tests
# current_paper = scrapeLingBuzzHomePage(1)  # row index is illustrative
# print(current_paper.title)
# print(current_paper.link)
# print(current_paper.authors)
# print(current_paper.abstract)
# print(current_paper.keywords)
# print(current_paper.date)

# collected_papers = queryLingBuzz('pokemon')
# for x in collected_papers:
#     print(x.date)

# y = queryLingBuzz('That’s a Curious Copular Construction You Have There!')
# print(y[0].title, y[0].keywords, y[0].abstract, y[0].link, y[0].authors, y[0].date)