-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrape_scholar.py
38 lines (34 loc) · 1.34 KB
/
scrape_scholar.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import subprocess
def run_scrape(list_of_keywords):
    """Run scholar.py once per keyword and collect the parsed author names.

    Args:
        list_of_keywords: iterable of keyword strings. Each keyword is
            handed to ``scholar.py -A`` as one single argument, so a
            multi-word keyword stays together.

    Returns:
        dict mapping every keyword (exactly as given) to the list that
        ``parse_output`` returns for that keyword's scholar.py output.
        Keywords that are empty or whitespace-only map to an empty list
        and no subprocess is started for them.
    """
    authors_to_search = {}
    for keyword in list_of_keywords:
        # Normalise whitespace but keep the words in ONE argv entry.
        # Splitting the whole command string on whitespace would spread a
        # multi-word keyword over several arguments, leaving only its
        # first word attached to -A.
        query = " ".join(keyword.split())
        if not query:
            # Nothing to search for; without a value of its own, -A would
            # be followed directly by the next option on the command line.
            authors_to_search[keyword] = []
            continue

        command = [
            "python", "scholar.py",
            "-A", query,
            "-c", "1",
            "--citation=en",
        ]
        process = subprocess.Popen(command, stdout=subprocess.PIPE)
        output, _ = process.communicate()  # stderr is not piped, so it is None

        # On Python 3 the pipe yields bytes, which parse_output cannot split
        # with str separators. On Python 2 the output is already str.
        # NOTE(review): assumes scholar.py emits UTF-8 - confirm.
        if not isinstance(output, str):
            output = output.decode("utf-8", "replace")

        authors_to_search[keyword] = parse_output(keyword, output)
    return authors_to_search
def parse_output(keyword, output):
    """Collect the author entries found in a block of citation text.

    The text is divided into records at blank lines. Each record is cut at
    every "%A " marker: the text before the first marker is discarded, the
    piece after the last marker is truncated at the next "%", and every
    remaining piece has its final four characters dropped.

    Args:
        keyword: not used by this function.
        output: the citation text to parse.

    Returns:
        A flat list of the extracted strings, in the order they appear
        across all records. Records without a "%A " marker contribute
        nothing.
    """
    collected = []
    for record in output.split('\n\n'):
        pieces = record.split("%A ")
        # The final piece still carries whatever follows the last author;
        # keep only the part before the next "%" field marker.
        pieces[-1] = pieces[-1].split("%")[0]
        # pieces[0] is the text preceding the first author marker - skip it.
        # NOTE(review): the fixed four-character trim presumably removes
        # trailing formatting emitted by scholar.py - confirm against real
        # output.
        collected.extend(piece[:-4] for piece in pieces[1:])
    return collected
def scrape():
    """Prompt for ';'-delimited keywords and scrape authors for each one.

    Returns:
        Whatever ``run_scrape`` returns for the list obtained by splitting
        the entered line on ";".
    """
    # raw_input only exists on Python 2; Python 3 renamed it to input.
    # Fall back only when raw_input is missing, so Python 2 never calls its
    # own evaluating input().
    try:
        read_line = raw_input
    except NameError:
        read_line = input
    user_input = read_line("Input keywords:(delimited by ;) ")
    return run_scrape(user_input.split(";"))