search_ids.py
import csv
import datetime
import os
import xml.etree.ElementTree as ET

import requests


def get_article_ids(query, filename, retmax, sort, have_ids=False, api_key=""):
    """Fetch PubMed abstracts via the NCBI E-utilities and save them as XML.

    If `have_ids` is True, the UIDs listed in `filename` are uploaded with EPost;
    otherwise `query` is run through ESearch. Either way, the history-server keys
    from the response are passed to EFetch and the abstract XML is written under
    output/xml/. Returns [path_to_fetch_xml, query_string].
    """
    now = datetime.datetime.now()
    os.makedirs('output/xml', exist_ok=True)
    ### Optional NCBI API key raises the request rate limit; appended only if provided
    key = '&api_key=' + api_key if api_key else ''

    ### If `have_ids` is True, the user already has a list of UIDs saved at `filename`
    if have_ids:
        epost_base = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/epost.fcgi?db=pubmed&id='
        ### Read in the UIDs (one per row, first column)
        with open(filename, 'r') as f:
            uid_list = [row[0] for row in csv.reader(f) if row]
        ### Post the UIDs to the history server to get QueryKey and WebEnv values
        url_post = epost_base + ','.join(uid_list) + key
        docsearch_resp = requests.get(url_post)
        ### Set to blank because retmax and the query string don't apply to an ID list
        ret_max, query_str = '', ''
        ### Build the filename for the _fetch XML file
        file_name_fetch = ('output/xml/' + os.path.splitext(filename)[0]
                           + '_idlist_' + now.strftime("%y%m%d_%H%M") + '_fetch.xml')
    ### Otherwise the user wants to query with a search term/phrase
    else:
        query_str = query
        file_name_fetch = 'output/xml/' + query + '_' + now.strftime("%y-%m-%d-%H%M") + '_fetch.xml'
        esearch_base = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
        ### More DB options here: https://www.ncbi.nlm.nih.gov/books/NBK3837/
        db = '?db=pubmed'
        ### The term is passed through as-is, so it should already be URL-safe
        term = '&term=' + query
        hist_api = '&usehistory=y'
        ret_max = '&retmax=' + str(retmax)
        sort = '&sort=' + sort
        ### Run the search; usehistory=y stores the matching IDs on the history server
        url_search = esearch_base + db + term + hist_api + ret_max + sort + key
        docsearch_resp = requests.get(url_search)

    ### Parse the EPost/ESearch response for the history-server keys
    root_search = ET.fromstring(docsearch_resp.content)
    QK = '&query_key=' + root_search.findall('./QueryKey')[0].text
    WE = '&WebEnv=' + root_search.findall('./WebEnv')[0].text

    ### Get the abstracts with EFetch, reusing the stored result set
    efetch_base = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    rettype_mode = '&rettype=abstract&retmode=xml'
    url_ab = efetch_base + '?db=pubmed' + QK + WE + rettype_mode + ret_max + key
    docsab_resp = requests.get(url_ab)
    with open(file_name_fetch, 'wb') as f:
        f.write(docsab_resp.content)

    return [file_name_fetch, query_str]
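

# Example usage (a minimal sketch, not part of the original script; the query,
# retmax, and sort values below are illustrative assumptions):
if __name__ == "__main__":
    fetch_xml, query_used = get_article_ids(
        query="machine+learning+radiology",
        filename="",
        retmax=25,
        sort="relevance",
    )
    print("Abstract XML saved to:", fetch_xml)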