-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathpubmed_scraper.py
executable file
·228 lines (190 loc) · 10 KB
/
pubmed_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
import pandas as pd
import xml.etree.ElementTree as ET
import requests
import datetime
def pubmed_xml_parse(filename):
    """Convert a PubMed XML export into a multi-sheet Excel workbook.

    Every ``PubmedArticle`` and ``PubmedBookArticle`` directly under the root
    element of *filename* becomes one row of the "Master Table" sheet.  The
    repeated fields (authors, keywords, article IDs, abstracts, publication
    types, MeSH headings) are also written in long format, one sheet each.

    Journal impact factors are read from ``data/journal_impact_factors.csv``
    (relative to the working directory) and matched on the full journal title,
    ignoring case.  If that file cannot be read, impact factors are left
    blank.

    Args:
        filename: Path of the XML file to parse.  The workbook name is derived
            from it (see the comment where ``xlsx_file_name`` is built).

    Returns:
        A two-line summary string containing the workbook path and the number
        of records written to the master table.

    Raises:
        xml.etree.ElementTree.ParseError: If *filename* is not well-formed XML.
        AttributeError: If a record has no ``PMID`` element.
    """
    root = ET.parse(filename).getroot()

    ### Each list collects the rows of one output sheet
    master_rows = []
    pub_type_rows = []
    abstract_rows = []
    art_id_rows = []
    keyword_rows = []
    author_rows = []
    mesh_rows = []

    impact_lookup = _load_impact_factors('data/journal_impact_factors.csv')

    ### Journal articles
    for article in root.findall('./PubmedArticle'):
        pmid = article.find('./MedlineCitation/PMID').text
        link_str = 'https://www.ncbi.nlm.nih.gov/pubmed/' + pmid
        art_pubdate = _pubmed_pubdate(article, './PubmedData/History/PubMedPubDate')

        ### Reset per record so a missing title never inherits the previous one
        title_text = None
        for title in article.findall('./MedlineCitation/Article/ArticleTitle'):
            title_text = ' '.join(title.itertext())

        pub_type_list = []
        for pub_type in article.findall('./MedlineCitation/Article/PublicationTypeList/PublicationType'):
            pub_type_list.append(pub_type.text)
            pub_type_rows.append([pmid, title_text, pub_type.text])

        journal_list = []
        for journal in article.findall('./MedlineCitation/Article/Journal'):
            journal_list = _journal_info(journal, impact_lookup)

        abstract_list = []
        for abstract in article.findall('./MedlineCitation/Article/Abstract/AbstractText'):
            abstract_type = abstract.get('Label')
            if abstract_type is None:
                abstract_type = 'No Abstract Type Label'
            ### NOTE(review): `.text` is only the text before the first child
            ### element, so abstracts with inline markup are cut short here.
            abstract_text = abstract.text
            abstract_list.append([abstract_type, abstract_text])
            abstract_rows.append([pmid, title_text, abstract_type, abstract_text])

        author_list = []
        for author in article.findall('./MedlineCitation/Article/AuthorList/Author'):
            author_text = _author_name(author)
            author_list.append([author_text])
            author_rows.append([pmid, title_text, author_text])

        artID_list = _collect_article_ids(
            article, './PubmedData/ArticleIdList/ArticleId', pmid, title_text, art_id_rows)

        uni_mesh_dict = {}
        for heading in article.findall('./MedlineCitation/MeshHeadingList/MeshHeading'):
            descriptor = heading.findtext('./DescriptorName')
            ### '-' marks the row for the descriptor itself (no qualifier)
            mesh_rows.append([pmid, title_text, '-', descriptor])
            qualifiers = [qual.text for qual in heading.findall('./QualifierName')]
            mesh_rows.extend([pmid, title_text, qual, descriptor] for qual in qualifiers)
            uni_mesh_dict[descriptor] = qualifiers

        keyword_list = []
        for keyword_elem in article.findall('./MedlineCitation/KeywordList/Keyword'):
            keyword_list.append(keyword_elem.text)
            keyword_rows.append([pmid, title_text, keyword_elem.text])

        master_rows.append([title_text, pmid, pub_type_list, journal_list, author_list,
                            abstract_list, keyword_list, uni_mesh_dict, art_pubdate,
                            artID_list, link_str])

    ### Book articles
    for book in root.findall('./PubmedBookArticle'):
        pmid = book.find('./BookDocument/PMID').text
        link_str = 'https://www.ncbi.nlm.nih.gov/pubmed/' + pmid
        art_pubdate = _pubmed_pubdate(book, './PubmedBookData/History/PubMedPubDate')

        book_title = None
        for book_elem in book.findall('./BookDocument/Book'):
            book_title = book_elem.findtext('BookTitle')

        pub_type_list = []
        for pub_type in book.findall('./BookDocument/PublicationType'):
            pub_type_list.append(pub_type.text)
            pub_type_rows.append([pmid, book_title, pub_type.text])

        abstract_list = []
        for abstract in book.findall('./BookDocument/Abstract/AbstractText'):
            abstract_type = abstract.get('Label')
            if abstract_type is None:
                abstract_type = ''
            abstract_list.append([abstract_type, abstract.text])
            abstract_rows.append([pmid, book_title, abstract_type, abstract.text])

        ### Authors are read from this book record, with the same name rules
        ### as for journal articles.
        author_list = []
        for author in book.findall('./BookDocument/AuthorList/Author'):
            author_text = _author_name(author)
            author_list.append([author_text])
            author_rows.append([pmid, book_title, author_text])

        artID_list = _collect_article_ids(
            book, './PubmedBookData/ArticleIdList/ArticleId', pmid, book_title, art_id_rows)

        keyword_list = []
        for keyword_elem in book.findall('./BookDocument/KeywordList/Keyword'):
            keyword_list.append(keyword_elem.text)
            keyword_rows.append([pmid, book_title, keyword_elem.text])

        master_rows.append([book_title, pmid, pub_type_list, 'Book', author_list,
                            abstract_list, keyword_list, 'No mesh for books', art_pubdate,
                            artID_list, link_str])

    ### Master DF creation
    master_df = pd.DataFrame(master_rows, columns=['title', 'pmid', 'pub_type_list', 'journal_info_list', 'author_list', 'abstract_list', 'keyword_list', 'mesh list', 'pubdate', 'artid_list', 'link'])
    ### Individual sheets with data
    kw_df = pd.DataFrame(keyword_rows, columns=['pmid', 'title', 'keyword'])
    artid_df = pd.DataFrame(art_id_rows, columns=['pmid', 'title', 'type', 'ID'])
    abs_df = pd.DataFrame(abstract_rows, columns=['pmid', 'title', 'type', 'abstract'])
    pubt_df = pd.DataFrame(pub_type_rows, columns=['pmid', 'title', 'pub_type'])
    mesh_df = pd.DataFrame(mesh_rows, columns=['pmid', 'title', 'qual', 'desc'])
    author_df = pd.DataFrame(author_rows, columns=['pmid', 'title', 'author name'])

    ### Workbook name: drop any '/xml' path fragment and the last 10 characters
    ### of the input name, then append the record count.
    ### NOTE(review): the fixed 10-character cut presumably matches the naming
    ### scheme of the files callers pass in -- confirm.
    xlsx_file_name = filename.replace('/xml', '')[:-10] + '_' + str(len(master_df)) + 'res' + '.xlsx'

    ### The context manager saves and closes the workbook even if a sheet fails
    with pd.ExcelWriter(xlsx_file_name) as writer:
        master_df.to_excel(writer, sheet_name='Master Table')
        author_df.to_excel(writer, sheet_name='Author List (Long)')
        kw_df.to_excel(writer, sheet_name='Keyword List (Long)')
        artid_df.to_excel(writer, sheet_name='Article ID List (Long)')
        abs_df.to_excel(writer, sheet_name='Abstract List (Long)')
        pubt_df.to_excel(writer, sheet_name='Pubtype List (Long)')
        mesh_df.to_excel(writer, sheet_name='MeSH Keyword List (Long)')

    return '\tFile name: ' + xlsx_file_name + '\n\tLength of result: ' + str(len(master_df))


def _load_impact_factors(csv_path):
    """Return a ``{case-folded journal title: impact factor}`` dict from *csv_path*.

    Rows with any missing value are dropped.  When a title occurs more than
    once the first row wins.  An unreadable or unparsable file yields an empty
    dict so that parsing can continue without impact factors.
    """
    if_cols = ['rank', 'full_journal_title', 'total_cites', 'journal_impact_factor', 'eigenfactor_score']
    try:
        table = pd.read_csv(csv_path, sep=',', header=None, encoding="latin-1", names=if_cols)
    except (OSError, ValueError):
        ### Missing file, empty file, parse error or decode error
        return {}
    table = table.dropna()
    lookup = {}
    for title, factor in zip(table['full_journal_title'], table['journal_impact_factor']):
        lookup.setdefault(str(title).casefold(), factor)
    return lookup


def _pubmed_pubdate(record, history_path):
    """Return ``'month/year'`` of the ``PubStatus="pubmed"`` entry, or ``None``.

    *history_path* selects the ``PubMedPubDate`` elements below *record*.  If
    several entries qualify, the last one is used.
    """
    pubdate = None
    for stamp in record.findall(history_path):
        if stamp.get('PubStatus') != 'pubmed':
            continue
        year = stamp.findtext('./Year')
        month = stamp.findtext('./Month')
        if year and month:
            pubdate = month + '/' + year
    return pubdate


def _journal_info(journal, impact_lookup):
    """Return ``[title, issn, issn_type, abbreviation, impact_factor]`` for *journal*.

    Missing child elements give ``None`` (``''`` for the impact factor), so a
    journal without an ISSN still gets its title, abbreviation and impact
    factor.
    """
    title = journal.findtext('Title')
    abbreviation = journal.findtext('ISOAbbreviation')
    issn_elem = journal.find('ISSN')
    if issn_elem is None:
        issn = None
        issn_type = None
    else:
        issn = issn_elem.text
        issn_type = issn_elem.get('IssnType')
    impact_factor = impact_lookup.get(title.casefold(), '') if title else ''
    return [title, issn, issn_type, abbreviation, impact_factor]


def _author_name(author):
    """Return a display name for an ``Author`` element.

    Uses "ForeName LastName" when both are present, otherwise the
    ``CollectiveName``, otherwise the bare ``LastName``.  Returns ``'error'``
    when none of these is available.
    """
    first_name = author.findtext('./ForeName')
    last_name = author.findtext('./LastName')
    if first_name and last_name:
        return first_name + ' ' + last_name
    collective = author.findtext('./CollectiveName')
    if collective:
        return collective
    if last_name:
        return last_name
    return 'error'


def _collect_article_ids(record, id_path, pmid, title, sheet_rows):
    """Return ``[[id_type, id_text], ...]`` for *record*, skipping the PubMed ID.

    Each kept identifier is also appended to *sheet_rows* as
    ``[pmid, title, id_type, id_text]``.
    """
    collected = []
    for art_id in record.findall(id_path):
        id_type = art_id.get('IdType')
        if id_type == 'pubmed':
            ### The PMID already has its own column
            continue
        collected.append([id_type, art_id.text])
        sheet_rows.append([pmid, title, id_type, art_id.text])
    return collected