-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprj2.py
89 lines (85 loc) · 7.17 KB
/
prj2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import re
from time import sleep

import xlsxwriter
from selenium import webdriver
from selenium.webdriver.common.by import By
from tinydb import TinyDB
# Open the TinyDB store and create the output workbook with a single sheet.
# NOTE(review): `db` is not referenced again in the visible code.
db = TinyDB('db.json')
workbook = xlsxwriter.Workbook('result1.xlsx')
worksheet = workbook.add_worksheet()

# Captions for row 0, listed in left-to-right column order (columns 0-18).
# The spelling 'FA Abstarct' is reproduced exactly as the script emits it.
_header_captions = (
    'Publisher Name',
    'Journal Title',
    'Issn',
    'Volume',
    'Issue',
    'epublish',
    'Article Title',
    'Vernacular Title',
    'First Page',
    'Last Page',
    'ELocationID pii',
    'ELocationID doi',
    'Language',
    'Authors',
    'Publication Type',
    'received date',
    'Abstract',
    'FA Abstarct',
    'pdf link',
)
for _column, _caption in enumerate(_header_captions):
    worksheet.write_string(0, _column, _caption)

# Row 0 holds the captions; presumably `row` is the next free data row, but
# nothing in the visible code reads it -- TODO confirm.
row = 1
# Open the journal site in a Chrome session driven by Selenium.
url = 'http://dam.journal.art.ac.ir/'
driver = webdriver.Chrome()
driver.get(url)

# Click the anchor at this absolute XPath.  Its purpose is not evident from
# the code -- presumably it leads to the issue browser; TODO confirm.
# The locator form find_element(By..., ...) is used because the
# find_element_by_* helpers were removed in Selenium 4.3; the locator form is
# accepted by Selenium 3 as well.  click() returns None, so its result is not
# stored.
driver.find_element(By.XPATH, '/html/body/div[1]/div/a').click()

# Click every ".fa-plus" element, pausing one second after each click
# (presumably to let the expanded issue list render -- TODO confirm).
title = driver.find_elements(By.CSS_SELECTOR, '.fa-plus')
for toggle in title:
    toggle.click()
    sleep(1)

# Turn each issue anchor into the URL of that issue's XML export.  The issue
# code is the digit run captured by the pattern from the anchor's href.
links = driver.find_elements(By.CSS_SELECTOR, '.issue_dv a')
xml_link_list = []
for anchor in links:
    page_link = anchor.get_attribute('href')
    # get_attribute() yields None for an anchor without an href, and some
    # anchors may not carry an issue code at all; skip those instead of
    # aborting the whole run with TypeError / IndexError.
    code_match = re.search(r'.+_(\d+).', page_link or '')
    if code_match is None:
        continue
    xml_link_list.append(url + '?_action=xml&issue=' + code_match.group(1))
# Fragments blanked out (replaced by a single space) in both abstract
# variants.  They are applied strictly in the listed order, each replacement
# operating on the result of the previous one.
_SHARED_MARKUP = (
    '<span class="fontstyle0">',
    '<span style="font-family: TimesNewRomanPSMT; font-size: 10pt; color: #231f20; font-style: normal; font-variant: normal;">',
    '<em><span style="font-family: ARTUNIV-Italic;">',
    '<strong> </strong> ',
    '</span>',
    '<span style="font-family: ARTUNIV; font-size: xx-small;"><span style="font-family: ARTUNIV; font-size: xx-small;"> <span style="font-family: ARTUNIV;">',
    '</em><span style="font-family: ARTUNIV;">',
    '<sub>',
    '<em>',
    '<br style="font-style: normal; font-variant: normal; font-weight: normal; letter-spacing: normal; line-height: normal; orphans: 2; text-align: -webkit-auto; text-indent: 0px; text-transform: none; white-space: normal; widows: 2; word-spacing: 0px; -webkit-text-size-adjust: auto; -webkit-text-stroke-width: 0px;" />',
    '</sub>',
    ';/em>',
    '<strong>.</strong>',
    '</em>',
    '&',
    '<br class="Apple-interchange-newline" />',
    '<',
    ';span style="font-family: ARTUNIV;">»',
    ';span style="font-family: ARTUNIV; font-size: xx-small;"> ;span style="font-family: ARTUNIV; font-size: xx-small;"> ;span style="font-family: ARTUNIV;">',
)
# Full, ordered fragment lists for the <Abstract> and FA <OtherAbstract> text.
_ABSTRACT_MARKUP = (
    '<br /><strong> </strong>',
    '\n',
    'Abstract',
    '<br />',
) + _SHARED_MARKUP
_FA_ABSTRACT_MARKUP = (
    '<strong><em>',
    '</em></strong>',
    '\n',
    '<br />',
) + _SHARED_MARKUP + (
    ';sup>3 ;/sup>',
)


def _blank_out(text, fragments):
    """Return *text* with each of *fragments* replaced, in order, by a space."""
    for fragment in fragments:
        text = text.replace(fragment, ' ')
    return text


# Visit every issue's XML export and pull the fields out of the page source
# with regular expressions.  Each name below ends up holding a list with one
# entry per match found in that issue's page.
# NOTE(review): in the visible code these lists are never written to the
# worksheet and the workbook is never closed -- confirm whether that is
# handled elsewhere.
for xml_url in xml_link_list:
    driver.get(xml_url)
    source = driver.page_source

    # Single-line elements: everything between the opening tag and the last
    # '<' on the same line.
    PublisherName = re.findall(r'<PublisherName>(.+)<', source)
    JournalTitle = re.findall(r'<JournalTitle>(.+)<', source)
    Issn = re.findall(r'<Issn>(.+)<', source)
    Volume = re.findall(r'<Volume>(.+)<', source)
    Issue = re.findall(r'<Issue>(.+)<', source)

    # One number is captured from each of the three lines following the
    # "epublish" marker; the three are joined with spaces.
    epublish = [
        ' '.join(parts)
        for parts in re.findall(r'\"epublish\">\n.+>(\d+).*\n.+>(\d+).*\n.+>(\d+).*', source)
    ]

    # Titles may span several lines; the line breaks are removed.
    ArticleTitle = [
        text.replace('\n', '')
        for text in re.findall(r'<ArticleTitle>(.[\S\s]+?)</', source)
    ]
    VernacularTitle = [
        text.replace('\n', '')
        for text in re.findall(r'<VernacularTitle>(.[\S\s]+?)</', source)
    ]

    FirstPage = re.findall(r'<FirstPage>(\d+)</', source)
    LastPage = re.findall(r'<LastPage>(\d+)</', source)
    ELocationID_pii = re.findall(r'\"pii.+?(\d+)</', source)
    ELocationID_doi = re.findall(r'\"doi\">(.+)</', source)
    Language = re.findall(r'<Language>(.+)</', source)

    # Intermediate author data, one inner list per <AuthorList> element.
    # These four are stepping stones only; FN_LN_Aff is the value to use.
    AuthorList = re.findall(r'<AuthorList>[\s\S]+?</AuthorList>', source)
    FirstName = [re.findall(r'<FirstName>(.+)<', block) for block in AuthorList]
    LastName = [re.findall(r'<LastName>(.+)<', block) for block in AuthorList]
    Affiliation = [
        [
            affiliation.replace('\n', '')
            for affiliation in re.findall(r'<Affiliation>([\s\S]+?)</Affiliation>', block)
        ]
        for block in AuthorList
    ]

    # One string per author list: "First Last: Affiliation" entries joined
    # by " -- ".  zip() stops at the shortest of the three inner lists.
    FN_LN_Aff = [
        " -- ".join(
            "{} {}: {}".format(first, last, affiliation)
            for first, last, affiliation in zip(firsts, lasts, affiliations)
        )
        for firsts, lasts, affiliations in zip(FirstName, LastName, Affiliation)
    ]

    PublicationType = re.findall(r'<PublicationType>(.+)</', source)

    # Same three-line shape as epublish, anchored on the "received" marker.
    received_date = [
        ' '.join(parts)
        for parts in re.findall(r'\"received\">\n.+?(\d+).+\n.+?(\d+).+\n.+?(\d+).+', source)
    ]

    # Abstract bodies with the known markup fragments blanked out.
    Abstract = [
        _blank_out(text, _ABSTRACT_MARKUP)
        for text in re.findall(r'<Abstract>([\s\S]+?)</', source)
    ]
    FA_Abstract = [
        _blank_out(text, _FA_ABSTRACT_MARKUP)
        for text in re.findall(r'<OtherAbstract Language=\"FA\">([\s\S]+?)</', source)
    ]

    pdf_link = re.findall(r'pdf\">([\s\S]+?pdf)', source)

    print("#############################NEXT LINK#################################")
    # Pause between issue requests.
    sleep(2)