-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathxml2bib.py
107 lines (89 loc) · 3.03 KB
/
xml2bib.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
#!/usr/bin/env python
# coding: utf-8
from __future__ import print_function
import sys
import xml.etree.ElementTree as ET
"""
Need to convert some accented Unicode characters to their LaTeX forms.
Note the big X, not a chi, in the class name BibTeXWriter.
It is possible that {, " or $ need escaping with backslash ('\').
"""
class BibTeXWriter():
def __init__(self, tag, data, entry_type='Misc', encoding='UTF-8'):
self.entry_type = entry_type
self.tag = tag
self.data = data
self.enc = encoding
def __str__(self):
# LaTex makes accented chars differently.
replacements = {
u'ä': u'\\"{a}', u'Ä': u'\"{A}',
u'ö': u'\\"{o}', u'Ö': u'\"{O}',
u'ü': u'\\"{u}', u'Ü': u'\"{U}',
}
lines = []
for k in sorted(self.data):
v = self.data[k]
if v is None:
continue
try:
value = int(v)
except ValueError:
for x in replacements:
v = v.replace(x, replacements[x])
value = '{' + v.encode(self.enc) + '}'
lines.append(' {} = {}'.format(k, value))
return '@{0}{{{1},\n'.format(self.entry_type, self.tag,) + \
',\n'.join(lines) + '\n}'
class XMLReader():
def __init__(self, filename):
try:
tree = ET.parse(filename)
except ET.ParseError:
print('ParseError while processing', filename, file=sys.stderr)
raise
self.root = tree.getroot()
def xml2dict(x):
"""
Keys of the dictionary are the fields conventionally used by bibtex.
"""
r = x.root
#print('DEBUG', r.attrib)
xmlfields = ('creator', 'identifier', 'title', 'publisher',
'publicationYear', 'resourceType')
#nsd = {'dc': 'http://datacite.org/schema/kernel-4'}
ns = '{http://datacite.org/schema/kernel-4}'
dc = dict.fromkeys(xmlfields)
for field in xmlfields:
#for n in r.findall('dc:' + field, nsd):
for n in r.iter(ns + field):
v = n.text
if field == 'creator':
v = n.find(ns + 'creatorName').text
v = v.strip()
#print('Found', field, v)
if n is not None:
if dc[field] is None:
dc[field] = v
else:
dc[field] = dc[field] + ' and ' + v
#print('DEBUG dc=', dc)
d = dict()
dc_to_bib = {'author': 'creator',
'year': 'publicationYear',
'DOI': 'identifier',
'howpublished': 'resourceType',
'publisher': 'publisher',
'title': 'title',
}
for x in dc_to_bib:
d[x] = dc[dc_to_bib[x]]
return d
if __name__ == '__main__':
# Process all input files given on command line
for k in range(1, len(sys.argv)):
f = sys.argv[k]
x = XMLReader(f)
y = xml2dict(x)
print(str(BibTeXWriter('ref%i' % (k,), y)))
print()