forked from gbif/rs.gbif.org
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsync-extensions.py
executable file
·183 lines (163 loc) · 6.04 KB
/
sync-extensions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
#!/usr/bin/env python
# ****************************************
# +++ Registry Updater for
# +++ Extensions and Vocabularies
# ****************************************
import sys, string, urllib, traceback, os, json, datetime
from xml.etree.ElementTree import ElementTree
# Filesystem location of the rs.gbif.org tree. Sub-paths are appended by plain
# string concatenation (e.g. RS_BASE+"extension/"), so the trailing slash matters.
RS_BASE="/var/www/rs.gbif.org/"
# XML namespace of the Dublin Core attributes read from each document root
# (URI, title, description, subject, issued).
NS_DC="http://purl.org/dc/terms/"
# XML namespace used to recognise a root <extension> element.
NS_EXT="http://rs.gbif.org/extension/"
# default issued date, used as the sort key for objects that have no issued date
MIN_DATE = datetime.date(datetime.MINYEAR, 1, 1)
class Extension:
    """Metadata describing one extension definition.

    Instances are serialised through ``__dict__`` by processUrls, so the
    attribute names below become the keys of the generated JSON.
    """

    def __init__(self):
        # All descriptive fields start out unknown; they are filled in later
        # from the parsed XML document.
        for name in ("identifier", "url", "title", "description",
                     "subject", "issued"):
            setattr(self, name, None)
        # Set to True for the most recently issued object of an identifier.
        self.isLatest = False

    def __repr__(self):
        fields = (
            self.identifier,
            self.issued,
            self.isLatest,
            self.title,
            self.description,
            self.subject,
        )
        return "EXT %s Issued:%s (latest=%s) >>%s<< %s [%s]" % fields
class Vocabulary:
    """Metadata describing one vocabulary definition.

    Instances are serialised through ``__dict__`` by processUrls, so the
    attribute names below become the keys of the generated JSON.
    """

    def __init__(self):
        # All descriptive fields start out unknown; they are filled in later
        # from the parsed XML document.
        for name in ("identifier", "url", "title", "description",
                     "subject", "issued"):
            setattr(self, name, None)
        # Set to True for the most recently issued object of an identifier.
        self.isLatest = False

    def __repr__(self):
        fields = (
            self.identifier,
            self.issued,
            self.isLatest,
            self.title,
            self.description,
            self.subject,
        )
        # The trailing space is part of the original representation.
        return "VOC %s Issued:%s (latest=%s) >>%s<< %s [%s] " % fields
def writeExtensions(dir, urls):
    """Write the extension registry file ``extensions.json`` into a directory.

    dir  -- target directory path; the file name is appended by plain string
            concatenation, so it must end with a path separator
    urls -- URLs of the XML documents to describe in the file
    """
    # The with-block closes the file even when processUrls raises.
    with open(dir + 'extensions.json', 'w') as f:
        processUrls(f, urls, 'extensions')
def writeVocabs(dir, urls):
    """Write the vocabulary registry file ``vocabularies.json`` into a directory.

    dir  -- target directory path; the file name is appended by plain string
            concatenation, so it must end with a path separator
    urls -- URLs of the XML documents to describe in the file
    """
    # The with-block closes the file even when processUrls raises.
    with open(dir + 'vocabularies.json', 'w') as f:
        processUrls(f, urls, 'thesauri')
def processUrls(fp, urls, rootElement):
    """Retrieve a list of objects by their url, sort them by their issued
    date, update each object indicating if it is the latest issued or
    not, and write each object to the JSON file.

    fp          -- writable file object receiving the JSON text
    urls        -- URLs of the XML documents to download and parse
    rootElement -- name of the single top-level JSON key holding the array

    URLs that could not be parsed are left out of the output. Returns the
    list of parsed objects, newest issued date first.
    """
    fp.write('{"%s":[\n' % rootElement)
    allObjects = []
    for url in urls:
        print("Processing %s" % url)
        obj = parseUrl(url)
        if obj is None:
            # parseUrl has already reported the failure. Skipping the entry
            # keeps one broken document from aborting the run and leaving a
            # truncated JSON file behind.
            continue
        allObjects.append(obj)
    # sort by issued date, starting with newest dates
    allObjects.sort(key=getIssuedDate, reverse=True)
    # The first object seen for an identifier is the newest one because of
    # the sort order above; every later one is flagged as superseded.
    seenIdentifiers = set()
    for obj in allObjects:
        if obj.identifier is not None and obj.identifier not in seenIdentifiers:
            seenIdentifiers.add(obj.identifier)
            obj.isLatest = True
        else:
            print('The extension or vocabulary with URL %s issued %s is deprecated or superseded by one in production' % (obj.url, obj.issued))
    # write each object to the JSON file, comma-separated
    first = True
    for obj in allObjects:
        if not first:
            fp.write(',\n')
        json.dump(obj.__dict__, fp, default=json_serial)
        first = False
    fp.write('\n]}')
    return allObjects
def getIssuedDate(x):
    """Sort key: the object's issued date, or MIN_DATE when it has none."""
    issued = x.issued
    if issued:
        return issued
    return MIN_DATE
def json_serial(obj):
    """JSON serializer for objects not serializable by default json code.

    For datetime.date objects (including datetime.datetime), return the ISO
    format, e.g. yyyy-mm-dd.

    Raises TypeError for any other type, so that an unsupported value is
    reported instead of being silently written as null.
    """
    if isinstance(obj, datetime.date):
        return obj.isoformat()
    raise TypeError("Object of type %s is not JSON serializable" % type(obj).__name__)
def parseUrl(url):
    """Download the XML document at a given URL. Parse the XML and
    construct either an Extension or Vocabulary depending on the
    contents of the XML document.

    A root element named ``extension`` in the NS_EXT namespace yields an
    Extension identified by its ``rowType`` attribute; any other root yields
    a Vocabulary identified by its Dublin Core ``URI`` attribute.

    Returns the constructed object, or None when the document could not be
    downloaded or parsed (the error and traceback are printed to stdout).
    """
    try:
        # urlopen lives in urllib.request on Python 3 and in urllib on Python 2.
        try:
            from urllib.request import urlopen
        except ImportError:
            from urllib import urlopen
        f = urlopen(url)
        try:
            tree = ElementTree()
            tree.parse(f)
        finally:
            # Release the connection even when the XML is malformed.
            f.close()
        doc = tree.getroot()
        if doc.tag == "{%s}extension" % NS_EXT:
            obj = Extension()
            obj.identifier = doc.attrib.get('rowType')
        else:
            obj = Vocabulary()
            obj.identifier = doc.attrib.get('{%s}URI' % NS_DC)
        obj.url = url
        obj.title = doc.attrib.get('{%s}title' % NS_DC)
        obj.description = doc.attrib.get('{%s}description' % NS_DC)
        obj.subject = doc.attrib.get('{%s}subject' % NS_DC)
        # convert YYYY-MM-DD string date into datetime.date object
        strDate = doc.attrib.get('{%s}issued' % NS_DC)
        if strDate is not None:
            obj.issued = datetime.datetime.strptime(strDate, "%Y-%m-%d").date()
        return obj
    except Exception:
        # Exception (not a bare except) lets Ctrl-C and sys.exit() still
        # terminate the script instead of being reported as a parse failure.
        print("Oops, cant parse url %s" % url)
        print('-' * 60)
        traceback.print_exc(file=sys.stdout)
        print('-' * 60)
        return None
def listExtensions(basedir, baseurl):
    """Recursively collect the URLs of all extension XML files below a directory.

    basedir -- directory to scan
    baseurl -- URL that corresponds to basedir; file and sub-directory names
               are appended to it

    Entries whose name starts with "." are skipped. Files count as XML when
    their name ends with ".xml" in any letter case. Names are visited in
    sorted order so the result is deterministic. Returns a list of URLs.
    """
    urls = []
    print("WALK DIR " + basedir)
    # Tolerate a base URL given without its trailing slash.
    if baseurl and not baseurl.endswith("/"):
        baseurl += "/"
    # os.listdir returns names in arbitrary order; sort for stable output.
    for fn in sorted(os.listdir(basedir)):
        if fn.startswith("."):
            continue
        p = os.path.join(basedir, fn)
        if os.path.isdir(p):
            # Joining with "" appends the separator, so the recursion works
            # whether or not basedir itself ended with one.
            urls.extend(listExtensions(os.path.join(p, ""), baseurl + fn + "/"))
        elif fn.lower().endswith(".xml"):
            url = baseurl + fn
            print(" found extension at " + url)
            urls.append(url)
    return urls
def listExternal(basedir):
    """Load and return the parsed content of ``external.json`` in a directory.

    basedir -- directory path; the file name is appended by plain string
               concatenation, so it must end with a path separator

    The callers concatenate the result with lists of URLs, so the file
    presumably holds a JSON array of URL strings (not verified here).
    """
    # The with-block closes the file once it has been parsed.
    with open(basedir + "external.json") as fp:
        return json.load(fp)
def listVocabularies(basedir, baseurl):
    """Recursively collect the URLs of all vocabulary XML files below a directory.

    basedir -- directory to scan
    baseurl -- URL that corresponds to basedir; file and sub-directory names
               are appended to it

    Entries whose name starts with "." are skipped. Files count as XML when
    their name ends with ".xml" in any letter case. Names are visited in
    sorted order so the result is deterministic. Returns a list of URLs.
    """
    urls = []
    print("WALK DIR " + basedir)
    # Tolerate a base URL given without its trailing slash.
    if baseurl and not baseurl.endswith("/"):
        baseurl += "/"
    # os.listdir returns names in arbitrary order; sort for stable output.
    for fn in sorted(os.listdir(basedir)):
        if fn.startswith("."):
            continue
        p = os.path.join(basedir, fn)
        if os.path.isdir(p):
            # Joining with "" appends the separator, so the recursion works
            # whether or not basedir itself ended with one.
            urls.extend(listVocabularies(os.path.join(p, ""), baseurl + fn + "/"))
        elif fn.lower().endswith(".xml"):
            url = baseurl + fn
            print(" found vocabulary at " + url)
            urls.append(url)
    return urls
if __name__ == "__main__":
    # Regenerates four registry files: extensions and vocabularies, once for
    # production and once for the sandbox. The sandbox files list the
    # production URLs as well as the sandbox-only ones.
    print('LOCATED RS.GBIF.ORG FILESYSTEM AT: ' + RS_BASE)

    # production extensions: core + extension directories + external URLs
    print('UPDATE PRODUCTION EXTENSION FILE')
    externalProd = listExternal(RS_BASE + "extension/")
    urlsCore = listExtensions(RS_BASE + "core/",
                              "http://rs.gbif.org/core/")
    urlsExt = listExtensions(RS_BASE + "extension/",
                             "http://rs.gbif.org/extension/")
    productionUrls = urlsCore + urlsExt + externalProd
    writeExtensions(RS_BASE, productionUrls)

    # production vocabularies
    print('UPDATE PRODUCTION VOCABULARY FILE')
    urlsVoc = listVocabularies(RS_BASE + "vocabulary/",
                               "http://rs.gbif.org/vocabulary/")
    writeVocabs(RS_BASE, urlsVoc)

    # sandbox extensions: production URLs plus the sandbox-only ones
    print('UPDATE SANDBOX EXTENSION FILE')
    externalDev = listExternal(RS_BASE + "sandbox/extension/")
    urlsSandbox = listExtensions(RS_BASE + "sandbox/extension/",
                                 "http://rs.gbif.org/sandbox/extension/")
    urlsSandboxCore = listExtensions(RS_BASE + "sandbox/core/",
                                     "http://rs.gbif.org/sandbox/core/")
    sandboxUrls = (urlsCore + urlsExt + urlsSandbox
                   + externalProd + externalDev + urlsSandboxCore)
    writeExtensions(RS_BASE + 'sandbox/', sandboxUrls)

    # sandbox vocabularies: production URLs plus the sandbox-only ones
    print('UPDATE SANDBOX VOCABULARY FILE')
    urlsVoc2 = listVocabularies(RS_BASE + "sandbox/vocabulary/",
                                "http://rs.gbif.org/sandbox/vocabulary/")
    writeVocabs(RS_BASE + 'sandbox/', urlsVoc + urlsVoc2)