-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathXParsers.cpp
421 lines (359 loc) · 12.3 KB
/
XParsers.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
/*
* XParsers: service classes to support parsing of xml documents
* using standard DOM parsing tools
*
* Class implementation
* September 21, 2003
* Richard Jones
*
* Bundled with the error handler classes are two utility functions:
*
* 1. parseInputDocument - parser implemented using the old-style
* XercesDOMParser interface based on the example code in
* $XERCESCROOT/samples/DOMPrint
*
* 2. buildDOMDocument - parser implemented using the w3c standard
* DOMBuilder interface based on the example code in
* $XERCESCROOT/samples/DOMCount
*
* Implementation Notes:
* ---------------------
* To prevent memory leaks, each of these parsers only retains a single
* document in memory at a time. The next call will destroy the DOM
* tree created on the previous call and return the resources to the
* pool. To prevent this behavior, call the parser with the argument
* perm=true, in which case the resulting DOMDocument will persist for
* the rest of the lifetime of the program.
*
*
* Modification Notes:
* --------------------
* 11/7/2012 DL
* Added EntityResolver class to keep track of all of the XML files
* pulled in by the parser so an md5 checksum could be performed.
* results are written to a FORTRAN function called "md5geom" so the
* checksum can be accessed programmatically.
*
* 6/12/2012 DL
* Xerces 3 has done away with the DOMBuilder API, yet retains
* the DOMParser. It seems the code using the routines in this file
* looked to the pre-processor variable OLD_STYLE_XERCES_PARSER to
* decide whether to call parseInputDocument() or buildDOMDocument().
* The former being called if the variable was defined implying
* the former was likely to be deprecated. The simplest change that
* could be made to get this working with XERCES 3 was to turn the
* buildDOMDocument() routine into a wrapper for the parseInputDocument()
* routine. This is done below.
*
*/
#include <fstream>
using namespace std;
#include <xercesc/sax/SAXParseException.hpp>
#include <xercesc/parsers/XercesDOMParser.hpp>
#include <xercesc/framework/LocalFileFormatTarget.hpp>
#include "XParsers.hpp"
#include "XString.hpp"
#include "md5.h"
// Uncomment to have GetMD5_checksum() print each file name and its size
// while building the checksum (that code is guarded by "#if VERBOSE").
//#define VERBOSE 1
// Hexadecimal text of the checksum most recently computed by
// MyEntityResolver::GetMD5_checksum(); empty until that has been called.
std::string last_md5_checksum = "";
/*
* FIX_XERCES_getElementById_BUG does a store/load cycle at parsing time
* to fully instantiate entity references on the document tree.
* See xerces-c++ bug 12800 at http://nagoya.apache.org
*
* Within this file the macro is only tested inside the "#if 0" section
* of buildDOMDocument, i.e. in code that is currently compiled out.
*/
#define FIX_XERCES_getElementById_BUG true
// X(str): build a temporary XString from str and call unicode_str() on it
// (presumably the XMLCh* form Xerces expects -- confirm in XString.hpp).
// The temporary only lives until the end of the enclosing full expression.
#define X(str) XString(str).unicode_str()
// S(str): shorthand for str.c_str(). The argument is not parenthesised,
// so pass a plain identifier rather than a compound expression.
#define S(str) str.c_str()
xercesc::DOMDocument* parseInputDocument(const XString& xmlFile, bool keep)
{
   /// Parse xmlFile with a XercesDOMParser and return the resulting DOM
   /// document, or 0 if an exception was thrown or the error handler
   /// recorded an error / fatal error.
   ///
   /// keep = true  : a new parser is allocated for this call. On success
   ///                it is intentionally never deleted, so the document
   ///                it holds persists for the rest of the program.
   /// keep = false : one function-static parser is created on first use
   ///                and reused by every later keep=false call.
   ///                NOTE(review): the file header says the previous
   ///                tree is destroyed on the next call, but no
   ///                resetDocumentPool() call is made here -- confirm
   ///                whether old trees are actually released.
   ///
   /// The parser is configured with Val_Auto validation, namespace and
   /// schema processing, full schema checking, and no entity reference
   /// nodes. Every external entity opened during the parse is recorded by
   /// a MyEntityResolver; after a parse that does not throw, its
   /// GetMD5_checksum() is called, which updates last_md5_checksum.

   static xercesc::XercesDOMParser* scratchParser = 0;

   xercesc::XercesDOMParser* parser;
   if (keep)
   {
      parser = new xercesc::XercesDOMParser;
   }
   else
   {
      if (scratchParser == 0)
      {
         scratchParser = new xercesc::XercesDOMParser;
      }
      parser = scratchParser;
   }

   MyEntityResolver myEntityResolver(xmlFile);
   parser->setValidationScheme(xercesc::XercesDOMParser::Val_Auto);
   parser->setCreateEntityReferenceNodes(false);
   parser->setValidationSchemaFullChecking(true);
   parser->setDoNamespaces(true);
   parser->setDoSchema(true);
   parser->setEntityResolver(&myEntityResolver);

   MyOwnErrorHandler errorHandler;
   parser->setErrorHandler(&errorHandler);

   try
   {
      parser->parse(xmlFile.c_str());
      myEntityResolver.GetMD5_checksum();
   }
   catch (const xercesc::XMLException& toCatch)
   {
      // getMessage() yields XMLCh text; go through XString so the message
      // itself is printed rather than a pointer value.
      std::cerr
           << "\nparseInputDocument: Error during parsing: '" << xmlFile
           << "'\n" << "Exception message is:  \n"
           << XString(toCatch.getMessage()).c_str() << "\n" << std::endl;
      if (keep)
      {
         // No document is handed back, so a parser created only for this
         // call would otherwise be unreachable and leaked.
         delete parser;
      }
      return 0;
   }
   catch (const xercesc::DOMException& toCatch)
   {
      // msg is XMLCh text as well; transcode it for the same reason.
      std::cerr
           << "\nXParsers: Error during parsing: '" << xmlFile << "'\n"
           << "Exception message is:  \n"
           << XString(toCatch.msg).c_str() << "\n" << std::endl;
      // The parser is deliberately not deleted on this path: Xerces is
      // shut down here, so its objects are left alone afterwards.
      xercesc::XMLPlatformUtils::Terminate();
      return 0;
   }
   catch (...)
   {
      std::cerr
           << "\nparseInputDocument: Unexpected exception during parsing: '"
           << xmlFile << "'\n";
      // As above, nothing is released after Xerces has been terminated.
      xercesc::XMLPlatformUtils::Terminate();
      return 0;
   }

   if (errorHandler.getSawErrors())
   {
      std::cerr << "\nErrors occured, no output available\n" << std::endl;
      if (keep)
      {
         // Same reasoning as in the XMLException handler above.
         delete parser;
      }
      return 0;
   }

   return parser->getDocument();
}
xercesc::DOMDocument* buildDOMDocument(const XString& xmlFile, bool keep)
{
/// Compatibility entry point: forwards both arguments unchanged to
/// parseInputDocument() and returns whatever that call returns.
return parseInputDocument(xmlFile, keep);
// Everything from here to the matching #endif is compiled out. It is the
// former DOMBuilder-based implementation, kept for reference only.
#if 0 // below no longer works in XERCES 3
xercesc::DOMImplementation *impl =
xercesc:: DOMImplementationRegistry::getDOMImplementation(X("LS"));
static xercesc::DOMBuilder* scratchBuilder=0;
xercesc::DOMBuilder* builder;
// keep=true gets a builder of its own; otherwise one static builder is
// created on first use and shared by later calls.
if (keep)
{
builder = ((xercesc::DOMImplementationLS*)impl)->createDOMBuilder(
xercesc::DOMImplementationLS::MODE_SYNCHRONOUS, 0);
}
else if (scratchBuilder == 0)
{
builder = scratchBuilder = ((xercesc::DOMImplementationLS*)impl)->
createDOMBuilder(xercesc::DOMImplementationLS::MODE_SYNCHRONOUS,
0);
}
else
{
builder = scratchBuilder;
}
// Name of the scratch file used by the store/load cycle below.
XString tmpFileS = ".tmp-"+xmlFile.basename();
builder->setFeature(xercesc::XMLUni::fgDOMValidation, true);
builder->setFeature(xercesc::XMLUni::fgDOMNamespaces, true);
builder->setFeature(xercesc::XMLUni::fgDOMDatatypeNormalization, true);
builder->setFeature(xercesc::XMLUni::fgDOMEntities, false);
builder->setFeature(xercesc::XMLUni::fgXercesSchemaFullChecking, true);
builder->setFeature(xercesc::XMLUni::fgXercesSchema, true);
MyDOMErrorHandler errHandler;
builder->setErrorHandler(&errHandler);
xercesc::DOMDocument* doc = 0;
try {
builder->resetDocumentPool();
doc = builder->parseURI(xmlFile.c_str());
#if defined FIX_XERCES_getElementById_BUG
// Store/load cycle: write the parsed tree to the scratch file, then parse
// that file again (see the comment on FIX_XERCES_getElementById_BUG).
xercesc::DOMWriter* writer = ((xercesc::DOMImplementationLS*)impl)->
createDOMWriter();
xercesc::LocalFileFormatTarget* lfft =
new xercesc::LocalFileFormatTarget(X(tmpFileS));
writer->writeNode(lfft,*(doc->getDocumentElement()));
delete lfft;
delete writer;
builder->resetDocumentPool();
doc = builder->parseURI(X(tmpFileS));
#endif
}
catch (const xercesc::XMLException& toCatch) {
std::cout << "Exception message is: \n" << toCatch.getMessage() << "\n";
return 0;
}
catch (const xercesc::DOMException& toCatch) {
std::cout << "Exception message is: \n" << toCatch.msg << "\n";
return 0;
}
catch (...) {
std::cout << "Unexpected Exception \n" ;
return 0;
}
if (errHandler.getSawErrors())
{
std::cerr << "\nErrors occured, no output available\n" << std::endl;
return 0;
}
return doc;
#endif // 0
}
MyOwnErrorHandler::MyOwnErrorHandler()
 : fSawErrors(false)
{
   /// A new handler starts out with no errors recorded.
}

MyOwnErrorHandler::~MyOwnErrorHandler()
{
   /// Nothing to clean up.
}
// Overrides of the SAX ErrorHandler interface
// Common formatter used by error(), fatalError() and warning(): prints
// the given heading followed by the system id, line, column and message
// carried by the exception to std::cerr.
static void reportSAXParseException(const char* heading,
                                    const xercesc::SAXParseException& e)
{
   const XString where(e.getSystemId());
   const XString what(e.getMessage());
   std::cerr
        << heading << where.c_str()
        << ", line " << e.getLineNumber()
        << ", char " << e.getColumnNumber()
        << "\n  Message: " << what.c_str() << std::endl;
}

void MyOwnErrorHandler::error(const xercesc::SAXParseException& e)
{
   /// Recoverable error: remember that one occurred and report it.
   fSawErrors = true;
   reportSAXParseException("\nparseInputDocument: Error at file ", e);
}

void MyOwnErrorHandler::fatalError(const xercesc::SAXParseException& e)
{
   /// Fatal error: remember that one occurred and report it.
   fSawErrors = true;
   reportSAXParseException("\nparseInputDocument: Fatal Error at file ", e);
}

void MyOwnErrorHandler::warning(const xercesc::SAXParseException& e)
{
   /// Warning: reported only; the saw-errors flag is left untouched.
   reportSAXParseException("\nparseInputDocument: Warning at file ", e);
}
void MyOwnErrorHandler::resetErrors()
{
}
MyDOMErrorHandler::MyDOMErrorHandler()
 : fSawErrors(false)
{
   /// A new handler starts out with no errors recorded.
}

MyDOMErrorHandler::~MyDOMErrorHandler()
{
   /// Nothing to clean up.
}
// MyDOMHandlers: Overrides of the DOM ErrorHandler interface
bool MyDOMErrorHandler::handleError(const xercesc::DOMError& domError)
{
   /// Report a DOM error on std::cerr and record that one was seen.
   /// The flag is raised for every severity, warnings included, and the
   /// function always returns true.
   fSawErrors = true;

   // Heading depends on severity; anything that is neither a warning
   // nor an error is reported as fatal.
   switch (domError.getSeverity())
   {
    case xercesc::DOMError::DOM_SEVERITY_WARNING:
      std::cerr << "\nWarning at file ";
      break;
    case xercesc::DOMError::DOM_SEVERITY_ERROR:
      std::cerr << "\nError at file ";
      break;
    default:
      std::cerr << "\nFatal Error at file ";
      break;
   }

   // Location of the problem, then the message text.
   std::cerr << XString(domError.getLocation()->getURI()).c_str();
   std::cerr << ", line " << domError.getLocation()->getLineNumber();
   std::cerr << ", char " << domError.getLocation()->getColumnNumber();
   std::cerr << "\n  Message: " << XString(domError.getMessage()).c_str()
             << std::endl;

   return true;
}
void MyDOMErrorHandler::resetErrors()
{
   /// Clear the saw-errors flag so the handler can be reused.
   fSawErrors = false;
}
//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
//----------------------------------
// MyEntityResolver (constructor)
//----------------------------------
MyEntityResolver::MyEntityResolver(const XString& xmlFile)
{
   /// Start the file list with the top-level document and remember the
   /// directory part of its name (everything up to and including the
   /// last '/'). If the name contains no '/', path is left as it was.
   xml_filenames.push_back(xmlFile);

   const std::string topLevelName = xmlFile;
   const std::string::size_type lastSlash = topLevelName.rfind('/');
   if (lastSlash == std::string::npos)
   {
      return;
   }

   // Keep the trailing slash so file names can be appended directly.
   path = topLevelName.substr(0, lastSlash + 1);
}
//----------------------------------
// MyEntityResolver (destructor)
//----------------------------------
MyEntityResolver::~MyEntityResolver()
{
   /// Nothing to clean up.
}
//----------------------------------
// resolveEntity
//----------------------------------
xercesc::InputSource* MyEntityResolver::resolveEntity(const XMLCh* const publicId, const XMLCh* const systemId)
{
   /// This method gets called from the xerces parser each time it
   /// opens a file (except for the top-level file). For each of these,
   /// record the name of the file being opened, then just return NULL
   /// to have xerces handle opening the file in the normal way.
   ///
   /// Only the systemId is used to build the recorded name:
   ///  - a null or empty systemId records nothing (previously the bare
   ///    directory "path" was pushed onto the list as if it were a file);
   ///  - a systemId starting with '/' is recorded as-is;
   ///  - anything else is recorded relative to the directory of the
   ///    top-level document ("path").
   /// NOTE(review): URL-style system ids (e.g. "file://...") are still
   /// treated as relative names -- confirm whether they can occur.

   (void)publicId; // not needed to locate the file

   // Convert the XMLCh string into a std::string, releasing the
   // transcoded buffer as soon as it has been copied.
   std::string my_systemId;
   if (systemId)
   {
      char* my_systemId_ptr = xercesc::XMLString::transcode(systemId);
      if (my_systemId_ptr)
      {
         my_systemId = my_systemId_ptr;
         xercesc::XMLString::release(&my_systemId_ptr);
      }
   }

   if (!my_systemId.empty())
   {
      if (my_systemId[0] == '/')
      {
         // Already absolute: prefixing "path" would give a name that
         // does not exist, silently dropping the file from the checksum.
         xml_filenames.push_back(my_systemId);
      }
      else
      {
         xml_filenames.push_back(path + my_systemId);
      }
   }

   return NULL; // have xerces handle this using its defaults
}
//----------------------------------
// GetXMLFilenames
//----------------------------------
std::vector<std::string> MyEntityResolver::GetXMLFilenames(void)
{
   /// Return a copy of the list of file names recorded so far (the
   /// top-level document first, then each entity in the order seen).
   std::vector<std::string> names(xml_filenames);
   return names;
}
//----------------------------------
// GetMD5_checksum
//----------------------------------
std::string MyEntityResolver::GetMD5_checksum(void)
{
/// This will calculate an MD5 checksum using all of the files currently
/// in the list of XML files. To do this, it opens each file and reads it
/// in, in its entirety, updating the checksum as it goes. The checksum is
/// returned as a hexadecimal string.
md5_state_t pms;
md5_init(&pms);
for(unsigned int i=0; i<xml_filenames.size(); i++){
//std::cerr<<".... Adding file to MD5 checksum : " << xml_filenames[i] << std::endl;
ifstream ifs(xml_filenames[i].c_str());
if(!ifs.is_open())continue;
// get length of file:
ifs.seekg (0, ios::end);
unsigned int length = ifs.tellg();
ifs.seekg (0, ios::beg);
#if VERBOSE
// print summary
std::cout << xml_filenames[i] << " (" << length << " bytes)" << std::endl;
#endif
// allocate memory:
char *buff = new char [length];
// read data as a block:
ifs.read (buff,length);
ifs.close();
md5_append(&pms, (const md5_byte_t *)buff, length);
delete[] buff;
//std::cerr<<".... Adding file to MD5 checksum : " << xml_filenames[i] << " (size=" << length << ")" << std::endl;
}
md5_byte_t digest[16];
md5_finish(&pms, digest);
char hex_output[16*2 + 1];
for(int di = 0; di < 16; ++di) sprintf(hex_output + di * 2, "%02x", digest[di]);
return last_md5_checksum = hex_output;
}