// turtle.js
//--------------------- about this script -------------------------
ParseTurtle.prototype.meta = {
	"@prefix": "<http://purl.org/net/ns/doas#>",
	"@about": "<http://www.kanzaki.com/parts/turtle.js>",
	a: ":JavaScript",
	title: "An experimental Turtle parser",
	shortdesc: "Parse Turtle as simple as possible. Not perfect.",
	created: "2006-02-22",
	release: {revision: "0.45.1", created: "2006-03-11"},
	author: {name: "KANZAKI, Masahide", homepage: "<http://www.kanzaki.com/>"},
	page: "<http://www.kanzaki.com/works/2006/misc/0308turtle.html>",
	license: "<http://creativecommons.org/licenses/LGPL/2.1/>"
};
//-----------------------------------------------------------------

/**
 * Constructor: ex) var turtle = new ParseTurtle;
 */
function ParseTurtle(){
	return this;
}

/**
 * Builds the prefix table a parse starts from: rdf:, xsd: and the
 * pseudo prefix "_:" that maps blank node labels to "genid:" ids.
 * @return fresh prefix -> namespace URI object (keys include the trailing ':')
 */
ParseTurtle.prototype.default_ns = function (){
	return {
		"rdf:":"http://www.w3.org/1999/02/22-rdf-syntax-ns#",
		"xsd:":"http://www.w3.org/2001/XMLSchema#",
		"_:":"genid:"
	};
};

/**
 * Receives RDF/Turtle string and returns array of triples
 * Usually, call this function to get RDF triples array from Turtle string:
 * ex) rdf = turtle.parse(turtle_string);
 * Resets this.triples, this.ns, this.bid and this.loopcount on every call.
 * @param str : Turtle string (null/undefined is treated as an empty document)
 * @return RDF triples as JSON array; each item is
 *   {subject, predicate, object, type, lang, datatype}.
 * @throws a string message (see parseError) on any syntax problem
 */
ParseTurtle.prototype.parse = function (str){
	this.triples = [];
	this.ns = this.default_ns();
	this.bid = 0;
	this.loopcount = 0; // number of tokens read (debug aid)
	var arg = {
		'line': (str == null) ? '' : String(str),
		'type': '',
		'lang': '',
		'datatype': ''
	};
	// status is one of s, p, o or et (end triple)
	this.parse_triple(arg, '', 's');
	if(arg.line) this.parseError("Unbalanced parse end: " + arg.line, arg);
	return this.triples;
};

/**
 * Parses string and generates JSON array (appends to this.triples).
 * Implemented as a small state machine over the tokens from get_token():
 *   s = expecting subject, pfx/ns = inside @prefix, p = predicate,
 *   pbn = predicate after a blank node subject, o = object, et = end of triple.
 * Recurses for '[ ... ]'; the recursive call starts in status 'p'.
 * @param arg : object reference of {line,type..} to return updated value
 * @param subj : subject uri or qname
 * @param status : starting (caller's) status of the token
 */
ParseTurtle.prototype.parse_triple = function (arg, subj, status){
	var token, syn, prop, obj, otype, isNil;
	// only the '[ ... ]' recursion starts in predicate state, so this tells
	// whether a closing ']' is legal for this invocation
	var nested = (status == 'p');
	while(arg.line){
		token = this.get_token(arg);
		if(arg.type == 'skip') continue;
		// punctuation counts only when the tokenizer says so; a literal
		// such as "[" must not be mistaken for blank node syntax
		syn = (arg.type == 'syntax') ? token : '';
		switch(status){
		case 's':
			if(arg.type == 'at'){
				if(token == '@prefix') status = 'pfx';
				else this.tokenError(token, "not a Turtle directive.", arg);
			}else if(syn == '['){
				subj = this.gen_bnodeid();
				this.parse_triple(arg, subj, 'p');
				status = 'pbn'; // predicate for bnode
			}else if(arg.type == 'syntax'){
				this.tokenError(token, "cannot have this token here (subject).", arg);
			}else if(arg.type == 'literal' || arg.type == 'number'){
				this.tokenError(token, "cannot have literal as subject.", arg);
			}else{
				subj = (arg.type == 'qname') ? this.qname2uri(token, arg) : token;
				status = 'p'; // next token should be predicate
			}
			break;
		case 'pfx':
			if(! /:$/.test(token)) this.tokenError(token, "prefix must end with ':'.", arg);
			prop = token;
			status = 'ns'; // next token should be ns uri
			break;
		case 'pbn':
			// '.' right after a bnode subject: next should be a subject
			if(syn == '.'){
				status = 's';
				break;
			}
			// otherwise treat exactly like a predicate (fall through)
		case 'p':
			if(syn == ']'){
				if(! nested) this.tokenError(token, "unbalanced ']'.", arg);
				return;
			}else if(arg.type == 'syntax'){
				this.tokenError(token, "cannot have this token here (predicate).", arg);
			}else if(arg.type == 'literal' || arg.type == 'number'){
				this.tokenError(token, "cannot have literal as predicate.", arg);
			}
			prop = token;
			if(arg.type == 'qname'){
				// only the bare keyword 'a' means rdf:type; <a> is a URI
				if(token == 'a') prop = 'rdf:type';
				if(token.substring(0,2) == '_:')
					this.tokenError(token, "cannot have bnode here (predicate).", arg);
				prop = this.qname2uri(prop, arg);
			}
			status = 'o'; // next token should be object
			break;
		case 'ns':
			if(arg.type != 'uri')
				this.tokenError(token, "namespace must be a URI reference.", arg);
			this.ns[prop] = token;
			status = 'et';
			break;
		case 'o':
			otype = ''; // empty: let add_triple derive it from arg.type
			if(syn == '['){
				obj = this.gen_bnodeid();
				this.parse_triple(arg, obj, 'p');
				arg.type = 'bnode';
				otype = 'resource';
			}else if(syn == '('){
				obj = this.gen_bnodeid();
				isNil = this.parse_collection(arg, obj);
				if(isNil) obj = isNil; // empty list: object is rdf:nil
				otype = 'resource';
			}else if(arg.type == 'syntax'){
				this.tokenError(token, "cannot have this token here (object).", arg);
			}else if(arg.type == 'qname'){
				obj = this.qname2uri(token, arg);
			}else{
				obj = token;
			}
			this.add_triple(subj, prop, obj, arg, otype);
			status = 'et';
			break;
		case 'et':
			switch(syn){
			case ';': status = 'p'; break;
			case ',': status = 'o'; break;
			case '.': status = 's'; break;
			case ']':
				if(! nested) this.tokenError(token, "unbalanced ']'.", arg);
				return; // exit from recursive call
			default:
				this.tokenError(token, "cannot have this token here (end of triple).", arg);
			}
			break;
		}
	}
	if(status != 's') this.parseError("Unexpected end status: " + status, arg);
};

/**
 * Parses RDF Collection, i.e. inside ( ... ) of Turtle, and emits the
 * rdf:first / rdf:rest chain. The bnode for the next list cell is allocated
 * ahead of time after every item, so one id is handed back (this.bid--)
 * when ')' shows that no further cell is needed.
 * @param arg : object reference of {line,type..} to return updated value
 * @param subj : subject bnode of the collection
 * @return rdf:nil URI if the collection was empty, '' otherwise
 */
ParseTurtle.prototype.parse_collection = function (arg, subj){
	var token, syn, obj, fnode, isNil;
	var RDFNS = this.ns['rdf:'];
	while(arg.line){
		token = this.get_token(arg);
		if(arg.type == 'skip') continue; // comments are not list items
		syn = (arg.type == 'syntax') ? token : '';
		if(syn == ')'){
			this.bid--; // didn't use the pre-allocated bnode, so decrement index
			if(! obj) return RDFNS + 'nil';
			this.add_triple(subj, RDFNS + 'rest', RDFNS + 'nil', arg, 'resource');
			return '';
		}
		if(obj){
			// link previous cell to the cell that will hold this item
			this.add_triple(subj, RDFNS + 'rest', obj, arg, 'resource');
			subj = obj;
		}
		if(syn == '['){
			fnode = this.gen_bnodeid();
			this.parse_triple(arg, fnode, 'p');
			arg.type = 'bnode';
		}else if(syn == '('){
			fnode = this.gen_bnodeid();
			isNil = this.parse_collection(arg, fnode);
			if(isNil) fnode = isNil;
			arg.type = 'bnode';
		}else if(arg.type == 'syntax'){
			this.tokenError(token, "not allowed in Collection.", arg);
		}else if(arg.type == 'qname'){
			fnode = this.qname2uri(token, arg);
		}else{
			fnode = token;
		}
		this.add_triple(subj, RDFNS + 'first', fnode, arg);
		obj = this.gen_bnodeid();
	}
	// input ran out before ')'
	this.matchError(')', 'Collection', arg);
};

/**
 * Adds resulting triple to array (this.triples) in the same manner as JimLey's RDF parser
 * @param s : triple's subject
 * @param p : triple's predicate
 * @param o : triple's object
 * @param arg : object reference of {line,type,lang,datatype}
 * @param type : explicit type of object (optional. usually get from arg.type).
 *   When given it always wins; otherwise 'literal' for literal/number
 *   tokens and 'resource' for everything else.
 */
ParseTurtle.prototype.add_triple = function (s, p, o, arg, type){
	if(! type)
		type = (arg.type == 'literal' || arg.type == 'number') ? 'literal' : 'resource';
	// arg.lang / arg.datatype describe the most recent literal token only;
	// never let them leak onto a resource object
	var isLiteral = (type == 'literal');
	this.triples.push({
		"subject": s,
		"predicate": p,
		"object": o,
		"type": type,
		"lang": isLiteral ? arg.lang : null,
		"datatype": isLiteral ? arg.datatype : null
	});
};

/**
 * Tokenize string into Turtle tokens. Trims line after tokenize.
 * Sets arg.type to one of: skip, literal, uri, at, datatype, variable,
 * syntax, qname, number; literal-ish tokens also set arg.lang/arg.datatype.
 * Every branch consumes at least one character or throws, so callers
 * looping on arg.line always terminate.
 * Notes on returned values:
 *  - newlines inside a """long string""" are returned as the two characters
 *    backslash + n, and \" is unescaped to "; other escapes are left as is.
 *  - a numeric token starting with '0' is returned as a JavaScript number
 *    (leading zeros dropped); every other token is returned as a string.
 * @param arg : object reference of {line,type} to return updated value
 * @return a Turtle token.
 */
ParseTurtle.prototype.get_token = function (arg){
	var m, val, bad;
	var line = arg.line.replace(/^\s+/, '');
	this.loopcount = (this.loopcount || 0) + 1;
	switch(line.charAt(0)){
	case '#': // rest of the line is comment
		arg.line = line.replace(/^#.*$/m, '');
		arg.type = 'skip';
		return '';
	case '': // nothing but whitespace was left
		arg.line = '';
		arg.type = 'skip';
		return '';
	case '"':
		if(line.substring(1,3) == '""'){ // longString
			if(line.substring(1,6) == '"""""'){
				arg.line = line.slice(6);
				val = '';
			}else{
				// anchored, and [\s\S] instead of '.', so the match can
				// span lines but can never start at a later string
				m = /^"""([\s\S]*?[^\\])"""/.exec(line);
				if(! m) this.matchError('"""', 'longString', arg);
				arg.line = line.slice(m[0].length);
				val = m[1].replace(/\\\"/g, '"').replace(/\n/g, '\\n');
			}
		}else if(line.charAt(1) == '"'){ // empty String
			arg.line = line.slice(2);
			val = '';
		}else{ // String: closing quote is the first one not preceded by '\'
			m = /^"(.*?[^\\])"/.exec(line);
			if(! m) this.matchError('"', "literal", arg);
			arg.line = line.slice(m[0].length);
			val = m[1];
		}
		this.get_literal_info(arg);
		arg.type = 'literal';
		return val;
	case '<':
		arg.type = 'uri';
		arg.lang = null;
		arg.datatype = null;
		m = /^<(.*?)>/.exec(line);
		if(! m) this.matchError(">", "URI", arg);
		arg.line = line.slice(m[0].length);
		return m[1]; // without '<' and '>'
	case '@':
		arg.type = 'at';
		m = /^\S+/.exec(line); // matches at least '@'
		arg.line = line.slice(m[0].length);
		return m[0];
	case '^':
		if(line.charAt(1) != '^') this.tokenError('^', "not allowed here.", arg);
		arg.type = 'datatype';
		arg.line = line.slice(2);
		return '^^';
	case '?':
		if(arg.q != 'sparql') this.tokenError('?', "not allowed here.", arg);
		arg.type = 'variable';
		m = /^\?(\S+)/.exec(line);
		if(! m) this.tokenError('?', "variable name not found", arg);
		arg.line = line.slice(m[0].length);
		return '?' + m[1];
	case '.':
	case ';':
	case ',':
	case '[':
	case ']':
	case '(':
	case ')':
		arg.type = 'syntax';
		arg.line = line.slice(1);
		return line.charAt(0);
	default:
		m = /^[^\s\x21-\x2a,\.\/\x3b-\x40\x5b-\x5e\x60\x7b-\x7f]+/.exec(line);
		if(! m){
			bad = line.match(/^[^\s\.\;\:\,]+/);
			this.tokenError(bad ? bad[0] : line.charAt(0), "Unrecognized token", arg);
		}
		arg.line = line.slice(m[0].length);
		val = m[0];
		arg.lang = null;
		if(val == 'true' || val == 'false'){
			arg.type = 'literal';
			arg.datatype = this.ns['xsd:'] + 'boolean';
			return val;
		}
		// explicit pattern rather than isNaN(): isNaN would also accept
		// words such as Infinity or 0x10 as numbers
		if(! /^[+-]?[0-9]+(?:[eE][+-]?[0-9]+)?$/.test(val)){
			arg.type = 'qname';
			arg.datatype = null; // drop datatype of any earlier literal
			return val;
		}
		arg.type = 'number';
		if(! /[eE]/.test(val)){
			// '.' is never part of the token above, so pick up the
			// fraction (and signed exponent) here: 1.5, 1.5e-3, 5.e3
			m = /^\.(?:[0-9]+(?:[eE][+-]?[0-9]+)?|[eE][+-]?[0-9]+)/.exec(arg.line);
			if(m){
				arg.line = arg.line.slice(m[0].length);
				val += m[0];
			}
		}
		if(/[eE]/.test(val)) arg.datatype = this.ns['xsd:'] + 'double';
		else if(val.indexOf('.') > -1) arg.datatype = this.ns['xsd:'] + 'decimal';
		else arg.datatype = this.ns['xsd:'] + 'integer';
		if(val.charAt(0) == '0') val = val * 1; // drops leading zeros; yields a number
		return val;
	}
};

/**
 * Test for lang tag and datatype for literal token. Looks only at the
 * character directly after the closing quote ('@' or '^').
 * @param arg : object reference of {line,type,lang,datatype}
 *              to return updated value and lang or datatype
 */
ParseTurtle.prototype.get_literal_info = function (arg){
	var dt;
	switch(arg.line.charAt(0)){
	case '@':
		arg.line = arg.line.slice(1);
		arg.lang = this.get_token(arg);
		arg.datatype = null;
		break;
	case '^':
		this.get_token(arg); // consume '^^'
		dt = this.get_token(arg);
		arg.datatype = (arg.type == 'qname') ? this.qname2uri(dt, arg) : dt;
		arg.lang = null;
		break;
	default:
		arg.datatype = null;
		arg.lang = null;
	}
};

/**
 * Generates an incremented blank node id, as the same form as JimLey's parser
 * @return 'genid:' followed by the new counter value
 */
ParseTurtle.prototype.gen_bnodeid = function (){
	this.bid++;
	return 'genid:' + this.bid;
};

/**
 * Expands QName to fully qualified URI using this.ns
 * @param qname : QName
 * @param arg : context object for error messages (optional)
 * @return expanded URI
 * @throws if qname has no ':' or its prefix was never declared
 */
ParseTurtle.prototype.qname2uri = function (qname, arg){
	var name = String(qname);
	var m = /^(.*?):/.exec(name);
	if(! m) this.tokenError(qname, 'No prefix where QName expected.', arg);
	var base = this.ns[m[1] + ':'];
	// without this check an unknown prefix would yield "undefined..." URIs
	if(typeof base != 'string') this.tokenError(qname, 'Undeclared prefix.', arg);
	return base + name.slice(m[0].length);
};

/**
 * Throws exception on regular expression matching error, otherwise
 * browser will be trapped by infinite loop.
 * @param etag : missing closing token
 * @param type : processing token type
 * @param arg : context object
 */
ParseTurtle.prototype.matchError = function (etag, type, arg){
	this.parseError("Could not find closing " + etag + " for " + type, arg);
};

/**
 * Throws exception for a token that is not acceptable at its position.
 * @param token : offending token
 * @param msg : explanation
 * @param arg : context object
 */
ParseTurtle.prototype.tokenError = function (token, msg, arg){
	this.parseError("'" + token + "' --> " + msg, arg);
};

/**
 * Throws exception and reports an error. The thrown value is a plain
 * string (not an Error object); kept that way so that existing callers
 * which display the caught value directly keep working.
 * @param str : error message
 * @param arg : context object (optional); adds the next 40 characters of
 *              unparsed input and the current token type to the message
 */
ParseTurtle.prototype.parseError = function (str, arg){
	var msg = arg
		? str + "\nwhen processing: " + String(arg.line).substring(0,40) + "\ntype: " + arg.type
		: str;
	throw msg;
};

/*************************************
 * Experiments to map Turtle to JSON
 **************************************/

/**
 * Receives RDF/Turtle triples and returns corresponding hierarchy JSON tree.
 * Usually, call this function to get JSON representation from Turtle string:
 * ex) json = turtle.parse_to_json(turtle_string);
 * @param str : Turtle string (null/undefined is treated as an empty document)
 * @return RDF triples as JSON object.
 */
ParseTurtle.prototype.parse_to_json = function (str){
	this.rdf = {};
	// the tokenizer needs the xsd: namespace for numbers and booleans,
	// even when parse() has never been called on this instance
	if(! this.ns) this.ns = this.default_ns();
	this.loopcount = 0;
	var arg = { 'line': (str == null) ? '' : String(str), 'type': '' };
	// status is one of s, p, o or et (end triple)
	this.parse_subtree(arg, this.rdf, 's');
	return this.rdf;
};

/**
 * Parses string and generates JSON subtree. Property names are kept as
 * written (qnames are not expanded); URI values are wrapped in '<' '>'.
 * Values collected in the local p-o table are never discarded between
 * statements, so repeated properties accumulate into arrays.
 * @param arg : object reference of {line,type} to return updated value
 * @param dest : destination object to add resulting JSON prop-value
 * @param status : starting status; 'p' for [ ... ], 'c' for ( ... )
 */
ParseTurtle.prototype.parse_subtree = function (arg, dest, status){
	var token, syn, prop, child, pfx = '';
	var tmp = {}; // temporary stores p-o list for each p as array
	var init_status = status; // determines who calls this function
	while(arg.line){
		token = this.get_token(arg);
		if(arg.type == 'skip') continue; // comments must not advance the state
		syn = (arg.type == 'syntax') ? token : '';
		switch(status){
		case 's':
		case 'c': // later
			if(arg.type == 'at'){
				pfx = token;
				status = 'pfx';
			}else{
				if(arg.type == 'uri') dest["@about"] = token;
				status = 'p';
			}
			break;
		case 'pfx':
			prop = (token == ':') ? pfx : pfx + ':' + String(token).slice(0,-1);
			status = 'o';
			break;
		case 'p':
			prop = String(token).replace(/^:/,'');
			status = 'o';
			break;
		case 'o':
			if(! Object.prototype.hasOwnProperty.call(tmp, prop)) tmp[prop] = [];
			if(syn == '['){
				child = {};
				this.parse_subtree(arg, child, 'p');
				tmp[prop].push(child);
				status = 'et';
			}else if(syn == '('){
				child = {};
				this.parse_subtree(arg, child, 'c');
				tmp[prop].push(child);
				this.flush_property(tmp, dest);
				status = 'o';
			}else if(syn == ')'){
				return;
			}else if(arg.type == 'syntax'){
				// other punctuation is ignored in object position
			}else{
				if(arg.type == 'uri') token = '<' + token + '>';
				tmp[prop].push(token);
				status = (init_status == 'c') ? 'o' : 'et';
			}
			break;
		case 'et':
			switch(syn){
			case ';': status = 'p'; break;
			case ',': status = 'o'; break;
			case '.':
				status = 's';
				this.flush_property(tmp, dest);
				break;
			case ']':
			case ')':
				this.flush_property(tmp, dest);
				return;
			}
			break;
		}
	}
	this.flush_property(tmp, dest);
};

/**
 * Add property-value to destination object, so that the result will be
 * simple value if only one member, array otherwise. tmp itself is left
 * untouched (an array written to dest is the same object tmp holds).
 * @param tmp : object that temporary holds p-v array for each property
 * @param dest : destination object to add resulting JSON prop-value
 */
ParseTurtle.prototype.flush_property = function (tmp, dest){
	var key;
	for(key in tmp){
		if(! Object.prototype.hasOwnProperty.call(tmp, key)) continue;
		dest[key] = (tmp[key].length == 1) ? tmp[key][0] : tmp[key];
	}
};