From d4ce46b74699499e380d04078373671063110b5e Mon Sep 17 00:00:00 2001 From: Sebastiaan Koppe Date: Sun, 19 Jan 2020 19:44:41 +0100 Subject: [PATCH 1/6] keep track of the longest failed match to improve error messages --- pegged/peg.d | 69 ++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 61 insertions(+), 8 deletions(-) diff --git a/pegged/peg.d b/pegged/peg.d index 4d531cb1..abb95b17 100644 --- a/pegged/peg.d +++ b/pegged/peg.d @@ -18,7 +18,7 @@ Writing tests the long way is preferred here, as it will avoid the circular dependency. */ -import std.algorithm: equal, map, startsWith; +import std.algorithm: equal, map, startsWith, max, countUntil; import std.uni : isAlpha, icmp; import std.array; import std.conv; @@ -248,6 +248,9 @@ struct ParseTree ParseTree[] children; /// The sub-trees created by sub-rules parsing. + size_t failEnd; // The furthest this tree could match the input (including !successful rules). + ParseTree[] failedChild; /// The !successful child that could still be partially parsed. + /** Basic toString for easy pretty-printing. */ @@ -343,6 +346,8 @@ struct ParseTree result.input = input; result.begin = begin; result.end = end; + result.failEnd = failEnd; + result.failedChild = map!(p => p.dup)(failedChild).array(); result.children = map!(p => p.dup)(children).array(); return result; } @@ -719,7 +724,7 @@ template literal(string s) if (p.end+s.length <= p.input.length && p.input[p.end..p.end+s.length] == s) return ParseTree(name, true, [s], p.input, p.end, p.end+s.length); else - return ParseTree(name, false, [lit], p.input, p.end, p.end); + return ParseTree(name, false, [lit], p.input, p.end, p.end, null, p.end); } ParseTree literal(string input) @@ -1262,7 +1267,8 @@ template and(rules...) if (rules.length > 0) //&& !node.name.startsWith("drop!(") && node.matches !is null //&& node.begin != node.end - ); + ) + || (node.failEnd > node.end); } version (tracer) @@ -1281,6 +1287,7 @@ template and(rules...) if (rules.length > 0) } ParseTree temp = r(result); result.end = temp.end; + result.failEnd = max(result.failEnd, temp.failEnd); if (temp.successful) { if (keepNode(temp)) @@ -1296,9 +1303,17 @@ template and(rules...) if (rules.length > 0) } else { - result.children ~= temp;// add the failed node, to indicate which failed - if (temp.matches.length > 0) - result.matches ~= temp.matches[$-1]; + auto firstLongestFailedMatch = result.children.countUntil!(c => c.failEnd > temp.end); + if (firstLongestFailedMatch == -1) { + result.children ~= temp;// add the failed node, to indicate which failed + // result.end = temp.end; + if (temp.matches.length > 0) + result.matches ~= temp.matches[$-1]; + } else { + // don't add the failed node because a previous one already failed further back + result.children = result.children[0 .. firstLongestFailedMatch+1]; // discard any intermediate correct nodes + failedChildFixup(result.children[firstLongestFailedMatch]); + } version (tracer) { if (shouldTrace(getName!(r)(), p)) @@ -1333,6 +1348,29 @@ template and(rules...) if (rules.length > 0) } } +// This function rewrites the ParseTree to move the failedChild into its parents children +// this is done whenever an 'and' rule fails and there are such a child further down +bool failedChildFixup(ref ParseTree p) { + if (p.failedChild.length > 0) { + p.children ~= p.failedChild[0]; + p.failedChild = []; + p.successful = false; + p.end = p.failEnd; + p.failEnd = 0; + return true; + } else { + foreach(ref c; p.children) { + if (failedChildFixup(c)) { + p.end = c.end; + p.successful = false; + p.failEnd = 0; + return true; + } + } + } + return false; +} + unittest // 'and' unit test { alias literal!"abc" abc; @@ -1524,6 +1562,11 @@ template or(rules...) if (rules.length > 0) { temp.children = [temp]; temp.name = name; + // if there is a child that failed but parsed more + if (longestFail.failEnd > temp.end) { + temp.failEnd = longestFail.failEnd; + temp.failedChild = [longestFail]; + } version (tracer) { if (shouldTrace(getName!(r)(), p)) @@ -2166,6 +2209,7 @@ template zeroOrMore(alias r) result.matches ~= temp.matches; result.children ~= temp; result.end = temp.end; + result.failEnd = max(result.failEnd, temp.failEnd); version (tracer) { if (shouldTrace(getName!(r)(), p)) @@ -2173,6 +2217,10 @@ template zeroOrMore(alias r) } temp = r(result); } + if (temp.end > result.failEnd) { + result.failedChild = [temp]; + result.failEnd = temp.end; + } result.successful = true; version (tracer) { @@ -2328,6 +2376,7 @@ template oneOrMore(alias r) result.matches ~= temp.matches; result.children ~= temp; result.end = temp.end; + result.failEnd = max(result.failEnd, temp.failEnd); version (tracer) { if (shouldTrace(getName!(r)(), p)) @@ -2335,6 +2384,10 @@ template oneOrMore(alias r) } temp = r(result); } + if (temp.end > result.failEnd) { + result.failedChild = [temp]; + result.failEnd = temp.end; + } result.successful = true; } version (tracer) @@ -2451,9 +2504,9 @@ template option(alias r) } ParseTree result = r(p); if (result.successful) - return ParseTree(name, true, result.matches, result.input, result.begin, result.end, [result]); + return ParseTree(name, true, result.matches, result.input, result.begin, result.end, [result], result.failEnd); else - return ParseTree(name, true, [], p.input, p.end, p.end, null); + return ParseTree(name, true, [], p.input, p.end, p.end, null, result.failEnd, [result]); } ParseTree option(string input) From f8163be4f79e49c3a0d6e30ff00e0ba06692e2f3 Mon Sep 17 00:00:00 2001 From: Sebastiaan Koppe Date: Tue, 21 Jan 2020 15:14:14 +0100 Subject: [PATCH 2/6] Fix wording, remove commented out code --- pegged/peg.d | 52 ++++++++++++++++++++++++++++------------------------ 1 file changed, 28 insertions(+), 24 deletions(-) diff --git a/pegged/peg.d b/pegged/peg.d index abb95b17..ad4ea3ff 100644 --- a/pegged/peg.d +++ b/pegged/peg.d @@ -1244,7 +1244,6 @@ and that the second subrule ('[a-z]') failed at position 3 (so, on '1'). */ template and(rules...) if (rules.length > 0) { - string ctfeGetNameAnd() { string name = "and!("; @@ -1306,12 +1305,14 @@ template and(rules...) if (rules.length > 0) auto firstLongestFailedMatch = result.children.countUntil!(c => c.failEnd > temp.end); if (firstLongestFailedMatch == -1) { result.children ~= temp;// add the failed node, to indicate which failed - // result.end = temp.end; if (temp.matches.length > 0) result.matches ~= temp.matches[$-1]; } else { // don't add the failed node because a previous one already failed further back result.children = result.children[0 .. firstLongestFailedMatch+1]; // discard any intermediate correct nodes + // This current 'and' rule has failed parsing and there is a successful child + // that had a longer failing match. We now want to revisit that child and modify it + // so that it is no longer successful and we want to move its failedChild into its children. failedChildFixup(result.children[firstLongestFailedMatch]); } version (tracer) @@ -1346,29 +1347,32 @@ template and(rules...) if (rules.length > 0) { return name; } -} -// This function rewrites the ParseTree to move the failedChild into its parents children -// this is done whenever an 'and' rule fails and there are such a child further down -bool failedChildFixup(ref ParseTree p) { - if (p.failedChild.length > 0) { - p.children ~= p.failedChild[0]; - p.failedChild = []; - p.successful = false; - p.end = p.failEnd; - p.failEnd = 0; - return true; - } else { - foreach(ref c; p.children) { - if (failedChildFixup(c)) { - p.end = c.end; - p.successful = false; - p.failEnd = 0; - return true; - } - } - } - return false; + // A child ParseTree has kept track of an alternate ParseTree (in failedChild) that matches longer. + // whenever the 'and' rule fails we want to rewrite that child so that the failedChild is + // moved into its children, the successful is set to false, the end is set the its failEnd, + // the failEnd is reset, and all that info is propagated upwards the tree so intermediate + // nodes reflect the proper state. + bool failedChildFixup(ref ParseTree p) { + if (p.failedChild.length > 0) { + p.children ~= p.failedChild[0]; + p.failedChild = []; + p.successful = false; + p.end = p.failEnd; + p.failEnd = 0; + return true; + } else { + foreach(ref c; p.children) { + if (failedChildFixup(c)) { + p.end = c.end; + p.successful = false; + p.failEnd = 0; + return true; + } + } + } + return false; + } } unittest // 'and' unit test From 014394fc95677e5b245aa8168e937d22a7eb3378 Mon Sep 17 00:00:00 2001 From: Sebastiaan Koppe Date: Tue, 21 Jan 2020 15:54:31 +0100 Subject: [PATCH 3/6] Add unittests for failedChild --- pegged/peg.d | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/pegged/peg.d b/pegged/peg.d index ad4ea3ff..97fc99d2 100644 --- a/pegged/peg.d +++ b/pegged/peg.d @@ -1445,6 +1445,62 @@ unittest // 'and' unit test , "'abc' 'de' 'f' has two child on 'abc_efghi', the one from 'abc' (success) and the one from 'de' (failure)."); } +version (unittest) { + static ParseTree getError(ref ParseTree p) { + if (p.children.length > 0) + return getError(p.children[$-1]); + return p; + } +} + +unittest // 'and' unit test with zeroOrMore and longest failing match +{ + alias literal!"abc" A; + alias literal!"def" B; + alias literal!"ghi" C; + + alias and!(zeroOrMore!(and!(A,B)), C) Thing; + + ParseTree input = ParseTree("",false,[], "abc"); + ParseTree result = Thing(input); + + assert(!result.successful); + assert(getError(result).matches[$-1] == "\"def\"", "and!(zeroOrMore!(and!(literal!\"abc\", literal!\"def\")), literal!\"ghi\") should expected def when input is \"abc\""); + assert(result.matches == []); +} + +unittest // 'and' unit test with option and longest failing match +{ + alias literal!"abc" A; + alias literal!"def" B; + alias literal!"ghi" C; + + alias and!(option!(and!(A,B)), C) Thing; + + ParseTree input = ParseTree("",false,[], "abc"); + ParseTree result = Thing(input); + + assert(!result.successful); + assert(getError(result).matches[$-1] == "\"def\"", "and!(option!(and!(literal!\"abc\", literal!\"def\")), literal!\"ghi\") should expected def when input is \"abc\""); + assert(result.matches == []); +} + +unittest // 'and' unit test with oneOrMore and longest failing match +{ + alias literal!"abc" A; + alias literal!"def" B; + alias literal!"ghi" C; + + alias and!(oneOrMore!(and!(A,B)), C) Thing; + + ParseTree input = ParseTree("",false,[], "abcdefabc"); + ParseTree result = Thing(input); + + assert(!result.successful); + assert(getError(result).matches[$-1] == "\"def\"", "and!(oneOrMore!(and!(literal!\"abc\", literal!\"def\")), literal!\"ghi\") should expected def when input is \"abcdefabc\""); + assert(result.matches == ["abc", "def"]); +} + template wrapAround(alias before, alias target, alias after) { ParseTree wrapAround(ParseTree p) From 37469f6cc13adee8207861200066e39ea909d4bc Mon Sep 17 00:00:00 2001 From: Sebastiaan Koppe Date: Mon, 3 Feb 2020 10:52:38 +0100 Subject: [PATCH 4/6] Fix several bugs/enchancements - note how much of a literal is parsed when failed - only do fixup for children which match failEnd - recalculate failEnd in fixup - have the `or` template return a list of failed children instead of last one - propagate failEnd better in `oneOrMore` and `option` - keep failedChild in decimateTree --- pegged/peg.d | 84 +++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 63 insertions(+), 21 deletions(-) diff --git a/pegged/peg.d b/pegged/peg.d index 97fc99d2..1220526e 100644 --- a/pegged/peg.d +++ b/pegged/peg.d @@ -18,7 +18,7 @@ Writing tests the long way is preferred here, as it will avoid the circular dependency. */ -import std.algorithm: equal, map, startsWith, max, countUntil; +import std.algorithm: equal, map, startsWith, max, countUntil, maxElement, filter; import std.uni : isAlpha, icmp; import std.array; import std.conv; @@ -721,10 +721,13 @@ template literal(string s) ParseTree literal(ParseTree p) { enum lit = "\"" ~ s ~ "\""; - if (p.end+s.length <= p.input.length && p.input[p.end..p.end+s.length] == s) + import std.algorithm : commonPrefix; + auto prefix = p.input[p.end..$].commonPrefix(s); + + if (prefix.length == s.length) return ParseTree(name, true, [s], p.input, p.end, p.end+s.length); else - return ParseTree(name, false, [lit], p.input, p.end, p.end, null, p.end); + return ParseTree(name, false, [lit], p.input, p.end, p.end, null, p.end + prefix.length); } ParseTree literal(string input) @@ -1267,7 +1270,7 @@ template and(rules...) if (rules.length > 0) && node.matches !is null //&& node.begin != node.end ) - || (node.failEnd > node.end); + || (node.failEnd >= node.end); } version (tracer) @@ -1313,8 +1316,10 @@ template and(rules...) if (rules.length > 0) // This current 'and' rule has failed parsing and there is a successful child // that had a longer failing match. We now want to revisit that child and modify it // so that it is no longer successful and we want to move its failedChild into its children. - failedChildFixup(result.children[firstLongestFailedMatch]); + failedChildFixup(result.children[firstLongestFailedMatch], result.children[firstLongestFailedMatch].failEnd); } + result.end = result.children.map!(c => c.end).maxElement; + result.failEnd = result.children.map!(c => c.failEnd).maxElement; version (tracer) { if (shouldTrace(getName!(r)(), p)) @@ -1353,25 +1358,28 @@ template and(rules...) if (rules.length > 0) // moved into its children, the successful is set to false, the end is set the its failEnd, // the failEnd is reset, and all that info is propagated upwards the tree so intermediate // nodes reflect the proper state. - bool failedChildFixup(ref ParseTree p) { + bool failedChildFixup(ref ParseTree p, size_t failEnd) { if (p.failedChild.length > 0) { p.children ~= p.failedChild[0]; p.failedChild = []; p.successful = false; p.end = p.failEnd; - p.failEnd = 0; + p.failEnd = p.children.map!(c => c.failEnd).maxElement(); return true; } else { + bool result = false; foreach(ref c; p.children) { - if (failedChildFixup(c)) { + if (c.failEnd != failEnd) + continue; + if (failedChildFixup(c, failEnd)) { p.end = c.end; p.successful = false; - p.failEnd = 0; - return true; + p.failEnd = p.children.map!(c => c.failEnd).maxElement(); + result = true; } } + return result; } - return false; } } @@ -1690,9 +1698,8 @@ template or(rules...) if (rules.length > 0) longestFail.matches = longestFail.matches.length == 0 ? [orErrorString] : longestFail.matches[0..$-1] // discarding longestFail error message ~ [orErrorString]; // and replacing it by the new, concatenated one. - longestFail.name = name; - longestFail.begin = p.end; - return longestFail; + auto children = results[].filter!(r => max(r.end, r.failEnd) >= maxFailedLength).array(); + return ParseTree(name, false, longestFail.matches, p.input, p.end, longestFail.end, children, children.map!(c => c.failEnd).maxElement); } ParseTree or(string input) @@ -2277,9 +2284,10 @@ template zeroOrMore(alias r) } temp = r(result); } - if (temp.end > result.failEnd) { + auto maxFail = max(temp.failEnd, temp.end); + if (maxFail > result.failEnd && maxFail > result.end) { result.failedChild = [temp]; - result.failEnd = temp.end; + result.failEnd = maxFail; } result.successful = true; version (tracer) @@ -2444,9 +2452,10 @@ template oneOrMore(alias r) } temp = r(result); } - if (temp.end > result.failEnd) { + auto maxFail = max(temp.failEnd, temp.end); + if (maxFail > result.failEnd && maxFail > result.end) { result.failedChild = [temp]; - result.failEnd = temp.end; + result.failEnd = maxFail; } result.successful = true; } @@ -2566,7 +2575,7 @@ template option(alias r) if (result.successful) return ParseTree(name, true, result.matches, result.input, result.begin, result.end, [result], result.failEnd); else - return ParseTree(name, true, [], p.input, p.end, p.end, null, result.failEnd, [result]); + return ParseTree(name, true, [], p.input, p.end, p.end, null, max(result.end,result.failEnd), [result]); } ParseTree option(string input) @@ -3594,8 +3603,10 @@ mixin template decimateTree() { import std.algorithm : startsWith; - if ( (isRule(child.name) && child.matches.length != 0) - || !child.successful && child.children.length == 0) + if ( (isRule(child.name)) + || (!child.successful && child.children.length == 0) + || (!child.successful && child.name.startsWith("or!") && child.children.length > 1) + || (!pt.successful && child.successful && child.children.length == 0 && child.failedChild.length > 0)) { child.children = filterChildren(child); result ~= child; @@ -3612,6 +3623,37 @@ mixin template decimateTree() } return result; } + void filterFailedChildren(ref ParseTree pt) + { + foreach(ref child; pt.children) + { + filterFailedChildren(child); + import std.algorithm : startsWith; + + if ( (isRule(child.name)) + || (!child.successful && child.children.length == 0) + || (!child.successful && child.name.startsWith("or!") && child.children.length > 1) + || (!pt.successful && child.successful && child.children.length == 0 && child.failedChild.length > 0)) + { + } + else if (child.name.startsWith("keep!(")) // 'keep' node are never discarded. + // They have only one child, the node to keep + { + } + else if (child.failedChild.length > 0)// discard this node, but see if its children contain nodes to keep + { + pt.failedChild ~= child.failedChild; + child.failedChild = []; + } + } + foreach(ref child; pt.failedChild) + { + filterFailedChildren(child); + child.children = filterChildren(child); + } + } + if (!p.successful) + filterFailedChildren(p); p.children = filterChildren(p); return p; } From 9add4b58420b0d8dcc67d14986240568a196e19a Mon Sep 17 00:00:00 2001 From: Sebastiaan Koppe Date: Wed, 5 Feb 2020 12:54:45 +0100 Subject: [PATCH 5/6] Fix unicode bug and fix decimateTree - wanted to iterate byGrapheme, but that won't work in ctfe - decimateTree should keep nodes with no matches when the root parsetree has failed (there may be failedChild elements in there useful for error reporting) --- pegged/peg.d | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/pegged/peg.d b/pegged/peg.d index 1220526e..3a19400a 100644 --- a/pegged/peg.d +++ b/pegged/peg.d @@ -721,14 +721,16 @@ template literal(string s) ParseTree literal(ParseTree p) { enum lit = "\"" ~ s ~ "\""; - import std.algorithm : commonPrefix; - auto prefix = p.input[p.end..$].commonPrefix(s); - if (prefix.length == s.length) + if (p.end+s.length <= p.input.length && p.input[p.end..p.end+s.length] == s) return ParseTree(name, true, [s], p.input, p.end, p.end+s.length); - else + else { + import std.algorithm : commonPrefix; + import std.utf : byCodeUnit; + auto prefix = p.input[p.end..$].byCodeUnit.commonPrefix(s.byCodeUnit); return ParseTree(name, false, [lit], p.input, p.end, p.end, null, p.end + prefix.length); - } + } + } ParseTree literal(string input) { @@ -3596,6 +3598,8 @@ mixin template decimateTree() { if(p.children.length == 0) return p; + bool parseFailed = !p.successful; + ParseTree[] filterChildren(ParseTree pt) { ParseTree[] result; @@ -3603,7 +3607,7 @@ mixin template decimateTree() { import std.algorithm : startsWith; - if ( (isRule(child.name)) + if ( (isRule(child.name) && (child.matches.length != 0 || parseFailed)) || (!child.successful && child.children.length == 0) || (!child.successful && child.name.startsWith("or!") && child.children.length > 1) || (!pt.successful && child.successful && child.children.length == 0 && child.failedChild.length > 0)) @@ -3630,7 +3634,7 @@ mixin template decimateTree() filterFailedChildren(child); import std.algorithm : startsWith; - if ( (isRule(child.name)) + if ( (isRule(child.name) && (child.matches.length != 0 || parseFailed)) || (!child.successful && child.children.length == 0) || (!child.successful && child.name.startsWith("or!") && child.children.length > 1) || (!pt.successful && child.successful && child.children.length == 0 && child.failedChild.length > 0)) From c6d1a03faae18c961bc0d6cd4f05e5973e8e29f1 Mon Sep 17 00:00:00 2001 From: Sebastiaan Koppe Date: Wed, 5 Feb 2020 17:40:21 +0100 Subject: [PATCH 6/6] fix expected string length bug --- pegged/peg.d | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pegged/peg.d b/pegged/peg.d index 3a19400a..528a7bad 100644 --- a/pegged/peg.d +++ b/pegged/peg.d @@ -1656,15 +1656,15 @@ template or(rules...) if (rules.length > 0) failedLength[i] = temp.end; if (temp.end >= longestFail.end) { + if (temp.end == longestFail.end) + errorStringChars += (temp.matches.length > 0 ? temp.matches[$-1].length : 0) + errName.length + 4; + else + errorStringChars = (temp.matches.length > 0 ? temp.matches[$-1].length : 0) + errName.length + 4; maxFailedLength = temp.end; longestFail = temp; names[i] = errName; results[i] = temp; - if (temp.end == longestFail.end) - errorStringChars += (temp.matches.length > 0 ? temp.matches[$-1].length : 0) + errName.length + 4; - else - errorStringChars = (temp.matches.length > 0 ? temp.matches[$-1].length : 0) + errName.length + 4; } // Else, this error parsed less input than another one: we discard it. }