diff --git a/src/lexer.cc b/src/lexer.cc index d698029..ab63ba3 100644 --- a/src/lexer.cc +++ b/src/lexer.cc @@ -886,6 +886,7 @@ std::optional NLexer::_regexp() { advance(-1); // reset capture indices nested_index = 0; + inside_index = 0; while (branch_reset_indices.size()) branch_reset_indices.pop(); @@ -1228,6 +1229,7 @@ std::optional NLexer::regexp_expression() { RegexpType::SubExprCall, (char)backrefnum, regexp_debug_info(this, "\\g", 2)}; reg.subexprcall = backrefnum; + reg.inside_subexpr = inside_index; return reg; } lexer_error(*this, Errors::InvalidRegexpSyntax, error_token(), @@ -1463,6 +1465,7 @@ std::optional NLexer::regexp_expression() { // parenthesised expression if (c == '(') { nested_index++; + inside_index++; std::optional reset_branch{}; int branch = 0; if (!branch_reset_indices.empty()) { @@ -1492,6 +1495,7 @@ std::optional NLexer::regexp_expression() { } } advance(1); // consume ')' + inside_index--; if (seen_newline) { const Token &mtoken = error_token(); lexer_error(*this, Errors::InvalidRegexpSyntax, mtoken, @@ -1511,6 +1515,7 @@ std::optional NLexer::regexp_expression() { // what _this_ index is branch_reset_indices.push(nested_index); auto reg = regexp(); + inside_index--; if (reset_branch.has_value()) { nested_index = *reset_branch; branch_reset_indices.push(branch); @@ -1540,6 +1545,7 @@ std::optional NLexer::regexp_expression() { nested_index = *reset_branch; branch_reset_indices.push(branch); } + inside_index--; return {}; } if (c == '<' || c == '=') { @@ -1550,6 +1556,7 @@ std::optional NLexer::regexp_expression() { nested_index = *reset_branch; branch_reset_indices.push(branch); } + inside_index--; return {}; } if (c == '>') { @@ -1560,6 +1567,7 @@ std::optional NLexer::regexp_expression() { nested_index = *reset_branch; branch_reset_indices.push(branch); } + inside_index--; return {}; } if (c == ':') { @@ -1572,6 +1580,7 @@ std::optional NLexer::regexp_expression() { if (!reg.has_value()) return reg; auto &rv = reg.value(); + inside_index--; c = *source_p; if (c != ')') { const Token &mtoken = error_token(); @@ -1589,6 +1598,7 @@ std::optional NLexer::regexp_expression() { } auto my_index = nested_index; auto reg = regexp(); + inside_index--; if (reset_branch.has_value()) { nested_index = *reset_branch; branch_reset_indices.push(branch); @@ -2068,6 +2078,7 @@ Regexp::compile(std::multimap *> &cache, NFANode *tl = new NFANode{"R<>" + mangle()}; parent->epsilon_transition_to(tl); tl->subexpr_call = subexprcall; + tl->inside_subexpr = inside_subexpr; tl->named_rule = namef; result = tl; result->debug_info = debug_info; diff --git a/src/lexer.hpp b/src/lexer.hpp index 5ad5352..49c566d 100644 --- a/src/lexer.hpp +++ b/src/lexer.hpp @@ -209,6 +209,7 @@ class NLexer { int offset; char buffer[1024000]; int nested_index = 0; + int inside_index = 0; /// If there's anything on this, reset index to it /// when matching alternatives std::stack branch_reset_indices{}; diff --git a/src/nfa.hpp b/src/nfa.hpp index 2f0c312..3bfc198 100644 --- a/src/nfa.hpp +++ b/src/nfa.hpp @@ -70,6 +70,7 @@ template class DFANode { std::vector assertions = {}; std::set subexpr_idxs = {}; std::set subexpr_end_idxs = {}; + int inside_subexpr = -1; std::optional backreference{}; int subexpr_call = -1; bool subexpr_recurses = false; @@ -128,6 +129,7 @@ template class NFANode { subexpr_end = false, reference_node = false; int max_opt_steps = 50; int opt_step = max_opt_steps; + int inside_subexpr = -1; std::optional inline_code = {}; // code that would be executed should this node match @@ -150,7 +152,6 @@ template class NFANode { int subexpr_idx = -1; int subexpr_end_idx = -1; int subexpr_call = -1; - bool subexpr_recurses = false; NFANode(StateInfoT s) : state_info(s) {} NFANode() {} diff --git a/src/parser.cc b/src/parser.cc index 5c7d1e4..8d43219 100644 --- a/src/parser.cc +++ b/src/parser.cc @@ -41,6 +41,8 @@ inline static void parser_error_impl(char const *fmt, va_list arg) { std::vprintf(fmt, arg); } +static int sexpr_being_built = 0; + char *parser_errors[(int)ParserErrors::LAST - 10] = { [(int)ParserErrors::InvalidToken - 11] = "Invalid token", [(int)ParserErrors::FeatureUnsupported - 11] = "Unsupported feature", @@ -1298,6 +1300,7 @@ template DFANode *>> *NFANode::to_dfa() { } dfanode->subexpr_recurses = dfanode->subexpr_call <= dfanode->subexpr_end_idxs.size(); + dfanode->inside_subexpr = s->inside_subexpr; dfanode->subexpr_call = s->subexpr_call; } if (s->backreference.has_value()) { @@ -1539,6 +1542,8 @@ void DFANLVMCodeGenerator::generate( for (auto subexpr_idx : node->subexpr_idxs) { if (!subexprFunc.count(subexpr_idx)) continue; + int sbb = sexpr_being_built; + sexpr_being_built = subexpr_idx; decltype(visited) _visited; typename std::remove_reference::type _blocks; auto scope = subexprFunc[subexpr_idx]; @@ -1585,6 +1590,7 @@ void DFANLVMCodeGenerator::generate( dbuilder.CreateCondBr(matched, mroot, builder.module.BBfinalise); builder.module.exit_main(); + sexpr_being_built = sbb; } } builder.issubexp = wasub; @@ -1913,7 +1919,7 @@ void DFANLVMCodeGenerator::generate( } // if there is a subexpr call, create it now if (node->subexpr_call > -1 && - (node->subexpr_recurses || node->subexpr_call > subexprFunc.size())) { + (node->subexpr_recurses || sexpr_being_built < node->subexpr_call)) { llvm::Function *fn; auto val = builder.module.current_main()->arg_begin(); if (subexprFunc.count(node->subexpr_call)) diff --git a/src/regexp.hpp b/src/regexp.hpp index b54d2ef..4ce92b8 100644 --- a/src/regexp.hpp +++ b/src/regexp.hpp @@ -52,6 +52,7 @@ class Regexp { bool plus = false, star = false, lazy = false, store = false; int index = 0; // applies for nested and backref (escape) int subexprcall = -1; // applies for SubExprCall + int inside_subexpr = -1; // applies for SubExprCall std::optional repeat; diff --git a/tests/inputs/0012-subexpr-expr.input b/tests/inputs/0012-subexpr-expr.input new file mode 100644 index 0000000..743023b --- /dev/null +++ b/tests/inputs/0012-subexpr-expr.input @@ -0,0 +1 @@ +testhelloooohellotest diff --git a/tests/list-tests b/tests/list-tests index 6df667e..1999864 100644 --- a/tests/list-tests +++ b/tests/list-tests @@ -9,3 +9,4 @@ 0009-pos 0010-pl 0011-subexpr +0012-subexpr-expr diff --git a/tests/outputs/0008-literal-match.stdout b/tests/outputs/0008-literal-match.stdout index ed639ee..8c25f93 100644 --- a/tests/outputs/0008-literal-match.stdout +++ b/tests/outputs/0008-literal-match.stdout @@ -1,4 +1,4 @@ -res at 0x7ffe8df3e678, s at 0x7ff892c2e010 +res at 0x7ffc26eda248, s at 0x7fc1109a1010 processing - 'HELP😒 ' match {'HELP' - (null) - 4 literal} is a stopword diff --git a/tests/outputs/0011-subexpr.stdout b/tests/outputs/0011-subexpr.stdout index 6df7ef1..6d256bd 100644 --- a/tests/outputs/0011-subexpr.stdout +++ b/tests/outputs/0011-subexpr.stdout @@ -1,4 +1,4 @@ -res at 0x7ffdeebc6688, s at 0x7fcaae9ea010 +res at 0x7ffd5f1a3468, s at 0x7ff8bcfe3010 processing - 'testtest' match {'testtest' - (null) - 8 expr} is not a stopword no match {'' - (null) - 0 expr} is not a stopword diff --git a/tests/outputs/0012-subexpr-expr.stderr b/tests/outputs/0012-subexpr-expr.stderr new file mode 100644 index 0000000..e69de29 diff --git a/tests/outputs/0012-subexpr-expr.stdout b/tests/outputs/0012-subexpr-expr.stdout new file mode 100644 index 0000000..501a330 --- /dev/null +++ b/tests/outputs/0012-subexpr-expr.stdout @@ -0,0 +1,5 @@ +res at 0x7ffc99366f18, s at 0x7fc02f781010 +processing - 'testhelloooohellotest' +match {'testhelloooo' - (null) - 12 expr} is not a stopword +match {'hellotest' - (null) - 9 expr} is not a stopword +no match {'' - (null) - 0 expr} is not a stopword diff --git a/tests/sources/0012-subexpr-expr.nlex b/tests/sources/0012-subexpr-expr.nlex new file mode 100644 index 0000000..12b016d --- /dev/null +++ b/tests/sources/0012-subexpr-expr.nlex @@ -0,0 +1,2 @@ +expr :: (test|hello+)\g1 +