Skip to content

Commit

Permalink
LibRegex: Refactor parsing 'CharacterEscape' out of 'AtomEscape'
Browse files Browse the repository at this point in the history
The ECMA262 spec defines 'CharacterEscape' as a separate production, and we
need its parsing split out of 'AtomEscape' for a future commit.
  • Loading branch information
alimpfard authored and linusg committed Jul 20, 2022
1 parent b908f9f commit 7734914
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 41 deletions.
93 changes: 52 additions & 41 deletions Userland/Libraries/LibRegex/RegexParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1428,59 +1428,34 @@ bool ECMA262Parser::parse_invalid_braced_quantifier()

bool ECMA262Parser::parse_character_escape(Vector<CompareTypeAndValuePair>& compares, size_t& match_length_minimum, ParseFlags flags)
{
if (auto escape_str = read_digits_as_string(ReadDigitsInitialZeroState::Disallow); !escape_str.is_empty()) {
if (auto escape = escape_str.to_uint(); escape.has_value()) {
// See if this is a "back"-reference (we've already parsed the group it refers to)
auto maybe_length = m_parser_state.capture_group_minimum_lengths.get(escape.value());
if (maybe_length.has_value()) {
match_length_minimum += maybe_length.value();
stack.insert_bytecode_compare_values({ { CharacterCompareType::Reference, (ByteCodeValueType)escape.value() } });
return true;
}
// It's not a pattern seen before, so we have to see if it's a valid reference to a future group.
if (escape.value() <= ensure_total_number_of_capturing_parenthesis()) {
// This refers to a future group, and it will _always_ be matching an empty string
// So just match nothing and move on.
return true;
}
if (!m_should_use_browser_extended_grammar) {
set_error(Error::InvalidNumber);
return false;
}
}

// If not, put the characters back.
back(escape_str.length());
}

// CharacterEscape > ControlEscape
if (try_skip("f"sv)) {
match_length_minimum += 1;
stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\f' } });
compares.append({ CharacterCompareType::Char, (ByteCodeValueType)'\f' });
return true;
}

if (try_skip("n"sv)) {
match_length_minimum += 1;
stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\n' } });
compares.append({ CharacterCompareType::Char, (ByteCodeValueType)'\n' });
return true;
}

if (try_skip("r"sv)) {
match_length_minimum += 1;
stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\r' } });
compares.append({ CharacterCompareType::Char, (ByteCodeValueType)'\r' });
return true;
}

if (try_skip("t"sv)) {
match_length_minimum += 1;
stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\t' } });
compares.append({ CharacterCompareType::Char, (ByteCodeValueType)'\t' });
return true;
}

if (try_skip("v"sv)) {
match_length_minimum += 1;
stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\v' } });
compares.append({ CharacterCompareType::Char, (ByteCodeValueType)'\v' });
return true;
}

Expand All @@ -1489,7 +1464,7 @@ bool ECMA262Parser::parse_character_escape(Vector<CompareTypeAndValuePair>& comp
for (auto c : s_alphabetic_characters) {
if (try_skip({ &c, 1 })) {
match_length_minimum += 1;
stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)(c % 32) } });
compares.append({ CharacterCompareType::Char, (ByteCodeValueType)(c % 32) });
return true;
}
}
Expand All @@ -1500,23 +1475,23 @@ bool ECMA262Parser::parse_character_escape(Vector<CompareTypeAndValuePair>& comp
}

if (m_should_use_browser_extended_grammar) {
back(1 + !done());
stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\\' } });
back(1 + (done() ? 0 : 1));
compares.append({ CharacterCompareType::Char, (ByteCodeValueType)'\\' });
match_length_minimum += 1;
return true;
}

// Allow '\c' in non-unicode mode, just matches 'c'.
match_length_minimum += 1;
stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'c' } });
compares.append({ CharacterCompareType::Char, (ByteCodeValueType)'c' });
return true;
}

// '\0'
if (try_skip("0"sv)) {
if (!lookahead_any(s_decimal_characters)) {
match_length_minimum += 1;
stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)0 } });
compares.append({ CharacterCompareType::Char, (ByteCodeValueType)0 });
return true;
}

Expand All @@ -1527,7 +1502,7 @@ bool ECMA262Parser::parse_character_escape(Vector<CompareTypeAndValuePair>& comp
if (m_should_use_browser_extended_grammar) {
if (!flags.unicode) {
if (auto escape = parse_legacy_octal_escape(); escape.has_value()) {
stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)escape.value() } });
compares.append({ CharacterCompareType::Char, (ByteCodeValueType)escape.value() });
match_length_minimum += 1;
return true;
}
Expand All @@ -1538,13 +1513,13 @@ bool ECMA262Parser::parse_character_escape(Vector<CompareTypeAndValuePair>& comp
if (try_skip("x"sv)) {
if (auto hex_escape = read_digits(ReadDigitsInitialZeroState::Allow, true, 2, 2); hex_escape.has_value()) {
match_length_minimum += 1;
stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)hex_escape.value() } });
compares.append({ CharacterCompareType::Char, (ByteCodeValueType)hex_escape.value() });
return true;
}
if (!flags.unicode) {
// '\x' is allowed in non-unicode mode, just matches 'x'.
match_length_minimum += 1;
stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'x' } });
compares.append({ CharacterCompareType::Char, (ByteCodeValueType)'x' });
return true;
}

Expand All @@ -1555,7 +1530,7 @@ bool ECMA262Parser::parse_character_escape(Vector<CompareTypeAndValuePair>& comp
if (try_skip("u"sv)) {
if (auto code_point = consume_escaped_code_point(flags.unicode); code_point.has_value()) {
match_length_minimum += 1;
stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)code_point.value() } });
compares.append({ CharacterCompareType::Char, (ByteCodeValueType)code_point.value() });
return true;
}

Expand All @@ -1566,19 +1541,55 @@ bool ECMA262Parser::parse_character_escape(Vector<CompareTypeAndValuePair>& comp
for (auto ch : identity_escape_characters(flags.unicode, m_should_use_browser_extended_grammar)) {
if (try_skip({ &ch, 1 })) {
match_length_minimum += 1;
stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)ch } });
compares.append({ CharacterCompareType::Char, (ByteCodeValueType)ch });
return true;
}
}

if (flags.unicode) {
if (try_skip("/"sv)) {
match_length_minimum += 1;
stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'/' } });
compares.append({ CharacterCompareType::Char, (ByteCodeValueType)'/' });
return true;
}
}

return false;
}

bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_minimum, ParseFlags flags)
{
if (auto escape_str = read_digits_as_string(ReadDigitsInitialZeroState::Disallow); !escape_str.is_empty()) {
if (auto escape = escape_str.to_uint(); escape.has_value()) {
// See if this is a "back"-reference (we've already parsed the group it refers to)
auto maybe_length = m_parser_state.capture_group_minimum_lengths.get(escape.value());
if (maybe_length.has_value()) {
match_length_minimum += maybe_length.value();
stack.insert_bytecode_compare_values({ { CharacterCompareType::Reference, (ByteCodeValueType)escape.value() } });
return true;
}
// It's not a pattern seen before, so we have to see if it's a valid reference to a future group.
if (escape.value() <= ensure_total_number_of_capturing_parenthesis()) {
// This refers to a future group, and it will _always_ be matching an empty string
// So just match nothing and move on.
return true;
}
if (!m_should_use_browser_extended_grammar) {
set_error(Error::InvalidNumber);
return false;
}
}

// If not, put the characters back.
back(escape_str.length());
}

Vector<CompareTypeAndValuePair> escape_compares;
if (parse_character_escape(escape_compares, match_length_minimum, flags)) {
stack.insert_bytecode_compare_values(move(escape_compares));
return true;
}

if (flags.named && try_skip("k"sv)) {
auto name = read_capture_group_specifier(true);
if (name.is_empty()) {
Expand Down
2 changes: 2 additions & 0 deletions Userland/Libraries/LibRegex/RegexParser.h
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,8 @@ class ECMA262Parser final : public Parser {
bool parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&, ParseFlags);
bool parse_unicode_property_escape(PropertyEscape& property, bool& negated);

// Parses a CharacterEscape (control escapes, '\c', '\0', hex, unicode and identity escapes).
// On a match, appends a Char compare to the given vector, bumps the minimum match length
// by one and returns true; returns false when no character escape was recognized.
// NOTE(review): behavior on malformed escapes (error reporting) should be confirmed against the definition.
bool parse_character_escape(Vector<CompareTypeAndValuePair>&, size_t&, ParseFlags);

// Used only by B.1.4, Regular Expression Patterns (Extended for use in browsers)
bool parse_quantifiable_assertion(ByteCode&, size_t&, ParseFlags);
bool parse_extended_atom(ByteCode&, size_t&, ParseFlags);
Expand Down

0 comments on commit 7734914

Please sign in to comment.