Skip to content

Commit

Permalink
LibRegex: Partially implement the ECMAScript unicodeSets proposal
Browse files Browse the repository at this point in the history
This skips the new string unicode properties additions, along with \q{}.
  • Loading branch information
alimpfard authored and linusg committed Jul 20, 2022
1 parent 7734914 commit 598dc74
Show file tree
Hide file tree
Showing 9 changed files with 611 additions and 69 deletions.
39 changes: 39 additions & 0 deletions Tests/LibRegex/Regex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -765,6 +765,45 @@ TEST_CASE(ECMA262_unicode_match)
}
}

// Table-driven check of character-class set operations (`--` and `&&`, including a
// nested class) when patterns are compiled with the UnicodeSets flag.
TEST_CASE(ECMA262_unicode_sets_match)
{
    // One row of the table: the pattern to compile, the subject to run it against,
    // whether a match is expected, and any extra flags for that row.
    struct SetOperationCase {
        StringView pattern;
        StringView subject;
        bool matches { true };
        ECMAScriptFlags options {};
    };

    constexpr SetOperationCase cases[] {
        // `--` / `&&` against a single literal.
        { "[\\w--x]"sv, "x"sv, false },
        { "[\\w&&x]"sv, "y"sv, false },
        { "[\\w--x]"sv, "y"sv, true },
        { "[\\w&&x]"sv, "x"sv, true },
        // Chained `--` applied to a nested class.
        { "[[0-9\\w]--x--6]"sv, "6"sv, false },
        { "[[0-9\\w]--x--6]"sv, "x"sv, false },
        { "[[0-9\\w]--x--6]"sv, "y"sv, true },
        { "[[0-9\\w]--x--6]"sv, "9"sv, true },
        // `&&` between two class escapes.
        { "[\\w&&\\d]"sv, "a"sv, false },
        { "[\\w&&\\d]"sv, "4"sv, true },
    };

    // Every row is compiled with UnicodeSets enabled; per-row options are OR-ed on top.
    auto const unicode_sets_flag = static_cast<ECMAScriptFlags>(regex::AllFlags::UnicodeSets);

    for (auto const& test : cases) {
        Regex<ECMA262> re(test.pattern, unicode_sets_flag | test.options);

        // Dump the compiled bytecode only in REGEX_DEBUG builds.
        if constexpr (REGEX_DEBUG) {
            dbgln("\n");
            RegexDebug bytecode_printer(stderr);
            bytecode_printer.print_raw_bytecode(re);
            bytecode_printer.print_header();
            bytecode_printer.print_bytecode(re);
            dbgln("\n");
        }

        // The pattern must parse cleanly before the match outcome is compared.
        EXPECT_EQ(re.parser_result.error, regex::Error::NoError);
        auto const result = re.match(test.subject).success;
        EXPECT_EQ(result, test.matches);
    }
}

TEST_CASE(ECMA262_property_match)
{
struct _test {
Expand Down
46 changes: 24 additions & 22 deletions Userland/Libraries/LibC/regex.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,24 +21,25 @@ typedef struct {

enum __Regex_Error {
__Regex_NoError,
__Regex_InvalidPattern, // Invalid regular expression.
__Regex_InvalidCollationElement, // Invalid collating element referenced.
__Regex_InvalidCharacterClass, // Invalid character class type referenced.
__Regex_InvalidTrailingEscape, // Trailing \ in pattern.
__Regex_InvalidNumber, // Number in \digit invalid or in error.
__Regex_MismatchingBracket, // [ ] imbalance.
__Regex_MismatchingParen, // ( ) imbalance.
__Regex_MismatchingBrace, // { } imbalance.
__Regex_InvalidBraceContent, // Content of {} invalid: not a number, number too large, more than two numbers, first larger than second.
__Regex_InvalidBracketContent, // Content of [] invalid.
__Regex_InvalidRange, // Invalid endpoint in range expression.
__Regex_InvalidRepetitionMarker, // ?, * or + not preceded by valid regular expression.
__Regex_ReachedMaxRecursion, // MaximumRecursion has been reached.
__Regex_EmptySubExpression, // Sub expression has empty content.
__Regex_InvalidCaptureGroup, // Content of capture group is invalid.
__Regex_InvalidNameForCaptureGroup, // Name of capture group is invalid.
__Regex_InvalidNameForProperty, // Name of property is invalid.
__Regex_DuplicateNamedCapture, // Duplicate named capture group
__Regex_InvalidPattern, // Invalid regular expression.
__Regex_InvalidCollationElement, // Invalid collating element referenced.
__Regex_InvalidCharacterClass, // Invalid character class type referenced.
__Regex_InvalidTrailingEscape, // Trailing \ in pattern.
__Regex_InvalidNumber, // Number in \digit invalid or in error.
__Regex_MismatchingBracket, // [ ] imbalance.
__Regex_MismatchingParen, // ( ) imbalance.
__Regex_MismatchingBrace, // { } imbalance.
__Regex_InvalidBraceContent, // Content of {} invalid: not a number, number too large, more than two numbers, first larger than second.
__Regex_InvalidBracketContent, // Content of [] invalid.
__Regex_InvalidRange, // Invalid endpoint in range expression.
__Regex_InvalidRepetitionMarker, // ?, * or + not preceded by valid regular expression.
__Regex_ReachedMaxRecursion, // MaximumRecursion has been reached.
__Regex_EmptySubExpression, // Sub expression has empty content.
__Regex_InvalidCaptureGroup, // Content of capture group is invalid.
__Regex_InvalidNameForCaptureGroup, // Name of capture group is invalid.
__Regex_InvalidNameForProperty, // Name of property is invalid.
__Regex_DuplicateNamedCapture, // Duplicate named capture group
__Regex_InvalidCharacterClassEscape, // Invalid escaped entity in character class.
};

enum ReError {
Expand Down Expand Up @@ -82,10 +83,11 @@ enum __RegexAllFlags {
__Regex_Multiline = __Regex_Global << 12, // Handle newline characters. Match each line, one by one.
__Regex_SkipTrimEmptyMatches = __Regex_Global << 13, // Do not remove empty capture group results.
__Regex_SingleMatch = __Regex_Global << 14, // Stop after acquiring a single match.
__Regex_Internal_Stateful = __Regex_Global << 15, // Internal flag; enables stateful matches.
__Regex_Internal_BrowserExtended = __Regex_Global << 16, // Internal flag; enable browser-specific ECMA262 extensions.
__Regex_Internal_ConsiderNewline = __Regex_Global << 17, // Internal flag; allow matchers to consider newlines as line separators.
__Regex_Last = __Regex_SingleMatch
__Regex_UnicodeSets = __Regex_Global << 15, // ECMA262 Parser specific: Allow set operations in char classes.
__Regex_Internal_Stateful = __Regex_Global << 16, // Internal flag; enables stateful matches.
__Regex_Internal_BrowserExtended = __Regex_Global << 17, // Internal flag; enable browser-specific ECMA262 extensions.
__Regex_Internal_ConsiderNewline = __Regex_Global << 18, // Internal flag; allow matchers to consider newlines as line separators.
__Regex_Last = __Regex_UnicodeSets,
};

// Values for the cflags parameter to the regcomp() function:
Expand Down
118 changes: 99 additions & 19 deletions Userland/Libraries/LibRegex/RegexByteCode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -435,6 +435,20 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
bool inverse { false };
bool temporary_inverse { false };
bool reset_temp_inverse { false };
// Bookkeeping for one level of And/Or grouping while evaluating a Compare op.
// A default (inactive) entry is pushed first; And/Or opcodes push an active entry
// and EndAndOr pops it.
struct DisjunctionState {
// False for the default bottom entry; set to true by the And/Or opcodes. While the
// innermost entry is active, each comparison's outcome is folded into `fail` and the
// string position is rewound to the initial_* values.
bool active { false };
// Selects how `fail` is combined per comparison: true -> `failed && fail`,
// false -> `failed || fail`.
// NOTE(review): the Or opcode pushes `true` here and the And opcode pushes `false`,
// so the name reads inverted relative to the opcodes — confirm the naming is intended.
bool is_conjunction { false };
// Accumulated failure for this group. The And opcode seeds it with false, the Or
// opcode with true.
bool fail { false };
// String position (and its code-unit counterpart) captured when the group was pushed.
// No default initializer: the And/Or opcodes always set both explicitly.
size_t initial_position;
size_t initial_code_unit_position;
// Position reached by the most recent comparison in this group that did not fail.
// If the group ends without `fail`, the position is restored to this value, falling
// back to the initial_* values when it is empty.
Optional<size_t> last_accepted_position {};
Optional<size_t> last_accepted_code_unit_position {};
};

Vector<DisjunctionState, 4> disjunction_states;
disjunction_states.empend();

auto current_disjunction_state = [&]() -> DisjunctionState& { return disjunction_states.last(); };

auto current_inversion_state = [&]() -> bool { return temporary_inverse ^ inverse; };

Expand Down Expand Up @@ -602,16 +616,69 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
auto script = static_cast<Unicode::Script>(m_bytecode->at(offset++));
compare_script_extension(input, state, script, current_inversion_state(), inverse_matched);

} else if (compare_type == CharacterCompareType::And) {
disjunction_states.append({
.active = true,
.is_conjunction = false,
.fail = false,
.initial_position = state.string_position,
.initial_code_unit_position = state.string_position_in_code_units,
});
continue;

} else if (compare_type == CharacterCompareType::Or) {
disjunction_states.append({
.active = true,
.is_conjunction = true,
.fail = true,
.initial_position = state.string_position,
.initial_code_unit_position = state.string_position_in_code_units,
});
continue;

} else if (compare_type == CharacterCompareType::EndAndOr) {
auto disjunction_state = disjunction_states.take_last();
if (!disjunction_state.fail) {
state.string_position = disjunction_state.last_accepted_position.value_or(disjunction_state.initial_position);
state.string_position_in_code_units = disjunction_state.last_accepted_code_unit_position.value_or(disjunction_state.initial_code_unit_position);
}

} else {
warnln("Undefined comparison: {}", (int)compare_type);
VERIFY_NOT_REACHED();
break;
}

if (current_inversion_state() && !inverse && !inverse_matched) {
auto& new_disjunction_state = current_disjunction_state();
if (current_inversion_state() && (!inverse || new_disjunction_state.active) && !inverse_matched) {
advance_string_position(state, input.view);
inverse_matched = true;
}

if (new_disjunction_state.active) {
auto failed = (!had_zero_length_match && string_position == state.string_position) || state.string_position > input.view.length();

if (!failed) {
new_disjunction_state.last_accepted_position = state.string_position;
new_disjunction_state.last_accepted_code_unit_position = state.string_position_in_code_units;
}

if (new_disjunction_state.is_conjunction)
new_disjunction_state.fail = failed && new_disjunction_state.fail;
else
new_disjunction_state.fail = failed || new_disjunction_state.fail;

state.string_position = new_disjunction_state.initial_position;
state.string_position_in_code_units = new_disjunction_state.initial_code_unit_position;
}
}

auto& new_disjunction_state = current_disjunction_state();
if (new_disjunction_state.active) {
if (!new_disjunction_state.fail) {
state.string_position = new_disjunction_state.last_accepted_position.value_or(new_disjunction_state.initial_position);
state.string_position_in_code_units = new_disjunction_state.last_accepted_code_unit_position.value_or(new_disjunction_state.initial_code_unit_position);
}
}

if (current_inversion_state() && !inverse_matched)
Expand Down Expand Up @@ -843,6 +910,12 @@ Vector<CompareTypeAndValuePair> OpCode_Compare::flat_compares() const
auto count = m_bytecode->at(offset++);
for (size_t i = 0; i < count; ++i)
result.append({ CharacterCompareType::CharRange, m_bytecode->at(offset++) });
} else if (compare_type == CharacterCompareType::GeneralCategory
|| compare_type == CharacterCompareType::Property
|| compare_type == CharacterCompareType::Script
|| compare_type == CharacterCompareType::ScriptExtension) {
auto value = m_bytecode->at(offset++);
result.append({ compare_type, value });
} else {
result.append({ compare_type, 0 });
}
Expand All @@ -867,75 +940,82 @@ Vector<String> OpCode_Compare::variable_arguments_to_string(Optional<MatchInput>
auto ch = m_bytecode->at(offset++);
auto is_ascii = is_ascii_printable(ch);
if (is_ascii)
result.empend(String::formatted("value='{:c}'", static_cast<char>(ch)));
result.empend(String::formatted(" value='{:c}'", static_cast<char>(ch)));
else
result.empend(String::formatted("value={:x}", ch));
result.empend(String::formatted(" value={:x}", ch));

if (!view.is_null() && view.length() > string_start_offset) {
if (is_ascii) {
result.empend(String::formatted(
"compare against: '{}'",
" compare against: '{}'",
view.substring_view(string_start_offset, string_start_offset > view.length() ? 0 : 1).to_string()));
} else {
auto str = view.substring_view(string_start_offset, string_start_offset > view.length() ? 0 : 1).to_string();
u8 buf[8] { 0 };
__builtin_memcpy(buf, str.characters(), min(str.length(), sizeof(buf)));
result.empend(String::formatted("compare against: {:x},{:x},{:x},{:x},{:x},{:x},{:x},{:x}",
result.empend(String::formatted(" compare against: {:x},{:x},{:x},{:x},{:x},{:x},{:x},{:x}",
buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7]));
}
}
} else if (compare_type == CharacterCompareType::Reference) {
auto ref = m_bytecode->at(offset++);
result.empend(String::formatted("number={}", ref));
result.empend(String::formatted(" number={}", ref));
if (input.has_value()) {
if (state().capture_group_matches.size() > input->match_index) {
auto& match = state().capture_group_matches[input->match_index];
if (match.size() > ref) {
auto& group = match[ref];
result.empend(String::formatted("left={}", group.left_column));
result.empend(String::formatted("right={}", group.left_column + group.view.length_in_code_units()));
result.empend(String::formatted("contents='{}'", group.view));
result.empend(String::formatted(" left={}", group.left_column));
result.empend(String::formatted(" right={}", group.left_column + group.view.length_in_code_units()));
result.empend(String::formatted(" contents='{}'", group.view));
} else {
result.empend(String::formatted("(invalid ref, max={})", match.size() - 1));
result.empend(String::formatted(" (invalid ref, max={})", match.size() - 1));
}
} else {
result.empend(String::formatted("(invalid index {}, max={})", input->match_index, state().capture_group_matches.size() - 1));
result.empend(String::formatted(" (invalid index {}, max={})", input->match_index, state().capture_group_matches.size() - 1));
}
}
} else if (compare_type == CharacterCompareType::String) {
auto& length = m_bytecode->at(offset++);
StringBuilder str_builder;
for (size_t i = 0; i < length; ++i)
str_builder.append(m_bytecode->at(offset++));
result.empend(String::formatted("value=\"{}\"", str_builder.string_view().substring_view(0, length)));
result.empend(String::formatted(" value=\"{}\"", str_builder.string_view().substring_view(0, length)));
if (!view.is_null() && view.length() > state().string_position)
result.empend(String::formatted(
"compare against: \"{}\"",
" compare against: \"{}\"",
input.value().view.substring_view(string_start_offset, string_start_offset + length > view.length() ? 0 : length).to_string()));
} else if (compare_type == CharacterCompareType::CharClass) {
auto character_class = (CharClass)m_bytecode->at(offset++);
result.empend(String::formatted("ch_class={} [{}]", (size_t)character_class, character_class_name(character_class)));
result.empend(String::formatted(" ch_class={} [{}]", (size_t)character_class, character_class_name(character_class)));
if (!view.is_null() && view.length() > state().string_position)
result.empend(String::formatted(
"compare against: '{}'",
" compare against: '{}'",
input.value().view.substring_view(string_start_offset, state().string_position > view.length() ? 0 : 1).to_string()));
} else if (compare_type == CharacterCompareType::CharRange) {
auto value = (CharRange)m_bytecode->at(offset++);
result.empend(String::formatted("ch_range={:x}-{:x}", value.from, value.to));
result.empend(String::formatted(" ch_range={:x}-{:x}", value.from, value.to));
if (!view.is_null() && view.length() > state().string_position)
result.empend(String::formatted(
"compare against: '{}'",
" compare against: '{}'",
input.value().view.substring_view(string_start_offset, state().string_position > view.length() ? 0 : 1).to_string()));
} else if (compare_type == CharacterCompareType::LookupTable) {
auto count = m_bytecode->at(offset++);
for (size_t j = 0; j < count; ++j) {
auto range = (CharRange)m_bytecode->at(offset++);
result.append(String::formatted("{:x}-{:x}", range.from, range.to));
result.append(String::formatted(" {:x}-{:x}", range.from, range.to));
}
if (!view.is_null() && view.length() > state().string_position)
result.empend(String::formatted(
"compare against: '{}'",
" compare against: '{}'",
input.value().view.substring_view(string_start_offset, state().string_position > view.length() ? 0 : 1).to_string()));
} else if (compare_type == CharacterCompareType::GeneralCategory
|| compare_type == CharacterCompareType::Property
|| compare_type == CharacterCompareType::Script
|| compare_type == CharacterCompareType::ScriptExtension) {

auto value = m_bytecode->at(offset++);
result.empend(String::formatted(" value={}", value));
}
}
return result;
Expand Down
5 changes: 4 additions & 1 deletion Userland/Libraries/LibRegex/RegexByteCode.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,10 @@ enum class OpCodeId : ByteCodeValueType {
__ENUMERATE_CHARACTER_COMPARE_TYPE(Script) \
__ENUMERATE_CHARACTER_COMPARE_TYPE(ScriptExtension) \
__ENUMERATE_CHARACTER_COMPARE_TYPE(RangeExpressionDummy) \
__ENUMERATE_CHARACTER_COMPARE_TYPE(LookupTable)
__ENUMERATE_CHARACTER_COMPARE_TYPE(LookupTable) \
__ENUMERATE_CHARACTER_COMPARE_TYPE(And) \
__ENUMERATE_CHARACTER_COMPARE_TYPE(Or) \
__ENUMERATE_CHARACTER_COMPARE_TYPE(EndAndOr)

enum class CharacterCompareType : ByteCodeValueType {
#define __ENUMERATE_CHARACTER_COMPARE_TYPE(x) x,
Expand Down
Loading

0 comments on commit 598dc74

Please sign in to comment.