LibRegex: Partially implement the ECMAScript unicodeSets proposal

This does not yet implement the proposal's new Unicode properties of strings, nor the \q{} string-literal syntax.
gmbows · Jul 20, 2022 · 598dc74 · 598dc74
1 parent 7734914
commit 598dc74
Show file tree

Hide file tree

Showing 9 changed files with 611 additions and 69 deletions.
diff --git a/Tests/LibRegex/Regex.cpp b/Tests/LibRegex/Regex.cpp
@@ -765,6 +765,45 @@ TEST_CASE(ECMA262_unicode_match)
     }
 }
 
+TEST_CASE(ECMA262_unicode_sets_match) // Table-driven test of character-class set operators with the UnicodeSets flag enabled.
+{
+    struct _test {
+        StringView pattern; // Regex source handed to the Regex<ECMA262> constructor.
+        StringView subject; // Input passed to re.match().
+        bool matches { true }; // Expected value of the match result's `success` member.
+        ECMAScriptFlags options {}; // Extra flags OR-ed with UnicodeSets; every entry below leaves this defaulted.
+    };
+
+    constexpr _test tests[] { // Each entry is { pattern, subject, expected match }.
+        { "[\\w--x]"sv, "x"sv, false }, // `--`: "x" is expected to be excluded from \w.
+        { "[\\w&&x]"sv, "y"sv, false }, // `&&`: "y" is expected not to match, as it is not "x".
+        { "[\\w--x]"sv, "y"sv, true },
+        { "[\\w&&x]"sv, "x"sv, true },
+        { "[[0-9\\w]--x--6]"sv, "6"sv, false }, // Nested class with two chained `--` operands; both "x" and "6" are expected to be excluded.
+        { "[[0-9\\w]--x--6]"sv, "x"sv, false },
+        { "[[0-9\\w]--x--6]"sv, "y"sv, true },
+        { "[[0-9\\w]--x--6]"sv, "9"sv, true },
+        { "[\\w&&\\d]"sv, "a"sv, false }, // `&&` between two class escapes: only subjects accepted by both are expected to match.
+        { "[\\w&&\\d]"sv, "4"sv, true },
+    };
+
+    for (auto& test : tests) {
+        Regex<ECMA262> re(test.pattern, (ECMAScriptFlags)regex::AllFlags::UnicodeSets | test.options); // UnicodeSets is always set; per-test options are added on top.
+        if constexpr (REGEX_DEBUG) { // Bytecode dump is compiled in only when REGEX_DEBUG is enabled.
+            dbgln("\n");
+            RegexDebug regex_dbg(stderr);
+            regex_dbg.print_raw_bytecode(re);
+            regex_dbg.print_header();
+            regex_dbg.print_bytecode(re);
+            dbgln("\n");
+        }
+
+        EXPECT_EQ(re.parser_result.error, regex::Error::NoError); // Every pattern in the table must parse without error.
+        auto result = re.match(test.subject).success;
+        EXPECT_EQ(result, test.matches); // Compare the actual match outcome with the table's expectation.
+    }
+}
+
 TEST_CASE(ECMA262_property_match)
 {
     struct _test {

diff --git a/Userland/Libraries/LibC/regex.h b/Userland/Libraries/LibC/regex.h
@@ -21,24 +21,25 @@ typedef struct {
 
 enum __Regex_Error {
     __Regex_NoError,
-    __Regex_InvalidPattern,             // Invalid regular expression.
-    __Regex_InvalidCollationElement,    // Invalid collating element referenced.
-    __Regex_InvalidCharacterClass,      // Invalid character class type referenced.
-    __Regex_InvalidTrailingEscape,      // Trailing \ in pattern.
-    __Regex_InvalidNumber,              // Number in \digit invalid or in error.
-    __Regex_MismatchingBracket,         // [ ] imbalance.
-    __Regex_MismatchingParen,           // ( ) imbalance.
-    __Regex_MismatchingBrace,           // { } imbalance.
-    __Regex_InvalidBraceContent,        // Content of {} invalid: not a number, number too large, more than two numbers, first larger than second.
-    __Regex_InvalidBracketContent,      // Content of [] invalid.
-    __Regex_InvalidRange,               // Invalid endpoint in range expression.
-    __Regex_InvalidRepetitionMarker,    // ?, * or + not preceded by valid regular expression.
-    __Regex_ReachedMaxRecursion,        // MaximumRecursion has been reached.
-    __Regex_EmptySubExpression,         // Sub expression has empty content.
-    __Regex_InvalidCaptureGroup,        // Content of capture group is invalid.
-    __Regex_InvalidNameForCaptureGroup, // Name of capture group is invalid.
-    __Regex_InvalidNameForProperty,     // Name of property is invalid.
-    __Regex_DuplicateNamedCapture,      // Duplicate named capture group
+    __Regex_InvalidPattern,              // Invalid regular expression.
+    __Regex_InvalidCollationElement,     // Invalid collating element referenced.
+    __Regex_InvalidCharacterClass,       // Invalid character class type referenced.
+    __Regex_InvalidTrailingEscape,       // Trailing \ in pattern.
+    __Regex_InvalidNumber,               // Number in \digit invalid or in error.
+    __Regex_MismatchingBracket,          // [ ] imbalance.
+    __Regex_MismatchingParen,            // ( ) imbalance.
+    __Regex_MismatchingBrace,            // { } imbalance.
+    __Regex_InvalidBraceContent,         // Content of {} invalid: not a number, number too large, more than two numbers, first larger than second.
+    __Regex_InvalidBracketContent,       // Content of [] invalid.
+    __Regex_InvalidRange,                // Invalid endpoint in range expression.
+    __Regex_InvalidRepetitionMarker,     // ?, * or + not preceded by valid regular expression.
+    __Regex_ReachedMaxRecursion,         // MaximumRecursion has been reached.
+    __Regex_EmptySubExpression,          // Sub expression has empty content.
+    __Regex_InvalidCaptureGroup,         // Content of capture group is invalid.
+    __Regex_InvalidNameForCaptureGroup,  // Name of capture group is invalid.
+    __Regex_InvalidNameForProperty,      // Name of property is invalid.
+    __Regex_DuplicateNamedCapture,       // Duplicate named capture group
+    __Regex_InvalidCharacterClassEscape, // Invalid escaped entity in character class.
 };
 
 enum ReError {
@@ -82,10 +83,11 @@ enum __RegexAllFlags {
     __Regex_Multiline = __Regex_Global << 12,                // Handle newline characters. Match each line, one by one.
     __Regex_SkipTrimEmptyMatches = __Regex_Global << 13,     // Do not remove empty capture group results.
     __Regex_SingleMatch = __Regex_Global << 14,              // Stop after acquiring a single match.
-    __Regex_Internal_Stateful = __Regex_Global << 15,        // Internal flag; enables stateful matches.
-    __Regex_Internal_BrowserExtended = __Regex_Global << 16, // Internal flag; enable browser-specific ECMA262 extensions.
-    __Regex_Internal_ConsiderNewline = __Regex_Global << 17, // Internal flag; allow matchers to consider newlines as line separators.
-    __Regex_Last = __Regex_SingleMatch
+    __Regex_UnicodeSets = __Regex_Global << 15,              // ECMA262 Parser specific: Allow set operations in char classes.
+    __Regex_Internal_Stateful = __Regex_Global << 16,        // Internal flag; enables stateful matches.
+    __Regex_Internal_BrowserExtended = __Regex_Global << 17, // Internal flag; enable browser-specific ECMA262 extensions.
+    __Regex_Internal_ConsiderNewline = __Regex_Global << 18, // Internal flag; allow matchers to consider newlines as line separators.
+    __Regex_Last = __Regex_UnicodeSets,
 };
 
 // Values for the cflags parameter to the regcomp() function:

diff --git a/Userland/Libraries/LibRegex/RegexByteCode.cpp b/Userland/Libraries/LibRegex/RegexByteCode.cpp
@@ -435,6 +435,20 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
     bool inverse { false };
     bool temporary_inverse { false };
     bool reset_temp_inverse { false };
+    struct DisjunctionState { // Bookkeeping for one And/Or group; entries are pushed on And/Or and popped on EndAndOr.
+        bool active { false }; // Set to true by the And/Or handlers; the base entry created by empend() keeps the default.
+        bool is_conjunction { false }; // false for And, true for Or; picks how `fail` is combined. NOTE(review): the name reads inverted relative to And/Or - confirm intent.
+        bool fail { false }; // Running outcome: starts false for And (OR-ed with each failure) and true for Or (AND-ed with each failure).
+        size_t initial_position; // state.string_position when the group was opened; restored after every compare in the group.
+        size_t initial_code_unit_position; // state.string_position_in_code_units when the group was opened; restored likewise.
+        Optional<size_t> last_accepted_position {}; // state.string_position after the most recent compare that did not fail, if any.
+        Optional<size_t> last_accepted_code_unit_position {}; // Code-unit counterpart of last_accepted_position.
+    };
+
+    Vector<DisjunctionState, 4> disjunction_states;
+    disjunction_states.empend();
+
+    auto current_disjunction_state = [&]() -> DisjunctionState& { return disjunction_states.last(); };
 
     auto current_inversion_state = [&]() -> bool { return temporary_inverse ^ inverse; };
 
@@ -602,16 +616,69 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
             auto script = static_cast<Unicode::Script>(m_bytecode->at(offset++));
             compare_script_extension(input, state, script, current_inversion_state(), inverse_matched);
 
+        } else if (compare_type == CharacterCompareType::And) {
+            disjunction_states.append({
+                .active = true,
+                .is_conjunction = false,
+                .fail = false,
+                .initial_position = state.string_position,
+                .initial_code_unit_position = state.string_position_in_code_units,
+            });
+            continue;
+
+        } else if (compare_type == CharacterCompareType::Or) {
+            disjunction_states.append({
+                .active = true,
+                .is_conjunction = true,
+                .fail = true,
+                .initial_position = state.string_position,
+                .initial_code_unit_position = state.string_position_in_code_units,
+            });
+            continue;
+
+        } else if (compare_type == CharacterCompareType::EndAndOr) {
+            auto disjunction_state = disjunction_states.take_last();
+            if (!disjunction_state.fail) {
+                state.string_position = disjunction_state.last_accepted_position.value_or(disjunction_state.initial_position);
+                state.string_position_in_code_units = disjunction_state.last_accepted_code_unit_position.value_or(disjunction_state.initial_code_unit_position);
+            }
+
         } else {
             warnln("Undefined comparison: {}", (int)compare_type);
             VERIFY_NOT_REACHED();
             break;
         }
 
-        if (current_inversion_state() && !inverse && !inverse_matched) {
+        auto& new_disjunction_state = current_disjunction_state();
+        if (current_inversion_state() && (!inverse || new_disjunction_state.active) && !inverse_matched) {
             advance_string_position(state, input.view);
             inverse_matched = true;
         }
+
+        if (new_disjunction_state.active) {
+            auto failed = (!had_zero_length_match && string_position == state.string_position) || state.string_position > input.view.length();
+
+            if (!failed) {
+                new_disjunction_state.last_accepted_position = state.string_position;
+                new_disjunction_state.last_accepted_code_unit_position = state.string_position_in_code_units;
+            }
+
+            if (new_disjunction_state.is_conjunction)
+                new_disjunction_state.fail = failed && new_disjunction_state.fail;
+            else
+                new_disjunction_state.fail = failed || new_disjunction_state.fail;
+
+            state.string_position = new_disjunction_state.initial_position;
+            state.string_position_in_code_units = new_disjunction_state.initial_code_unit_position;
+        }
+    }
+
+    auto& new_disjunction_state = current_disjunction_state();
+    if (new_disjunction_state.active) {
+        if (!new_disjunction_state.fail) {
+            state.string_position = new_disjunction_state.last_accepted_position.value_or(new_disjunction_state.initial_position);
+            state.string_position_in_code_units = new_disjunction_state.last_accepted_code_unit_position.value_or(new_disjunction_state.initial_code_unit_position);
+        }
     }
 
     if (current_inversion_state() && !inverse_matched)
@@ -843,6 +910,12 @@ Vector<CompareTypeAndValuePair> OpCode_Compare::flat_compares() const
             auto count = m_bytecode->at(offset++);
             for (size_t i = 0; i < count; ++i)
                 result.append({ CharacterCompareType::CharRange, m_bytecode->at(offset++) });
+        } else if (compare_type == CharacterCompareType::GeneralCategory
+            || compare_type == CharacterCompareType::Property
+            || compare_type == CharacterCompareType::Script
+            || compare_type == CharacterCompareType::ScriptExtension) {
+            auto value = m_bytecode->at(offset++);
+            result.append({ compare_type, value });
         } else {
             result.append({ compare_type, 0 });
         }
@@ -867,75 +940,82 @@ Vector<String> OpCode_Compare::variable_arguments_to_string(Optional<MatchInput>
             auto ch = m_bytecode->at(offset++);
             auto is_ascii = is_ascii_printable(ch);
             if (is_ascii)
-                result.empend(String::formatted("value='{:c}'", static_cast<char>(ch)));
+                result.empend(String::formatted(" value='{:c}'", static_cast<char>(ch)));
             else
-                result.empend(String::formatted("value={:x}", ch));
+                result.empend(String::formatted(" value={:x}", ch));
 
             if (!view.is_null() && view.length() > string_start_offset) {
                 if (is_ascii) {
                     result.empend(String::formatted(
-                        "compare against: '{}'",
+                        " compare against: '{}'",
                         view.substring_view(string_start_offset, string_start_offset > view.length() ? 0 : 1).to_string()));
                 } else {
                     auto str = view.substring_view(string_start_offset, string_start_offset > view.length() ? 0 : 1).to_string();
                     u8 buf[8] { 0 };
                     __builtin_memcpy(buf, str.characters(), min(str.length(), sizeof(buf)));
-                    result.empend(String::formatted("compare against: {:x},{:x},{:x},{:x},{:x},{:x},{:x},{:x}",
+                    result.empend(String::formatted(" compare against: {:x},{:x},{:x},{:x},{:x},{:x},{:x},{:x}",
                         buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7]));
                 }
             }
         } else if (compare_type == CharacterCompareType::Reference) {
             auto ref = m_bytecode->at(offset++);
-            result.empend(String::formatted("number={}", ref));
+            result.empend(String::formatted(" number={}", ref));
             if (input.has_value()) {
                 if (state().capture_group_matches.size() > input->match_index) {
                     auto& match = state().capture_group_matches[input->match_index];
                     if (match.size() > ref) {
                         auto& group = match[ref];
-                        result.empend(String::formatted("left={}", group.left_column));
-                        result.empend(String::formatted("right={}", group.left_column + group.view.length_in_code_units()));
-                        result.empend(String::formatted("contents='{}'", group.view));
+                        result.empend(String::formatted(" left={}", group.left_column));
+                        result.empend(String::formatted(" right={}", group.left_column + group.view.length_in_code_units()));
+                        result.empend(String::formatted(" contents='{}'", group.view));
                     } else {
-                        result.empend(String::formatted("(invalid ref, max={})", match.size() - 1));
+                        result.empend(String::formatted(" (invalid ref, max={})", match.size() - 1));
                     }
                 } else {
-                    result.empend(String::formatted("(invalid index {}, max={})", input->match_index, state().capture_group_matches.size() - 1));
+                    result.empend(String::formatted(" (invalid index {}, max={})", input->match_index, state().capture_group_matches.size() - 1));
                 }
             }
         } else if (compare_type == CharacterCompareType::String) {
             auto& length = m_bytecode->at(offset++);
             StringBuilder str_builder;
             for (size_t i = 0; i < length; ++i)
                 str_builder.append(m_bytecode->at(offset++));
-            result.empend(String::formatted("value=\"{}\"", str_builder.string_view().substring_view(0, length)));
+            result.empend(String::formatted(" value=\"{}\"", str_builder.string_view().substring_view(0, length)));
             if (!view.is_null() && view.length() > state().string_position)
                 result.empend(String::formatted(
-                    "compare against: \"{}\"",
+                    " compare against: \"{}\"",
                     input.value().view.substring_view(string_start_offset, string_start_offset + length > view.length() ? 0 : length).to_string()));
         } else if (compare_type == CharacterCompareType::CharClass) {
             auto character_class = (CharClass)m_bytecode->at(offset++);
-            result.empend(String::formatted("ch_class={} [{}]", (size_t)character_class, character_class_name(character_class)));
+            result.empend(String::formatted(" ch_class={} [{}]", (size_t)character_class, character_class_name(character_class)));
             if (!view.is_null() && view.length() > state().string_position)
                 result.empend(String::formatted(
-                    "compare against: '{}'",
+                    " compare against: '{}'",
                     input.value().view.substring_view(string_start_offset, state().string_position > view.length() ? 0 : 1).to_string()));
         } else if (compare_type == CharacterCompareType::CharRange) {
             auto value = (CharRange)m_bytecode->at(offset++);
-            result.empend(String::formatted("ch_range={:x}-{:x}", value.from, value.to));
+            result.empend(String::formatted(" ch_range={:x}-{:x}", value.from, value.to));
             if (!view.is_null() && view.length() > state().string_position)
                 result.empend(String::formatted(
-                    "compare against: '{}'",
+                    " compare against: '{}'",
                     input.value().view.substring_view(string_start_offset, state().string_position > view.length() ? 0 : 1).to_string()));
         } else if (compare_type == CharacterCompareType::LookupTable) {
             auto count = m_bytecode->at(offset++);
             for (size_t j = 0; j < count; ++j) {
                 auto range = (CharRange)m_bytecode->at(offset++);
-                result.append(String::formatted("{:x}-{:x}", range.from, range.to));
+                result.append(String::formatted(" {:x}-{:x}", range.from, range.to));
             }
             if (!view.is_null() && view.length() > state().string_position)
                 result.empend(String::formatted(
-                    "compare against: '{}'",
+                    " compare against: '{}'",
                     input.value().view.substring_view(string_start_offset, state().string_position > view.length() ? 0 : 1).to_string()));
+        } else if (compare_type == CharacterCompareType::GeneralCategory
+            || compare_type == CharacterCompareType::Property
+            || compare_type == CharacterCompareType::Script
+            || compare_type == CharacterCompareType::ScriptExtension) {
+
+            auto value = m_bytecode->at(offset++);
+            result.empend(String::formatted(" value={}", value));
         }
     }
     return result;

diff --git a/Userland/Libraries/LibRegex/RegexByteCode.h b/Userland/Libraries/LibRegex/RegexByteCode.h
@@ -76,7 +76,10 @@ enum class OpCodeId : ByteCodeValueType {
     __ENUMERATE_CHARACTER_COMPARE_TYPE(Script)               \
     __ENUMERATE_CHARACTER_COMPARE_TYPE(ScriptExtension)      \
     __ENUMERATE_CHARACTER_COMPARE_TYPE(RangeExpressionDummy) \
-    __ENUMERATE_CHARACTER_COMPARE_TYPE(LookupTable)
+    __ENUMERATE_CHARACTER_COMPARE_TYPE(LookupTable)          \
+    __ENUMERATE_CHARACTER_COMPARE_TYPE(And)                  \
+    __ENUMERATE_CHARACTER_COMPARE_TYPE(Or)                   \
+    __ENUMERATE_CHARACTER_COMPARE_TYPE(EndAndOr)
 
 enum class CharacterCompareType : ByteCodeValueType {
 #define __ENUMERATE_CHARACTER_COMPARE_TYPE(x) x,