Explorar o código

LibRegex: Use a match table for character classes

Generate a sorted, compressed series of ranges in a match table for
character classes, and use a binary search to find the matches.
This is about a 3-4x speedup for character class match performance. :^)
Ali Mohammad Pur %!s(int64=3) %!d(string=hai) anos
pai
achega
8f722302d9

+ 18 - 0
Tests/LibRegex/Regex.cpp

@@ -910,3 +910,21 @@ TEST_CASE(optimizer_atomic_groups)
         EXPECT_EQ(result.success, test.get<2>());
         EXPECT_EQ(result.success, test.get<2>());
     }
     }
 }
 }
+
+TEST_CASE(optimizer_char_class_lut)
+{
+    Regex<ECMA262> re(R"([\f\n\r\t\v\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000\ufeff]+$)");
+
+    if constexpr (REGEX_DEBUG) {
+        dbgln("\n");
+        RegexDebug regex_dbg(stderr);
+        regex_dbg.print_raw_bytecode(re);
+        regex_dbg.print_header();
+        regex_dbg.print_bytecode(re);
+        dbgln("\n");
+    }
+
+    // This will go through _all_ alternatives in the character class, and then fail.
+    for (size_t i = 0; i < 1'000'000; ++i)
+        EXPECT_EQ(re.match("1635488940000"sv).success, false);
+}

+ 2 - 0
Userland/Libraries/LibRegex/Forward.h

@@ -9,6 +9,8 @@
 #include <AK/Types.h>
 #include <AK/Types.h>
 
 
 namespace regex {
 namespace regex {
+struct CompareTypeAndValuePair;
+
 enum class Error : u8;
 enum class Error : u8;
 class Lexer;
 class Lexer;
 class PosixExtendedParser;
 class PosixExtendedParser;

+ 47 - 0
Userland/Libraries/LibRegex/RegexByteCode.cpp

@@ -7,6 +7,7 @@
 #include "RegexByteCode.h"
 #include "RegexByteCode.h"
 #include "AK/StringBuilder.h"
 #include "AK/StringBuilder.h"
 #include "RegexDebug.h"
 #include "RegexDebug.h"
+#include <AK/BinarySearch.h>
 #include <AK/CharacterTypes.h>
 #include <AK/CharacterTypes.h>
 #include <AK/Debug.h>
 #include <AK/Debug.h>
 #include <LibUnicode/CharacterTypes.h>
 #include <LibUnicode/CharacterTypes.h>
@@ -491,6 +492,38 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
 
 
             compare_character_class(input, state, character_class, ch, current_inversion_state(), inverse_matched);
             compare_character_class(input, state, character_class, ch, current_inversion_state(), inverse_matched);
 
 
+        } else if (compare_type == CharacterCompareType::LookupTable) {
+            if (input.view.length() <= state.string_position)
+                return ExecutionResult::Failed_ExecuteLowPrioForks;
+
+            auto count = m_bytecode->at(offset++);
+            auto range_data = m_bytecode->spans().slice(offset, count);
+            offset += count;
+
+            auto ch = input.view.substring_view(state.string_position, 1)[0];
+
+            auto matching_range = binary_search(range_data, ch, nullptr, [insensitive = input.regex_options & AllFlags::Insensitive](auto needle, CharRange range) {
+                auto from = range.from;
+                auto to = range.to;
+                if (insensitive) {
+                    from = to_ascii_lowercase(from);
+                    to = to_ascii_lowercase(to);
+                    needle = to_ascii_lowercase(needle);
+                }
+                if (needle > range.to)
+                    return 1;
+                if (needle < range.from)
+                    return -1;
+                return 0;
+            });
+
+            if (matching_range) {
+                if (current_inversion_state())
+                    inverse_matched = true;
+                else
+                    advance_string_position(state, input.view, ch);
+            }
+
         } else if (compare_type == CharacterCompareType::CharRange) {
         } else if (compare_type == CharacterCompareType::CharRange) {
             if (input.view.length() <= state.string_position)
             if (input.view.length() <= state.string_position)
                 return ExecutionResult::Failed_ExecuteLowPrioForks;
                 return ExecutionResult::Failed_ExecuteLowPrioForks;
@@ -816,6 +849,10 @@ Vector<CompareTypeAndValuePair> OpCode_Compare::flat_compares() const
         } else if (compare_type == CharacterCompareType::CharRange) {
         } else if (compare_type == CharacterCompareType::CharRange) {
             auto value = m_bytecode->at(offset++);
             auto value = m_bytecode->at(offset++);
             result.append({ compare_type, value });
             result.append({ compare_type, value });
+        } else if (compare_type == CharacterCompareType::LookupTable) {
+            auto count = m_bytecode->at(offset++);
+            for (size_t i = 0; i < count; ++i)
+                result.append({ CharacterCompareType::CharRange, m_bytecode->at(offset++) });
         } else {
         } else {
             result.append({ compare_type, 0 });
             result.append({ compare_type, 0 });
         }
         }
@@ -884,6 +921,16 @@ Vector<String> const OpCode_Compare::variable_arguments_to_string(Optional<Match
                 result.empend(String::formatted(
                 result.empend(String::formatted(
                     "compare against: '{}'",
                     "compare against: '{}'",
                     input.value().view.substring_view(string_start_offset, state().string_position > view.length() ? 0 : 1).to_string()));
                     input.value().view.substring_view(string_start_offset, state().string_position > view.length() ? 0 : 1).to_string()));
+        } else if (compare_type == CharacterCompareType::LookupTable) {
+            auto count = m_bytecode->at(offset++);
+            for (size_t j = 0; j < count; ++j) {
+                auto range = (CharRange)m_bytecode->at(offset++);
+                result.append(String::formatted("{:x}-{:x}", range.from, range.to));
+            }
+            if (!view.is_null() && view.length() > state().string_position)
+                result.empend(String::formatted(
+                    "compare against: '{}'",
+                    input.value().view.substring_view(string_start_offset, state().string_position > view.length() ? 0 : 1).to_string()));
         }
         }
     }
     }
     return result;
     return result;

+ 17 - 35
Userland/Libraries/LibRegex/RegexByteCode.h

@@ -61,21 +61,22 @@ enum class OpCodeId : ByteCodeValueType {
 };
 };
 // clang-format on
 // clang-format on
 
 
-#define ENUMERATE_CHARACTER_COMPARE_TYPES                \
-    __ENUMERATE_CHARACTER_COMPARE_TYPE(Undefined)        \
-    __ENUMERATE_CHARACTER_COMPARE_TYPE(Inverse)          \
-    __ENUMERATE_CHARACTER_COMPARE_TYPE(TemporaryInverse) \
-    __ENUMERATE_CHARACTER_COMPARE_TYPE(AnyChar)          \
-    __ENUMERATE_CHARACTER_COMPARE_TYPE(Char)             \
-    __ENUMERATE_CHARACTER_COMPARE_TYPE(String)           \
-    __ENUMERATE_CHARACTER_COMPARE_TYPE(CharClass)        \
-    __ENUMERATE_CHARACTER_COMPARE_TYPE(CharRange)        \
-    __ENUMERATE_CHARACTER_COMPARE_TYPE(Reference)        \
-    __ENUMERATE_CHARACTER_COMPARE_TYPE(Property)         \
-    __ENUMERATE_CHARACTER_COMPARE_TYPE(GeneralCategory)  \
-    __ENUMERATE_CHARACTER_COMPARE_TYPE(Script)           \
-    __ENUMERATE_CHARACTER_COMPARE_TYPE(ScriptExtension)  \
-    __ENUMERATE_CHARACTER_COMPARE_TYPE(RangeExpressionDummy)
+#define ENUMERATE_CHARACTER_COMPARE_TYPES                    \
+    __ENUMERATE_CHARACTER_COMPARE_TYPE(Undefined)            \
+    __ENUMERATE_CHARACTER_COMPARE_TYPE(Inverse)              \
+    __ENUMERATE_CHARACTER_COMPARE_TYPE(TemporaryInverse)     \
+    __ENUMERATE_CHARACTER_COMPARE_TYPE(AnyChar)              \
+    __ENUMERATE_CHARACTER_COMPARE_TYPE(Char)                 \
+    __ENUMERATE_CHARACTER_COMPARE_TYPE(String)               \
+    __ENUMERATE_CHARACTER_COMPARE_TYPE(CharClass)            \
+    __ENUMERATE_CHARACTER_COMPARE_TYPE(CharRange)            \
+    __ENUMERATE_CHARACTER_COMPARE_TYPE(Reference)            \
+    __ENUMERATE_CHARACTER_COMPARE_TYPE(Property)             \
+    __ENUMERATE_CHARACTER_COMPARE_TYPE(GeneralCategory)      \
+    __ENUMERATE_CHARACTER_COMPARE_TYPE(Script)               \
+    __ENUMERATE_CHARACTER_COMPARE_TYPE(ScriptExtension)      \
+    __ENUMERATE_CHARACTER_COMPARE_TYPE(RangeExpressionDummy) \
+    __ENUMERATE_CHARACTER_COMPARE_TYPE(LookupTable)
 
 
 enum class CharacterCompareType : ByteCodeValueType {
 enum class CharacterCompareType : ByteCodeValueType {
 #define __ENUMERATE_CHARACTER_COMPARE_TYPE(x) x,
 #define __ENUMERATE_CHARACTER_COMPARE_TYPE(x) x,
@@ -186,26 +187,7 @@ public:
 
 
     void insert_bytecode_compare_values(Vector<CompareTypeAndValuePair>&& pairs)
     void insert_bytecode_compare_values(Vector<CompareTypeAndValuePair>&& pairs)
     {
     {
-        ByteCode bytecode;
-
-        bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::Compare));
-        bytecode.empend(pairs.size()); // number of arguments
-
-        ByteCode arguments;
-        for (auto& value : pairs) {
-            VERIFY(value.type != CharacterCompareType::RangeExpressionDummy);
-            VERIFY(value.type != CharacterCompareType::Undefined);
-            VERIFY(value.type != CharacterCompareType::String);
-
-            arguments.append((ByteCodeValueType)value.type);
-            if (value.type != CharacterCompareType::Inverse && value.type != CharacterCompareType::AnyChar && value.type != CharacterCompareType::TemporaryInverse)
-                arguments.append(move(value.value));
-        }
-
-        bytecode.empend(arguments.size()); // size of arguments
-        bytecode.extend(move(arguments));
-
-        extend(move(bytecode));
+        Optimizer::append_character_class(*this, move(pairs));
     }
     }
 
 
     void insert_bytecode_check_boundary(BoundaryCheckType type)
     void insert_bytecode_check_boundary(BoundaryCheckType type)

+ 2 - 0
Userland/Libraries/LibRegex/RegexBytecodeStreamOptimizer.h

@@ -7,12 +7,14 @@
 #pragma once
 #pragma once
 
 
 #include "Forward.h"
 #include "Forward.h"
+#include <AK/Vector.h>
 
 
 namespace regex {
 namespace regex {
 
 
 class Optimizer {
 class Optimizer {
 public:
 public:
     static void append_alternation(ByteCode& target, ByteCode&& left, ByteCode&& right);
     static void append_alternation(ByteCode& target, ByteCode&& left, ByteCode&& right);
+    static void append_character_class(ByteCode& target, Vector<CompareTypeAndValuePair>&& pairs);
 };
 };
 
 
 }
 }

+ 146 - 0
Userland/Libraries/LibRegex/RegexOptimizer.cpp

@@ -488,6 +488,152 @@ void Optimizer::append_alternation(ByteCode& target, ByteCode&& left, ByteCode&&
     // LABEL _END = alterantive_bytecode.size
     // LABEL _END = alterantive_bytecode.size
 }
 }
 
 
+enum class LookupTableInsertionOutcome {
+    Successful,
+    ReplaceWithAnyChar,
+    TemporaryInversionNeeded,
+    PermanentInversionNeeded,
+    CannotPlaceInTable,
+};
+static LookupTableInsertionOutcome insert_into_lookup_table(RedBlackTree<ByteCodeValueType, CharRange>& table, CompareTypeAndValuePair pair)
+{
+    switch (pair.type) {
+    case CharacterCompareType::Inverse:
+        return LookupTableInsertionOutcome::PermanentInversionNeeded;
+    case CharacterCompareType::TemporaryInverse:
+        return LookupTableInsertionOutcome::TemporaryInversionNeeded;
+    case CharacterCompareType::AnyChar:
+        return LookupTableInsertionOutcome::ReplaceWithAnyChar;
+    case CharacterCompareType::CharClass:
+        return LookupTableInsertionOutcome::CannotPlaceInTable;
+    case CharacterCompareType::Char:
+        table.insert(pair.value, { (u32)pair.value, (u32)pair.value });
+        break;
+    case CharacterCompareType::CharRange: {
+        CharRange range { pair.value };
+        table.insert(range.from, range);
+        break;
+    }
+    case CharacterCompareType::Reference:
+    case CharacterCompareType::Property:
+    case CharacterCompareType::GeneralCategory:
+    case CharacterCompareType::Script:
+    case CharacterCompareType::ScriptExtension:
+        return LookupTableInsertionOutcome::CannotPlaceInTable;
+    case CharacterCompareType::Undefined:
+    case CharacterCompareType::RangeExpressionDummy:
+    case CharacterCompareType::String:
+    case CharacterCompareType::LookupTable:
+        VERIFY_NOT_REACHED();
+    }
+
+    return LookupTableInsertionOutcome::Successful;
+}
+
+void Optimizer::append_character_class(ByteCode& target, Vector<CompareTypeAndValuePair>&& pairs)
+{
+    ByteCode arguments;
+    size_t argument_count = 0;
+
+    if (pairs.size() <= 1) {
+        for (auto& pair : pairs) {
+            arguments.append(to_underlying(pair.type));
+            if (pair.type != CharacterCompareType::AnyChar && pair.type != CharacterCompareType::TemporaryInverse && pair.type != CharacterCompareType::Inverse)
+                arguments.append(pair.value);
+            ++argument_count;
+        }
+    } else {
+        RedBlackTree<ByteCodeValueType, CharRange> table;
+        RedBlackTree<ByteCodeValueType, CharRange> inverted_table;
+        auto* current_table = &table;
+        auto* current_inverted_table = &inverted_table;
+        bool invert_for_next_iteration = false;
+        bool is_currently_inverted = false;
+
+        for (auto& value : pairs) {
+            auto should_invert_after_this_iteration = invert_for_next_iteration;
+            invert_for_next_iteration = false;
+
+            auto insertion_result = insert_into_lookup_table(*current_table, value);
+            switch (insertion_result) {
+            case LookupTableInsertionOutcome::Successful:
+                break;
+            case LookupTableInsertionOutcome::ReplaceWithAnyChar: {
+                table.clear();
+                inverted_table.clear();
+                arguments.append(to_underlying(CharacterCompareType::AnyChar));
+                ++argument_count;
+                break;
+            }
+            case LookupTableInsertionOutcome::TemporaryInversionNeeded:
+                swap(current_table, current_inverted_table);
+                invert_for_next_iteration = true;
+                is_currently_inverted = !is_currently_inverted;
+                break;
+            case LookupTableInsertionOutcome::PermanentInversionNeeded:
+                swap(current_table, current_inverted_table);
+                is_currently_inverted = !is_currently_inverted;
+                break;
+            case LookupTableInsertionOutcome::CannotPlaceInTable:
+                if (is_currently_inverted) {
+                    arguments.append(to_underlying(CharacterCompareType::TemporaryInverse));
+                    ++argument_count;
+                }
+                arguments.append(to_underlying(value.type));
+                arguments.append(value.value);
+                ++argument_count;
+                break;
+            }
+
+            if (should_invert_after_this_iteration) {
+                swap(current_table, current_inverted_table);
+                is_currently_inverted = !is_currently_inverted;
+            }
+        }
+        auto append_table = [&](auto& table) {
+            ++argument_count;
+            arguments.append(to_underlying(CharacterCompareType::LookupTable));
+            auto size_index = arguments.size();
+            arguments.append(0);
+            Optional<CharRange> active_range;
+            size_t range_count = 0;
+            for (auto& range : table) {
+                if (!active_range.has_value()) {
+                    active_range = range;
+                    continue;
+                }
+
+                if (range.from <= active_range->to + 1 && range.to + 1 >= active_range->from) {
+                    active_range = CharRange { min(range.from, active_range->from), max(range.to, active_range->to) };
+                } else {
+                    ++range_count;
+                    arguments.append(active_range.release_value());
+                    active_range = range;
+                }
+            }
+            if (active_range.has_value()) {
+                ++range_count;
+                arguments.append(active_range.release_value());
+            }
+            arguments[size_index] = range_count;
+        };
+
+        if (!table.is_empty())
+            append_table(table);
+
+        if (!inverted_table.is_empty()) {
+            ++argument_count;
+            arguments.append(to_underlying(CharacterCompareType::TemporaryInverse));
+            append_table(inverted_table);
+        }
+    }
+
+    target.empend(static_cast<ByteCodeValueType>(OpCodeId::Compare));
+    target.empend(argument_count);   // number of arguments
+    target.empend(arguments.size()); // size of arguments
+    target.extend(move(arguments));
+}
+
 template void Regex<PosixBasicParser>::run_optimization_passes();
 template void Regex<PosixBasicParser>::run_optimization_passes();
 template void Regex<PosixExtendedParser>::run_optimization_passes();
 template void Regex<PosixExtendedParser>::run_optimization_passes();
 template void Regex<ECMA262Parser>::run_optimization_passes();
 template void Regex<ECMA262Parser>::run_optimization_passes();