浏览代码

LibRegex: Change bytecode value type to a 64-bit value

This allows Unicode ranges to be stored compactly. The wider type is not
utilised at the moment, but changing it later would be significantly more
difficult.
Also fixes a few debug logs.
AnotherTest 4 年之前
父节点
当前提交
3db8ced4c7

+ 4 - 4
Libraries/LibRegex/RegexByteCode.cpp

@@ -275,7 +275,7 @@ ALWAYS_INLINE ExecutionResult OpCode_SaveRightNamedCaptureGroup::execute(const M
         auto& map = output.named_capture_group_matches.at(input.match_index);
 
 #ifdef REGEX_DEBUG
-        ASSERT(start_position + length < input.view.length());
+        ASSERT(start_position + length <= input.view.length());
         dbg() << "Save named capture group with name=" << capture_group_name << " and content: " << input.view.substring_view(start_position, length).to_string();
 #endif
 
@@ -415,7 +415,7 @@ ALWAYS_INLINE bool OpCode_Compare::compare_string(const MatchInput& input, Match
     return false;
 }
 
-ALWAYS_INLINE void OpCode_Compare::compare_character_class(const MatchInput& input, MatchState& state, CharClass character_class, char ch, bool inverse, bool& inverse_matched)
+ALWAYS_INLINE void OpCode_Compare::compare_character_class(const MatchInput& input, MatchState& state, CharClass character_class, u32 ch, bool inverse, bool& inverse_matched)
 {
     switch (character_class) {
     case CharClass::Alnum:
@@ -513,7 +513,7 @@ ALWAYS_INLINE void OpCode_Compare::compare_character_class(const MatchInput& inp
     }
 }
 
-ALWAYS_INLINE void OpCode_Compare::compare_character_range(const MatchInput& input, MatchState& state, char from, char to, char ch, bool inverse, bool& inverse_matched)
+ALWAYS_INLINE void OpCode_Compare::compare_character_range(const MatchInput& input, MatchState& state, u32 from, u32 to, u32 ch, bool inverse, bool& inverse_matched)
 {
     if (input.regex_options & AllFlags::Insensitive) {
         from = tolower(from);
@@ -553,7 +553,7 @@ const Vector<String> OpCode_Compare::variable_arguments_to_string(Optional<Match
         } else if (compare_type == CharacterCompareType::String) {
             char* str = reinterpret_cast<char*>(m_bytecode->at(offset++));
             auto& length = m_bytecode->at(offset++);
-            result.empend(String::format("value=\"%s\"", String { str, length }.characters()));
+            result.empend(String::format("value=\"%.*s\"", length, str));
             if (!view.is_null())
                 result.empend(String::format("compare against: \"%s\"", input.value().view.substring_view(state().string_position, state().string_position + length > view.length() ? 0 : length).to_string().characters()));
         } else if (compare_type == CharacterCompareType::CharClass) {

+ 10 - 10
Libraries/LibRegex/RegexByteCode.h

@@ -39,7 +39,7 @@
 
 namespace regex {
 
-using ByteCodeValueType = size_t;
+using ByteCodeValueType = u64;
 
 #define ENUMERATE_OPCODES                          \
     __ENUMERATE_OPCODE(Compare)                    \
@@ -102,22 +102,22 @@ enum class CharClass : ByteCodeValueType {
 };
 
 struct CharRange {
-    const char from;
-    const char to;
+    const u32 from;
+    const u32 to;
 
-    CharRange(size_t value)
-        : from(value >> 8)
-        , to(value & 0xFF)
+    CharRange(u64 value)
+        : from(value >> 32)
+        , to(value & 0xffffffff)
     {
     }
 
-    CharRange(char from, char to)
+    CharRange(u32 from, u32 to)
         : from(from)
         , to(to)
     {
     }
 
-    operator ByteCodeValueType() const { return (from << 8) | to; }
+    operator ByteCodeValueType() const { return ((u64)from << 32) | to; }
 };
 
 struct CompareTypeAndValuePair {
@@ -568,8 +568,8 @@ public:
 private:
     ALWAYS_INLINE static void compare_char(const MatchInput& input, MatchState& state, u32 ch1, bool inverse, bool& inverse_matched);
     ALWAYS_INLINE static bool compare_string(const MatchInput& input, MatchState& state, const char* str, size_t length);
-    ALWAYS_INLINE static void compare_character_class(const MatchInput& input, MatchState& state, CharClass character_class, char ch, bool inverse, bool& inverse_matched);
-    ALWAYS_INLINE static void compare_character_range(const MatchInput& input, MatchState& state, char from, char to, char ch, bool inverse, bool& inverse_matched);
+    ALWAYS_INLINE static void compare_character_class(const MatchInput& input, MatchState& state, CharClass character_class, u32 ch, bool inverse, bool& inverse_matched);
+    ALWAYS_INLINE static void compare_character_range(const MatchInput& input, MatchState& state, u32 from, u32 to, u32 ch, bool inverse, bool& inverse_matched);
 };
 
 template<typename T>

+ 2 - 2
Libraries/LibRegex/RegexMatcher.cpp

@@ -131,7 +131,7 @@ RegexResult Matcher<Parser>::match(const Vector<RegexStringView> views, Optional
     for (auto& view : views) {
         input.view = view;
 #ifdef REGEX_DEBUG
-        dbg() << "[match] Starting match with view (" << view.length() << "): _" << view << "_";
+        dbg() << "[match] Starting match with view (" << view.length() << "): _" << view.to_string() << "_";
 #endif
 
         auto view_length = view.length();
@@ -171,7 +171,7 @@ RegexResult Matcher<Parser>::match(const Vector<RegexStringView> views, Optional
 
 #ifdef REGEX_DEBUG
                 dbg() << "state.string_position: " << state.string_position << " view_index: " << view_index;
-                dbg() << "[match] Found a match (length = " << state.string_position - view_index << "): " << input.view.substring_view(view_index, state.string_position - view_index);
+                dbg() << "[match] Found a match (length = " << state.string_position - view_index << "): " << input.view.substring_view(view_index, state.string_position - view_index).to_string();
 #endif
                 ++match_count;
 

+ 1 - 1
Libraries/LibRegex/RegexParser.cpp

@@ -365,7 +365,7 @@ ALWAYS_INLINE bool PosixExtendedParser::parse_bracket_expression(ByteCode& stack
             values.take_last(); // RangeExpressionDummy
             auto value1 = values.take_last();
 
-            values.append({ CharacterCompareType::CharRange, static_cast<ByteCodeValueType>(CharRange { (char)value1.value, (char)value2.value }) });
+            values.append({ CharacterCompareType::CharRange, static_cast<ByteCodeValueType>(CharRange { (u32)value1.value, (u32)value2.value }) });
         }
     }
 

+ 2 - 3
Libraries/LibRegex/Tests/CMakeLists.txt

@@ -1,10 +1,9 @@
 file(GLOB TEST_SOURCES CONFIGURE_DEPENDS "*.cpp")
-file(GLOB REGEX_SOURCES CONFIGURE_DEPENDS "../*.cpp")
-file(GLOB C_REGEX_SOURCES CONFIGURE_DEPENDS "../C/*.cpp")
+file(GLOB REGEX_SOURCES CONFIGURE_DEPENDS "../*.cpp" "../C/*.cpp")
 
 foreach(source ${TEST_SOURCES})
     get_filename_component(name ${source} NAME_WE)
-    add_executable(${name} ${source} ${REGEX_SOURCES} ${C_REGEX_SOURCES})
+    add_executable(${name} ${source} ${REGEX_SOURCES})
     target_link_libraries(${name} LagomCore)
     add_test(
         NAME ${name}