Browse source code

LibJS: Implement RegExp.prototype [ @@match ] with UTF-16 code units

Timothy Flynn 4 years ago
parent
commit
2c023157e9

+ 16 - 6
Userland/Libraries/LibJS/Runtime/RegExpPrototype.cpp

@@ -112,7 +112,7 @@ size_t advance_string_index(String const& string, size_t index, bool unicode)
     return advance_string_index(utf16_string_view, index, unicode);
 }
 
-static void increment_last_index(GlobalObject& global_object, Object& regexp_object, String const& string, bool unicode)
+static void increment_last_index(GlobalObject& global_object, Object& regexp_object, Utf16View const& string, bool unicode)
 {
     auto& vm = global_object.vm();
 
@@ -127,6 +127,14 @@ static void increment_last_index(GlobalObject& global_object, Object& regexp_obj
     regexp_object.set(vm.names.lastIndex, Value(last_index), Object::ShouldThrowExceptions::Yes);
 }
 
+static void increment_last_index(GlobalObject& global_object, Object& regexp_object, String const& string, bool unicode) // String overload kept for callers that have not switched to Utf16View.
+{
+    auto utf16_string = AK::utf8_to_utf16(string); // Convert once; this local buffer outlives the view built from it below.
+    Utf16View utf16_string_view { utf16_string };
+
+    return increment_last_index(global_object, regexp_object, utf16_string_view, unicode); // Both overloads return void; this only forwards to the Utf16View overload.
+}
+
 // 1.1.2.1 Match Records, https://tc39.es/proposal-regexp-match-indices/#sec-match-records
 struct Match {
     static Match create(regex::Match const& match)
@@ -485,9 +493,11 @@ JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::symbol_match)
     auto* regexp_object = this_object_from(vm, global_object);
     if (!regexp_object)
         return {};
-    auto s = vm.argument(0).to_string(global_object);
+
+    auto string = vm.argument(0).to_utf16_string(global_object);
     if (vm.exception())
         return {};
+    Utf16View string_view { string };
 
     auto global_value = regexp_object->get(vm.names.global);
     if (vm.exception())
@@ -495,7 +505,7 @@ JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::symbol_match)
     bool global = global_value.to_boolean();
 
     if (!global) {
-        auto result = regexp_exec(global_object, *regexp_object, s);
+        auto result = regexp_exec(global_object, *regexp_object, string_view);
         if (vm.exception())
             return {};
         return result;
@@ -517,7 +527,7 @@ JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::symbol_match)
     size_t n = 0;
 
     while (true) {
-        auto result = regexp_exec(global_object, *regexp_object, s);
+        auto result = regexp_exec(global_object, *regexp_object, string_view);
         if (vm.exception())
             return {};
 
@@ -542,7 +552,7 @@ JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::symbol_match)
             return {};
 
         if (match_str.is_empty()) {
-            increment_last_index(global_object, *regexp_object, s, unicode);
+            increment_last_index(global_object, *regexp_object, string_view, unicode);
             if (vm.exception())
                 return {};
         }
@@ -558,7 +568,7 @@ JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::symbol_match_all)
     if (!regexp_object)
         return {};
 
-    auto string = vm.argument(0).to_string(global_object);
+    auto string = vm.argument(0).to_utf16_string(global_object);
     if (vm.exception())
         return {};
 

+ 2 - 2
Userland/Libraries/LibJS/Runtime/RegExpStringIterator.cpp

@@ -10,12 +10,12 @@
 namespace JS {
 
 // 22.2.7.1 CreateRegExpStringIterator ( R, S, global, fullUnicode ), https://tc39.es/ecma262/#sec-createregexpstringiterator
-RegExpStringIterator* RegExpStringIterator::create(GlobalObject& global_object, Object& regexp_object, String string, bool global, bool unicode)
+RegExpStringIterator* RegExpStringIterator::create(GlobalObject& global_object, Object& regexp_object, Vector<u16> string, bool global, bool unicode) // string is taken by value as a Vector<u16> and moved into the heap-allocated iterator.
 {
     return global_object.heap().allocate<RegExpStringIterator>(global_object, *global_object.regexp_string_iterator_prototype(), regexp_object, move(string), global, unicode);
 }
 
-RegExpStringIterator::RegExpStringIterator(Object& prototype, Object& regexp_object, String string, bool global, bool unicode)
+RegExpStringIterator::RegExpStringIterator(Object& prototype, Object& regexp_object, Vector<u16> string, bool global, bool unicode)
     : Object(prototype)
     , m_regexp_object(regexp_object)
     , m_string(move(string))

+ 5 - 4
Userland/Libraries/LibJS/Runtime/RegExpStringIterator.h

@@ -6,6 +6,7 @@
 
 #pragma once
 
+#include <AK/Utf16View.h>
 #include <LibJS/Runtime/Object.h>
 
 namespace JS {
@@ -14,13 +15,13 @@ class RegExpStringIterator final : public Object {
     JS_OBJECT(RegExpStringIterator, Object);
 
 public:
-    static RegExpStringIterator* create(GlobalObject&, Object& regexp_object, String string, bool global, bool unicode);
+    static RegExpStringIterator* create(GlobalObject&, Object& regexp_object, Vector<u16> string, bool global, bool unicode);
 
-    explicit RegExpStringIterator(Object& prototype, Object& regexp_object, String string, bool global, bool unicode);
+    explicit RegExpStringIterator(Object& prototype, Object& regexp_object, Vector<u16> string, bool global, bool unicode);
     virtual ~RegExpStringIterator() override = default;
 
     Object& regexp_object() { return m_regexp_object; }
-    String const& string() const { return m_string; }
+    Utf16View string() const { return Utf16View { m_string }; }
     bool global() const { return m_global; }
     bool unicode() const { return m_unicode; }
 
@@ -31,7 +32,7 @@ private:
     virtual void visit_edges(Cell::Visitor&) override;
 
     Object& m_regexp_object;
-    String m_string;
+    Vector<u16> m_string;
     bool m_global { false };
     bool m_unicode { false };
     bool m_done { false };

+ 10 - 4
Userland/Libraries/LibJS/Runtime/StringPrototype.cpp

@@ -839,13 +839,16 @@ JS_DEFINE_NATIVE_FUNCTION(StringPrototype::match)
         if (vm.exception())
             return {};
     }
-    auto s = this_object.to_string(global_object);
+
+    auto string = this_object.to_utf16_string(global_object);
     if (vm.exception())
         return {};
+    Utf16View utf16_string_view { string };
+
     auto rx = regexp_create(global_object, regexp, js_undefined());
     if (!rx)
         return {};
-    return rx->invoke(*vm.well_known_symbol_match(), js_string(vm, s));
+    return rx->invoke(*vm.well_known_symbol_match(), js_string(vm, utf16_string_view));
 }
 
 // 22.1.3.12 String.prototype.matchAll ( regexp ), https://tc39.es/ecma262/#sec-string.prototype.matchall
@@ -879,13 +882,16 @@ JS_DEFINE_NATIVE_FUNCTION(StringPrototype::match_all)
         if (vm.exception())
             return {};
     }
-    auto s = this_object.to_string(global_object);
+
+    auto string = this_object.to_utf16_string(global_object);
     if (vm.exception())
         return {};
+    Utf16View utf16_string_view { string };
+
     auto rx = regexp_create(global_object, regexp, js_string(vm, "g"));
     if (!rx)
         return {};
-    return rx->invoke(*vm.well_known_symbol_match_all(), js_string(vm, s));
+    return rx->invoke(*vm.well_known_symbol_match_all(), js_string(vm, utf16_string_view));
 }
 
 // 22.1.3.17 String.prototype.replace ( searchValue, replaceValue ), https://tc39.es/ecma262/#sec-string.prototype.replace

+ 10 - 0
Userland/Libraries/LibJS/Tests/builtins/String/String.prototype.match.js

@@ -45,3 +45,13 @@ test("override exec with non-function", () => {
     re.exec = 3;
     expect("test".match(re)).not.toBeNull();
 });
+
+test("UTF-16", () => { // "😀" is U+1F600, i.e. the UTF-16 surrogate pair \ud83d \ude00; matching must work per code unit.
+    expect("😀".match("foo")).toBeNull(); // No match anywhere in the string.
+    expect("😀".match("\ud83d")).toEqual(["\ud83d"]); // A lone high surrogate matches the first code unit of the pair.
+    expect("😀".match("\ude00")).toEqual(["\ude00"]); // A lone low surrogate matches the second code unit of the pair.
+    expect("😀😀".match("\ud83d")).toEqual(["\ud83d"]); // A string pattern yields a single match even though two are possible.
+    expect("😀😀".match("\ude00")).toEqual(["\ude00"]);
+    expect("😀😀".match(/\ud83d/g)).toEqual(["\ud83d", "\ud83d"]); // With the g flag, every occurrence is reported.
+    expect("😀😀".match(/\ude00/g)).toEqual(["\ude00", "\ude00"]);
+});

+ 60 - 0
Userland/Libraries/LibJS/Tests/builtins/String/String.prototype.matchAll.js

@@ -76,3 +76,63 @@ test("basic functionality", () => {
         expect(next.value).toBeUndefined();
     }
 });
+
+test("UTF-16", () => { // "😀" is U+1F600, i.e. the UTF-16 surrogate pair \ud83d \ude00; match indices are expected in code units.
+    { // No match: the iterator reports done on the first call and on every later call.
+        var iterator = "😀".matchAll("foo");
+
+        var next = iterator.next();
+        expect(next.done).toBeTrue();
+        expect(next.value).toBeUndefined();
+
+        next = iterator.next();
+        expect(next.done).toBeTrue();
+        expect(next.value).toBeUndefined();
+    }
+    { // One emoji: the high surrogate is found exactly once, at code unit 0.
+        var iterator = "😀".matchAll("\ud83d");
+
+        var next = iterator.next();
+        expect(next.done).toBeFalse();
+        expect(next.value).toEqual(["\ud83d"]);
+        expect(next.value.index).toBe(0);
+
+        next = iterator.next();
+        expect(next.done).toBeTrue();
+        expect(next.value).toBeUndefined();
+    }
+    { // Two emoji: the high surrogates are found at code units 0 and 2.
+        var iterator = "😀😀".matchAll("\ud83d");
+
+        var next = iterator.next();
+        expect(next.done).toBeFalse();
+        expect(next.value).toEqual(["\ud83d"]);
+        expect(next.value.index).toBe(0);
+
+        next = iterator.next();
+        expect(next.done).toBeFalse();
+        expect(next.value).toEqual(["\ud83d"]);
+        expect(next.value.index).toBe(2); // The second emoji starts at code unit 2, since each emoji occupies two code units.
+
+        next = iterator.next();
+        expect(next.done).toBeTrue();
+        expect(next.value).toBeUndefined();
+    }
+    { // Two emoji: the low surrogates are found at code units 1 and 3.
+        var iterator = "😀😀".matchAll("\ude00");
+
+        var next = iterator.next();
+        expect(next.done).toBeFalse();
+        expect(next.value).toEqual(["\ude00"]);
+        expect(next.value.index).toBe(1);
+
+        next = iterator.next();
+        expect(next.done).toBeFalse();
+        expect(next.value).toEqual(["\ude00"]);
+        expect(next.value.index).toBe(3);
+
+        next = iterator.next();
+        expect(next.done).toBeTrue();
+        expect(next.value).toBeUndefined();
+    }
+});