AK: Decode paired UTF-16 surrogates in a JSON string

Such surrogate-pair escapes are seen in the wild, for example in JSON served by Twitter.
Author: https://github.com/trflynn89 Commit: https://github.com/LadybirdBrowser/ladybird/commit/698a95d2de Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/369
2024-11-25 00:50:22 +00:00 · 2024-07-02 13:20:10 -04:00 · 2024-07-02 13:20:10 -04:00 · 698a95d2de · 2024-07-17 08:34:29 +09:00
commit 698a95d2de
parent c39a3fef17
2 changed files with 33 additions and 13 deletions
--- a/AK/JsonParser.cpp
+++ b/AK/JsonParser.cpp
@@ -105,20 +105,18 @@ ErrorOr<ByteString> JsonParser::consume_and_unescape_string()
        case 'u': {
            ignore(); // 'u'

-            if (tell_remaining() < 4)
-                return Error::from_string_literal("JsonParser: EOF while parsing Unicode escape");
-            auto escaped_string = consume(4);
-            auto code_point = AK::StringUtils::convert_to_uint_from_hex(escaped_string);
-            if (!code_point.has_value()) {
-                dbgln("JsonParser: Error while parsing Unicode escape {}", escaped_string);
+            // https://ecma-international.org/wp-content/uploads/ECMA-404_2nd_edition_december_2017.pdf
+            //
+            // To escape a code point that is not in the Basic Multilingual Plane, the character may be represented as a
+            // twelve-character sequence, encoding the UTF-16 surrogate pair corresponding to the code point. So for
+            // example, a string containing only the G clef character (U+1D11E) may be represented as "\uD834\uDD1E".
+            // However, whether a processor of JSON texts interprets such a surrogate pair as a single code point or as an
+            // explicit surrogate pair is a semantic decision that is determined by the specific processor.
+            auto code_point = decode_single_or_paired_surrogate();
+
+            if (code_point.is_error())
                return Error::from_string_literal("JsonParser: Error while parsing Unicode escape");
-            }
-            // Note/FIXME: "To escape a code point that is not in the Basic Multilingual Plane, the character may be represented as a
-            //              twelve-character sequence, encoding the UTF-16 surrogate pair corresponding to the code point. So for
-            //              example, a string containing only the G clef character (U+1D11E) may be represented as "\uD834\uDD1E".
-            //              However, whether a processor of JSON texts interprets such a surrogate pair as a single code point or as an
-            //              explicit surrogate pair is a semantic decision that is determined by the specific processor."
-            //             ~ECMA-404, 2nd Edition Dec. 2017, page 5
+
            final_sb.append_code_point(code_point.value());
            break;
        }
--- a/Tests/AK/TestJSON.cpp
+++ b/Tests/AK/TestJSON.cpp
@@ -79,6 +79,28 @@ TEST_CASE(json_utf8_character)
    EXPECT_EQ(json.as_string() == "A", true);
 }

+TEST_CASE(json_encoded_surrogates)
+{
+    {
+        auto json = JsonValue::from_string("\"\\uD83E\\uDD13\""sv).value();
+        EXPECT_EQ(json.type(), JsonValue::Type::String);
+        EXPECT_EQ(json.as_string().length(), 4u);
+        EXPECT_EQ(json.as_string(), "🤓"sv);
+    }
+    {
+        auto json = JsonValue::from_string("\"\\uD83E\""sv).value();
+        EXPECT_EQ(json.type(), JsonValue::Type::String);
+        EXPECT_EQ(json.as_string().length(), 3u);
+        EXPECT_EQ(json.as_string(), "\xED\xA0\xBE"sv);
+    }
+    {
+        auto json = JsonValue::from_string("\"\\uDD13\""sv).value();
+        EXPECT_EQ(json.type(), JsonValue::Type::String);
+        EXPECT_EQ(json.as_string().length(), 3u);
+        EXPECT_EQ(json.as_string(), "\xED\xB4\x93"sv);
+    }
+}
+
 /*
 FIXME: Parse JSON from a Utf8View