AK: Decode paired UTF-16 surrogates in a JSON string

Such escaped surrogate pairs occur in real-world JSON; for example, they are seen on Twitter.
This commit is contained in:
Timothy Flynn 2024-07-02 13:20:10 -04:00 committed by Andreas Kling
parent c39a3fef17
commit 698a95d2de
Notes: sideshowbarker 2024-07-17 08:34:29 +09:00
2 changed files with 33 additions and 13 deletions

View file

@ -105,20 +105,18 @@ ErrorOr<ByteString> JsonParser::consume_and_unescape_string()
case 'u': {
ignore(); // 'u'
if (tell_remaining() < 4)
return Error::from_string_literal("JsonParser: EOF while parsing Unicode escape");
auto escaped_string = consume(4);
auto code_point = AK::StringUtils::convert_to_uint_from_hex(escaped_string);
if (!code_point.has_value()) {
dbgln("JsonParser: Error while parsing Unicode escape {}", escaped_string);
// https://ecma-international.org/wp-content/uploads/ECMA-404_2nd_edition_december_2017.pdf
//
// To escape a code point that is not in the Basic Multilingual Plane, the character may be represented as a
// twelve-character sequence, encoding the UTF-16 surrogate pair corresponding to the code point. So for
// example, a string containing only the G clef character (U+1D11E) may be represented as "\uD834\uDD1E".
// However, whether a processor of JSON texts interprets such a surrogate pair as a single code point or as an
// explicit surrogate pair is a semantic decision that is determined by the specific processor.
auto code_point = decode_single_or_paired_surrogate();
if (code_point.is_error())
return Error::from_string_literal("JsonParser: Error while parsing Unicode escape");
}
// Note/FIXME: "To escape a code point that is not in the Basic Multilingual Plane, the character may be represented as a
// twelve-character sequence, encoding the UTF-16 surrogate pair corresponding to the code point. So for
// example, a string containing only the G clef character (U+1D11E) may be represented as "\uD834\uDD1E".
// However, whether a processor of JSON texts interprets such a surrogate pair as a single code point or as an
// explicit surrogate pair is a semantic decision that is determined by the specific processor."
// ~ECMA-404, 2nd Edition Dec. 2017, page 5
final_sb.append_code_point(code_point.value());
break;
}

View file

@ -79,6 +79,28 @@ TEST_CASE(json_utf8_character)
EXPECT_EQ(json.as_string() == "A", true);
}
TEST_CASE(json_encoded_surrogates)
{
    // A high surrogate escape immediately followed by a low surrogate escape is expected to decode
    // to a single code point (U+1F913), which occupies four bytes in the resulting string.
    {
        auto parsed = JsonValue::from_string("\"\\uD83E\\uDD13\""sv).value();
        EXPECT_EQ(parsed.type(), JsonValue::Type::String);

        auto const& decoded = parsed.as_string();
        EXPECT_EQ(decoded.length(), 4u);
        EXPECT_EQ(decoded, "🤓"sv);
    }

    // A high surrogate escape with no partner is expected to come through on its own,
    // as the three bytes ED A0 BE.
    {
        auto parsed = JsonValue::from_string("\"\\uD83E\""sv).value();
        EXPECT_EQ(parsed.type(), JsonValue::Type::String);

        auto const& decoded = parsed.as_string();
        EXPECT_EQ(decoded.length(), 3u);
        EXPECT_EQ(decoded, "\xED\xA0\xBE"sv);
    }

    // A low surrogate escape with no preceding high surrogate is likewise expected to come
    // through on its own, as the three bytes ED B4 93.
    {
        auto parsed = JsonValue::from_string("\"\\uDD13\""sv).value();
        EXPECT_EQ(parsed.type(), JsonValue::Type::String);

        auto const& decoded = parsed.as_string();
        EXPECT_EQ(decoded.length(), 3u);
        EXPECT_EQ(decoded, "\xED\xB4\x93"sv);
    }
}
/*
FIXME: Parse JSON from a Utf8View