mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2024-11-25 00:50:22 +00:00
AK: Decode paired UTF-16 surrogates in a JSON string
For example, Twitter uses such escaped surrogate pairs in its JSON strings.
This commit is contained in:
parent
c39a3fef17
commit
698a95d2de
Notes:
sideshowbarker
2024-07-17 08:34:29 +09:00
Author: https://github.com/trflynn89 Commit: https://github.com/LadybirdBrowser/ladybird/commit/698a95d2de Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/369
2 changed files with 33 additions and 13 deletions
|
@ -105,20 +105,18 @@ ErrorOr<ByteString> JsonParser::consume_and_unescape_string()
|
|||
case 'u': {
|
||||
ignore(); // 'u'
|
||||
|
||||
if (tell_remaining() < 4)
|
||||
return Error::from_string_literal("JsonParser: EOF while parsing Unicode escape");
|
||||
auto escaped_string = consume(4);
|
||||
auto code_point = AK::StringUtils::convert_to_uint_from_hex(escaped_string);
|
||||
if (!code_point.has_value()) {
|
||||
dbgln("JsonParser: Error while parsing Unicode escape {}", escaped_string);
|
||||
// https://ecma-international.org/wp-content/uploads/ECMA-404_2nd_edition_december_2017.pdf
|
||||
//
|
||||
// To escape a code point that is not in the Basic Multilingual Plane, the character may be represented as a
|
||||
// twelve-character sequence, encoding the UTF-16 surrogate pair corresponding to the code point. So for
|
||||
// example, a string containing only the G clef character (U+1D11E) may be represented as "\uD834\uDD1E".
|
||||
// However, whether a processor of JSON texts interprets such a surrogate pair as a single code point or as an
|
||||
// explicit surrogate pair is a semantic decision that is determined by the specific processor.
|
||||
auto code_point = decode_single_or_paired_surrogate();
|
||||
|
||||
if (code_point.is_error())
|
||||
return Error::from_string_literal("JsonParser: Error while parsing Unicode escape");
|
||||
}
|
||||
// Note/FIXME: "To escape a code point that is not in the Basic Multilingual Plane, the character may be represented as a
|
||||
// twelve-character sequence, encoding the UTF-16 surrogate pair corresponding to the code point. So for
|
||||
// example, a string containing only the G clef character (U+1D11E) may be represented as "\uD834\uDD1E".
|
||||
// However, whether a processor of JSON texts interprets such a surrogate pair as a single code point or as an
|
||||
// explicit surrogate pair is a semantic decision that is determined by the specific processor."
|
||||
// ~ECMA-404, 2nd Edition Dec. 2017, page 5
|
||||
|
||||
final_sb.append_code_point(code_point.value());
|
||||
break;
|
||||
}
|
||||
|
|
|
@ -79,6 +79,28 @@ TEST_CASE(json_utf8_character)
|
|||
EXPECT_EQ(json.as_string() == "A", true);
|
||||
}
|
||||
|
||||
// Checks how JSON string parsing handles \uXXXX escapes that fall in the UTF-16 surrogate range:
// a complete surrogate pair, a lone \uD83E escape, and a lone \uDD13 escape.
TEST_CASE(json_encoded_surrogates)
|
||||
{
|
||||
// Case 1: the escape pair \uD83E\uDD13 is expected to yield a single 4-byte string equal to "🤓".
{
|
||||
auto json = JsonValue::from_string("\"\\uD83E\\uDD13\""sv).value();
|
||||
EXPECT_EQ(json.type(), JsonValue::Type::String);
|
||||
EXPECT_EQ(json.as_string().length(), 4u);
|
||||
EXPECT_EQ(json.as_string(), "🤓"sv);
|
||||
}
|
||||
// Case 2: \uD83E on its own (no following escape) is still accepted as a string; the expected
// result is the 3-byte sequence ED A0 BE rather than a parse error.
{
|
||||
auto json = JsonValue::from_string("\"\\uD83E\""sv).value();
|
||||
EXPECT_EQ(json.type(), JsonValue::Type::String);
|
||||
EXPECT_EQ(json.as_string().length(), 3u);
|
||||
EXPECT_EQ(json.as_string(), "\xED\xA0\xBE"sv);
|
||||
}
|
||||
// Case 3: \uDD13 on its own (no preceding escape) is likewise accepted; the expected result is
// the 3-byte sequence ED B4 93.
{
|
||||
auto json = JsonValue::from_string("\"\\uDD13\""sv).value();
|
||||
EXPECT_EQ(json.type(), JsonValue::Type::String);
|
||||
EXPECT_EQ(json.as_string().length(), 3u);
|
||||
EXPECT_EQ(json.as_string(), "\xED\xB4\x93"sv);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
FIXME: Parse JSON from a Utf8View
|
||||
|
||||
|
|
Loading…
Reference in a new issue