From b3bf5c4ea84bfbd0ba14fdbc3a86e5d54e053702 Mon Sep 17 00:00:00 2001 From: Shannon Booth Date: Sun, 11 Aug 2024 15:19:47 +1200 Subject: [PATCH] AK: Add BOM handling to String::from_utf8_with_replacement_character --- AK/String.cpp | 5 ++++- AK/String.h | 7 ++++++- Tests/AK/TestString.cpp | 12 +++++++++--- Userland/Libraries/LibWeb/DOMURL/URLSearchParams.cpp | 4 ++-- 4 files changed, 21 insertions(+), 7 deletions(-) diff --git a/AK/String.cpp b/AK/String.cpp index cb642ec01f7..d14d466950d 100644 --- a/AK/String.cpp +++ b/AK/String.cpp @@ -20,8 +20,11 @@ namespace AK { -String String::from_utf8_with_replacement_character(StringView view) +String String::from_utf8_with_replacement_character(StringView view, WithBOMHandling with_bom_handling) { + if (auto bytes = view.bytes(); with_bom_handling == WithBOMHandling::Yes && bytes.size() >= 3 && bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF) + view = view.substring_view(3); + if (Utf8View(view).validate()) return String::from_utf8_without_validation(view.bytes()); diff --git a/AK/String.h b/AK/String.h index 14f8e2101c3..85629d8eb9d 100644 --- a/AK/String.h +++ b/AK/String.h @@ -51,8 +51,13 @@ public: // Creates a new String from a sequence of UTF-8 encoded code points. 
static ErrorOr<String> from_utf8(StringView); + enum class WithBOMHandling { + Yes, + No, + }; + // Creates a new String using the replacement character for invalid bytes - [[nodiscard]] static String from_utf8_with_replacement_character(StringView); + [[nodiscard]] static String from_utf8_with_replacement_character(StringView, WithBOMHandling = WithBOMHandling::Yes); template<typename T> requires(IsOneOf<RemoveCVReference<T>, ByteString, DeprecatedFlyString, FlyString, String>) diff --git a/Tests/AK/TestString.cpp b/Tests/AK/TestString.cpp index ddb012f4efc..4f03db749b5 100644 --- a/Tests/AK/TestString.cpp +++ b/Tests/AK/TestString.cpp @@ -175,15 +175,21 @@ TEST_CASE(invalid_utf8) TEST_CASE(with_replacement_character) { - auto string1 = String::from_utf8_with_replacement_character("long string \xf4\x8f\xbf\xc0"sv); // U+110000 + auto string1 = String::from_utf8_with_replacement_character("long string \xf4\x8f\xbf\xc0"sv, String::WithBOMHandling::No); // U+110000 Array<u8, 24> string1_expected { 0x6c, 0x6f, 0x6e, 0x67, 0x20, 0x73, 0x74, 0x72, 0x69, 0x6e, 0x67, 0x20, 0xef, 0xbf, 0xbd, 0xef, 0xbf, 0xbd, 0xef, 0xbf, 0xbd, 0xef, 0xbf, 0xbd }; EXPECT_EQ(string1.bytes(), string1_expected); - auto string3 = String::from_utf8_with_replacement_character("A valid string!"sv); + auto string3 = String::from_utf8_with_replacement_character("A valid string!"sv, String::WithBOMHandling::No); EXPECT_EQ(string3, "A valid string!"sv); - auto string4 = String::from_utf8_with_replacement_character(""sv); + auto string4 = String::from_utf8_with_replacement_character(""sv, String::WithBOMHandling::No); EXPECT_EQ(string4, ""sv); + + auto string5 = String::from_utf8_with_replacement_character("\xEF\xBB\xBFWHF!"sv, String::WithBOMHandling::Yes); + EXPECT_EQ(string5, "WHF!"sv); + + auto string6 = String::from_utf8_with_replacement_character("\xEF\xBB\xBFWHF!"sv, String::WithBOMHandling::No); + EXPECT_EQ(string6, "\xEF\xBB\xBFWHF!"sv); } TEST_CASE(from_code_points) diff --git a/Userland/Libraries/LibWeb/DOMURL/URLSearchParams.cpp 
b/Userland/Libraries/LibWeb/DOMURL/URLSearchParams.cpp index 62e0863a1f9..a6a2eaa28b1 100644 --- a/Userland/Libraries/LibWeb/DOMURL/URLSearchParams.cpp +++ b/Userland/Libraries/LibWeb/DOMURL/URLSearchParams.cpp @@ -116,8 +116,8 @@ ErrorOr<Vector<QueryParam>> url_decode(StringView input) auto space_decoded_value = value.replace("+"sv, " "sv, ReplaceMode::All); // 5. Let nameString and valueString be the result of running UTF-8 decode without BOM on the percent-decoding of name and value, respectively. - auto name_string = String::from_utf8_with_replacement_character(URL::percent_decode(space_decoded_name)); - auto value_string = String::from_utf8_with_replacement_character(URL::percent_decode(space_decoded_value)); + auto name_string = String::from_utf8_with_replacement_character(URL::percent_decode(space_decoded_name), String::WithBOMHandling::No); + auto value_string = String::from_utf8_with_replacement_character(URL::percent_decode(space_decoded_value), String::WithBOMHandling::No); TRY(output.try_empend(move(name_string), move(value_string))); }