LibTextCodec: Ignore BYTE ORDER MARK at the start of utf8/16 strings

Previously, the BOM was included as part of the output text, which confused the HTML parser. Nobody needs the BOM after we have identified the codec, so we now remove it when converting to UTF-8.
Author: https://github.com/AtkinsSJ Commit: https://github.com/SerenityOS/serenity/commit/d7ffa514249 Pull-request: https://github.com/SerenityOS/serenity/pull/10040
2024-12-11 17:00:37 +00:00 · 2021-09-15 15:33:30 +01:00 · 2021-09-15 15:33:30 +01:00 · d7ffa51424 · 2024-07-18 03:55:29 +09:00
commit d7ffa51424
parent 9ec02e7137
1 changed files with 15 additions and 3 deletions
--- a/Userland/Libraries/LibTextCodec/Decoder.cpp
+++ b/Userland/Libraries/LibTextCodec/Decoder.cpp
@@ -208,7 +208,13 @@ void UTF8Decoder::process(const StringView& input, Function<void(u32)> on_code_p

 String UTF8Decoder::to_utf8(const StringView& input)
 {
-    return input;
+    // Discard a leading UTF-8 byte order mark (EF BB BF), if present, so it is not part of the returned string.
+    auto bomless_input = input;
+    if (auto bytes = input.bytes(); bytes.size() >= 3 && bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF) {
+        bomless_input = input.substring_view(3);
+    }
+
+    return bomless_input;
 }

 void UTF16BEDecoder::process(const StringView& input, Function<void(u32)> on_code_point)
@@ -222,8 +228,14 @@ void UTF16BEDecoder::process(const StringView& input, Function<void(u32)> on_cod

 String UTF16BEDecoder::to_utf8(const StringView& input)
 {
-    StringBuilder builder(input.length() / 2);
-    process(input, [&builder](u32 c) { builder.append_code_point(c); });
+    // Discard a leading UTF-16BE byte order mark (FE FF), if present, so it is not passed to process() and appended to the output.
+    auto bomless_input = input;
+    if (auto bytes = input.bytes(); bytes.size() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF) {
+        bomless_input = input.substring_view(2);
+    }
+
+    StringBuilder builder(bomless_input.length() / 2);
+    process(bomless_input, [&builder](u32 c) { builder.append_code_point(c); });
    return builder.to_string();
 }