From d7ffa5142498095644055f11aba99bad351221d5 Mon Sep 17 00:00:00 2001
From: Sam Atkins
Date: Wed, 15 Sep 2021 15:33:30 +0100
Subject: [PATCH] LibTextCodec: Ignore BYTE ORDER MARK at the start of utf8/16 strings

Before, this was getting included as part of the output text, which
was confusing the HTML parser. Nobody needs the BOM after we have
identified the codec, so now we remove it when converting to UTF-8.
---
 Userland/Libraries/LibTextCodec/Decoder.cpp | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/Userland/Libraries/LibTextCodec/Decoder.cpp b/Userland/Libraries/LibTextCodec/Decoder.cpp
index 262489ee7a8..7a1bc03c015 100644
--- a/Userland/Libraries/LibTextCodec/Decoder.cpp
+++ b/Userland/Libraries/LibTextCodec/Decoder.cpp
@@ -208,7 +208,13 @@ void UTF8Decoder::process(const StringView& input, Function<void(u32)> on_code_p
 
 String UTF8Decoder::to_utf8(const StringView& input)
 {
-    return input;
+    // Discard the BOM
+    auto bomless_input = input;
+    if (auto bytes = input.bytes(); bytes.size() >= 3 && bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF) {
+        bomless_input = input.substring_view(3);
+    }
+
+    return bomless_input;
 }
 
 void UTF16BEDecoder::process(const StringView& input, Function<void(u32)> on_code_point)
@@ -222,8 +228,14 @@ void UTF16BEDecoder::process(const StringView& input, Function<void(u32)> on_cod
 
 String UTF16BEDecoder::to_utf8(const StringView& input)
 {
-    StringBuilder builder(input.length() / 2);
-    process(input, [&builder](u32 c) { builder.append_code_point(c); });
+    // Discard the BOM
+    auto bomless_input = input;
+    if (auto bytes = input.bytes(); bytes.size() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF) {
+        bomless_input = input.substring_view(2);
+    }
+
+    StringBuilder builder(bomless_input.length() / 2);
+    process(bomless_input, [&builder](u32 c) { builder.append_code_point(c); });
     return builder.to_string();
 }
 