LibTextCodec: Ignore BYTE ORDER MARK at the start of utf8/16 strings

Previously, the BOM was included as part of the output text, which
confused the HTML parser. Nobody needs the BOM once we have identified
the codec, so we now remove it when converting to UTF-8.
This commit is contained in:
Sam Atkins 2021-09-15 15:33:30 +01:00 committed by Andreas Kling
parent 9ec02e7137
commit d7ffa51424
Notes: sideshowbarker 2024-07-18 03:55:29 +09:00

View file

@@ -208,7 +208,13 @@ void UTF8Decoder::process(const StringView& input, Function<void(u32)> on_code_p
// Returns the UTF-8 input as a String, with a leading byte order mark
// (the byte sequence EF BB BF) removed when one is present.
String UTF8Decoder::to_utf8(const StringView& input)
{
    // Discard the BOM: it only identifies the encoding and is not part of the text.
    auto bomless_input = input;
    if (auto bytes = input.bytes(); bytes.size() >= 3 && bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF) {
        bomless_input = input.substring_view(3);
    }
    return bomless_input;
}
void UTF16BEDecoder::process(const StringView& input, Function<void(u32)> on_code_point)
@@ -222,8 +228,14 @@ void UTF16BEDecoder::process(const StringView& input, Function<void(u32)> on_cod
// Converts UTF-16BE input to a UTF-8 String, skipping a leading byte order
// mark (the byte sequence FE FF) when one is present.
String UTF16BEDecoder::to_utf8(const StringView& input)
{
    // Discard the BOM: it only identifies the encoding and is not part of the text.
    auto bomless_input = input;
    if (auto bytes = input.bytes(); bytes.size() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF) {
        bomless_input = input.substring_view(2);
    }

    // The builder is constructed from half the remaining byte length
    // (presumably a capacity hint — confirm against StringBuilder).
    StringBuilder builder(bomless_input.length() / 2);
    // Append every code point that process() reports for the BOM-less input.
    process(bomless_input, [&builder](u32 c) { builder.append_code_point(c); });
    return builder.to_string();
}