LibTextCodec: Add BOM sniffer

This takes the input and sniffs it for a BOM. If it has the UTF-8 or
UTF-16BE BOM, it will return their respective decoder. Currently we
don't have a UTF-16LE decoder, so it will assert TODO if it detects
a UTF-16LE BOM. If there is no recognisable BOM, it will return no
decoder.
This commit is contained in:
Luke Wilde 2022-02-11 20:58:06 +00:00 committed by Andreas Kling
parent 4ccade42b7
commit 94965ba28d
Notes: sideshowbarker 2024-07-17 18:58:31 +09:00
2 changed files with 38 additions and 0 deletions

View file

@ -141,6 +141,41 @@ Optional<String> get_standardized_encoding(const String& encoding)
return {};
}
// https://encoding.spec.whatwg.org/#bom-sniff
// Inspects the leading bytes of `input` for a Unicode byte order mark (BOM).
// Returns the decoder matching the BOM, or nullptr when the input does not start with a
// recognised BOM. At most the first three bytes of `input` are examined.
Decoder* bom_sniff_to_decoder(StringView input)
{
    // 1. Let BOM be the result of peeking 3 bytes from ioQueue, converted to a byte sequence.
    // 2. For each of the rows in the table below, starting with the first one and going down,
    //    if BOM starts with the bytes given in the first column, then return the encoding given
    //    in the cell in the second column of that row. Otherwise, return null.
    //    Byte Order Mark | Encoding
    //    --------------------------
    //    0xEF 0xBB 0xBF  | UTF-8
    //    0xFE 0xFF       | UTF-16BE
    //    0xFF 0xFE       | UTF-16LE
    auto bytes = input.bytes();

    // Every BOM in the table is at least two bytes long.
    if (bytes.size() < 2)
        return nullptr;

    auto first_byte = bytes[0];

    switch (first_byte) {
    case 0xEF: // UTF-8
        // The UTF-8 BOM is the only three-byte entry, so it needs its own length check.
        if (bytes.size() < 3)
            return nullptr;
        return bytes[1] == 0xBB && bytes[2] == 0xBF ? &s_utf8_decoder : nullptr;
    case 0xFE: // UTF-16BE
        return bytes[1] == 0xFF ? &s_utf16be_decoder : nullptr;
    case 0xFF: // UTF-16LE
        // Only the full 0xFF 0xFE pair is a BOM; a leading 0xFF followed by anything else is
        // ordinary data and must not trip the assertion below.
        if (bytes[1] != 0xFE)
            return nullptr;
        // FIXME: There is currently no UTF-16LE decoder.
        TODO();
    }

    return nullptr;
}
String Decoder::to_utf8(StringView input)
{
StringBuilder builder(input.length());

View file

@ -70,4 +70,7 @@ public:
// Looks up a decoder for the given encoding name.
// NOTE(review): presumably yields nullptr when the encoding is unsupported — confirm against the definition.
Decoder* decoder_for(String const& encoding);
// NOTE(review): appears to map an encoding label to its standardized name, yielding an empty
// Optional when nothing matches — confirm against the definition.
Optional<String> get_standardized_encoding(const String& encoding);
// Sniffs the start of the input for a byte order mark (https://encoding.spec.whatwg.org/#bom-sniff).
// Returns the UTF-8 decoder for 0xEF 0xBB 0xBF, the UTF-16BE decoder for 0xFE 0xFF, and nullptr
// when there is no recognisable BOM. A UTF-16LE BOM (0xFF 0xFE) has no decoder yet and
// currently hits TODO().
Decoder* bom_sniff_to_decoder(StringView);
}