mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2024-12-04 05:20:30 +00:00
LibTextCodec: Add BOM sniffer
This takes the input and sniffs it for a BOM. If it has the UTF-8 or UTF-16BE BOM, it will return their respective decoder. Currently we don't have a UTF-16LE decoder, so it will assert TODO if it detects a UTF-16LE BOM. If there is no recognisable BOM, it will return no decoder.
This commit is contained in:
parent
4ccade42b7
commit
94965ba28d
Notes:
sideshowbarker
2024-07-17 18:58:31 +09:00
Author: https://github.com/Lubrsi Commit: https://github.com/SerenityOS/serenity/commit/94965ba28d Pull-request: https://github.com/SerenityOS/serenity/pull/12448
2 changed files with 38 additions and 0 deletions
|
@ -141,6 +141,41 @@ Optional<String> get_standardized_encoding(const String& encoding)
|
|||
return {};
|
||||
}
|
||||
|
||||
// https://encoding.spec.whatwg.org/#bom-sniff
|
||||
Decoder* bom_sniff_to_decoder(StringView input)
|
||||
{
|
||||
// 1. Let BOM be the result of peeking 3 bytes from ioQueue, converted to a byte sequence.
|
||||
// 2. For each of the rows in the table below, starting with the first one and going down,
|
||||
// if BOM starts with the bytes given in the first column, then return the encoding given
|
||||
// in the cell in the second column of that row. Otherwise, return null.
|
||||
|
||||
// Byte Order Mark | Encoding
|
||||
// --------------------------
|
||||
// 0xEF 0xBB 0xBF | UTF-8
|
||||
// 0xFE 0xFF | UTF-16BE
|
||||
// 0xFF 0xFE | UTF-16LE
|
||||
|
||||
auto bytes = input.bytes();
|
||||
if (bytes.size() < 2)
|
||||
return nullptr;
|
||||
|
||||
auto first_byte = bytes[0];
|
||||
|
||||
switch (first_byte) {
|
||||
case 0xEF: // UTF-8
|
||||
if (bytes.size() < 3)
|
||||
return nullptr;
|
||||
return bytes[1] == 0xBB && bytes[2] == 0xBF ? &s_utf8_decoder : nullptr;
|
||||
case 0xFE: // UTF-16BE
|
||||
return bytes[1] == 0xFF ? &s_utf16be_decoder : nullptr;
|
||||
case 0xFF: // UTF-16LE
|
||||
// FIXME: There is currently no UTF-16LE decoder.
|
||||
TODO();
|
||||
}
|
||||
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
String Decoder::to_utf8(StringView input)
|
||||
{
|
||||
StringBuilder builder(input.length());
|
||||
|
|
|
@ -70,4 +70,7 @@ public:
|
|||
Decoder* decoder_for(String const& encoding);
|
||||
Optional<String> get_standardized_encoding(const String& encoding);
|
||||
|
||||
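// Sniffs the start of the input for a Unicode BOM and returns the matching decoder (UTF-8 or UTF-16BE),
// or nullptr if no BOM is recognised.
// NOTE: A UTF-16LE BOM currently triggers TODO(), as there is no UTF-16LE decoder yet.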
|
||||
Decoder* bom_sniff_to_decoder(StringView);
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue