Browse Source

LibTextCodec: Add BOM sniffer

This takes the input and sniffs it for a BOM. If it finds a UTF-8 or
UTF-16BE BOM, it returns the corresponding decoder. There is currently
no UTF-16LE decoder, so it hits TODO() if it detects a UTF-16LE BOM.
If there is no recognisable BOM, it returns no decoder (nullptr).
Luke Wilde 3 years ago
parent
commit
94965ba28d

+ 35 - 0
Userland/Libraries/LibTextCodec/Decoder.cpp

@@ -141,6 +141,41 @@ Optional<String> get_standardized_encoding(const String& encoding)
     return {};
 }
 
+// https://encoding.spec.whatwg.org/#bom-sniff
+// Returns the decoder matching the byte order mark at the start of `input`,
+// or nullptr if `input` does not start with a recognised BOM.
+Decoder* bom_sniff_to_decoder(StringView input)
+{
+    // 1. Let BOM be the result of peeking 3 bytes from ioQueue, converted to a byte sequence.
+    // 2. For each of the rows in the table below, starting with the first one and going down,
+    //    if BOM starts with the bytes given in the first column, then return the encoding given
+    //    in the cell in the second column of that row. Otherwise, return null.
+
+    // Byte Order Mark | Encoding
+    // --------------------------
+    // 0xEF 0xBB 0xBF  | UTF-8
+    // 0xFE 0xFF       | UTF-16BE
+    // 0xFF 0xFE       | UTF-16LE
+
+    auto bytes = input.bytes();
+
+    // Every BOM in the table is at least two bytes long, so shorter input cannot start with one.
+    if (bytes.size() < 2)
+        return nullptr;
+
+    switch (bytes[0]) {
+    case 0xEF: // UTF-8
+        // The UTF-8 BOM is the only three-byte BOM, so it needs its own length check.
+        if (bytes.size() < 3)
+            return nullptr;
+        return bytes[1] == 0xBB && bytes[2] == 0xBF ? &s_utf8_decoder : nullptr;
+    case 0xFE: // UTF-16BE
+        return bytes[1] == 0xFF ? &s_utf16be_decoder : nullptr;
+    case 0xFF: // UTF-16LE
+        // A leading 0xFF alone is not a BOM; only the full 0xFF 0xFE sequence is.
+        // Without this check, any input starting with 0xFF would hit the TODO() below.
+        if (bytes[1] != 0xFE)
+            return nullptr;
+        // FIXME: There is currently no UTF-16LE decoder.
+        TODO();
+    }
+
+    return nullptr;
+}
+
 String Decoder::to_utf8(StringView input)
 {
     StringBuilder builder(input.length());

+ 3 - 0
Userland/Libraries/LibTextCodec/Decoder.h

@@ -70,4 +70,7 @@ public:
 Decoder* decoder_for(String const& encoding);
 Optional<String> get_standardized_encoding(const String& encoding);
 
+// Sniffs the start of the input for a byte order mark (BOM) and returns the matching
+// Unicode decoder, or nullptr if there is no appropriate decoder.
+// NOTE: A UTF-16LE BOM currently hits TODO(), as there is no UTF-16LE decoder yet.
+Decoder* bom_sniff_to_decoder(StringView input);
+
 }