Преглед на файлове

AK: Invalidate UTF-8 encoded code points larger than U+10ffff

On oss-fuzz, the LibJS REPL is provided a file encoded with Windows-1252
with the following contents:

    /ô¡°½/

In Windows-1252, the above is encoded as the bytes
[0x2f 0xf4 0xa1 0xb0 0xbd 0x2f], but the REPL assumes the input file is
UTF-8. The inner 4 bytes form a structurally well-formed UTF-8 sequence
if we only look at the most significant bits to parse leading and
continuation bytes. However, the sequence decodes to U+121c3d, which is
larger than U+10ffff and therefore not a valid code point.

This commit adds validation to ensure the decoded code point itself is
also valid.
Timothy Flynn преди 3 години
родител
ревизия
9e5abec6f1
променени са 2 файла, в които са добавени 20 реда и са изтрити 3 реда
  1. 10 3
      AK/Utf8View.cpp
  2. 10 0
      Tests/AK/TestUtf8.cpp

+ 10 - 3
AK/Utf8View.cpp

@@ -6,6 +6,7 @@
  */
 
 #include <AK/Assertions.h>
+#include <AK/CharacterTypes.h>
 #include <AK/Format.h>
 #include <AK/Utf8View.h>
 
@@ -100,9 +101,9 @@ bool Utf8View::validate(size_t& valid_bytes) const
 {
     valid_bytes = 0;
     for (auto ptr = begin_ptr(); ptr < end_ptr(); ptr++) {
-        size_t code_point_length_in_bytes;
-        u32 value;
-        bool first_byte_makes_sense = decode_first_byte(*ptr, code_point_length_in_bytes, value);
+        size_t code_point_length_in_bytes = 0;
+        u32 code_point = 0;
+        bool first_byte_makes_sense = decode_first_byte(*ptr, code_point_length_in_bytes, code_point);
         if (!first_byte_makes_sense)
             return false;
 
@@ -112,8 +113,14 @@ bool Utf8View::validate(size_t& valid_bytes) const
                 return false;
             if (*ptr >> 6 != 2)
                 return false;
+
+            code_point <<= 6;
+            code_point |= *ptr & 63;
         }
 
+        if (!is_unicode(code_point))
+            return false;
+
         valid_bytes += code_point_length_in_bytes;
     }
 

+ 10 - 0
Tests/AK/TestUtf8.cpp

@@ -70,6 +70,16 @@ TEST_CASE(validate_invalid_ut8)
     Utf8View utf8_4 { StringView { invalid_utf8_4 } };
     EXPECT(!utf8_4.validate(valid_bytes));
     EXPECT(valid_bytes == 0);
+
+    char invalid_utf8_5[] = { (char)0xf4, (char)0x8f, (char)0xbf, (char)0xc0, 0 }; // U+110000
+    Utf8View utf8_5 { StringView { invalid_utf8_5 } };
+    EXPECT(!utf8_5.validate(valid_bytes));
+    EXPECT(valid_bytes == 0);
+
+    char invalid_utf8_6[] = { (char)0xf4, (char)0xa1, (char)0xb0, (char)0xbd, 0 }; // U+121c3d
+    Utf8View utf8_6 { StringView { invalid_utf8_6 } };
+    EXPECT(!utf8_6.validate(valid_bytes));
+    EXPECT(valid_bytes == 0);
 }
 
 TEST_CASE(iterate_utf8)