mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2024-11-22 15:40:19 +00:00
AK: Invalidate overlong UTF-8 code point encodings
For example, the code point U+002F could be encoded as an overlong UTF-8 sequence with the bytes 0xC0 0xAF. This trick has historically been used to bypass security checks.
This commit is contained in:
parent
796a615bc1
commit
c4d78c29a2
Notes:
sideshowbarker
2024-07-17 10:08:28 +09:00
Author: https://github.com/trflynn89 Commit: https://github.com/SerenityOS/serenity/commit/c4d78c29a2 Pull-request: https://github.com/SerenityOS/serenity/pull/17699 Reviewed-by: https://github.com/linusg ✅
2 changed files with 58 additions and 6 deletions
|
@ -7,7 +7,6 @@
|
||||||
|
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include <AK/CharacterTypes.h>
|
|
||||||
#include <AK/DeprecatedString.h>
|
#include <AK/DeprecatedString.h>
|
||||||
#include <AK/Format.h>
|
#include <AK/Format.h>
|
||||||
#include <AK/StringView.h>
|
#include <AK/StringView.h>
|
||||||
|
@ -142,7 +141,7 @@ public:
|
||||||
code_point |= code_point_bits;
|
code_point |= code_point_bits;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!is_unicode(code_point))
|
if (!is_valid_code_point(code_point, byte_length))
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
valid_bytes += byte_length;
|
valid_bytes += byte_length;
|
||||||
|
@ -162,13 +161,15 @@ private:
|
||||||
size_t byte_length { 0 };
|
size_t byte_length { 0 };
|
||||||
u8 encoding_bits { 0 };
|
u8 encoding_bits { 0 };
|
||||||
u8 encoding_mask { 0 };
|
u8 encoding_mask { 0 };
|
||||||
|
u32 first_code_point { 0 };
|
||||||
|
u32 last_code_point { 0 };
|
||||||
};
|
};
|
||||||
|
|
||||||
static constexpr Array<Utf8EncodedByteData, 4> utf8_encoded_byte_data { {
|
static constexpr Array<Utf8EncodedByteData, 4> utf8_encoded_byte_data { {
|
||||||
{ 1, 0b0000'0000, 0b1000'0000 },
|
{ 1, 0b0000'0000, 0b1000'0000, 0x0000, 0x007F },
|
||||||
{ 2, 0b1100'0000, 0b1110'0000 },
|
{ 2, 0b1100'0000, 0b1110'0000, 0x0080, 0x07FF },
|
||||||
{ 3, 0b1110'0000, 0b1111'0000 },
|
{ 3, 0b1110'0000, 0b1111'0000, 0x0800, 0xFFFF },
|
||||||
{ 4, 0b1111'0000, 0b1111'1000 },
|
{ 4, 0b1111'0000, 0b1111'1000, 0x10000, 0x10FFFF },
|
||||||
} };
|
} };
|
||||||
|
|
||||||
struct LeadingByte {
|
struct LeadingByte {
|
||||||
|
@ -208,6 +209,16 @@ private:
|
||||||
return { .is_valid = false };
|
return { .is_valid = false };
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Returns true only when `code_point` lies inside one of the ranges listed in
// utf8_encoded_byte_data and `byte_length` equals the byte length recorded for that range.
// A code point that was decoded from more bytes than its range calls for (an overlong
// encoding) is therefore rejected, as is any code point outside every listed range.
// NOTE(review): the 3-byte range (0x0800..0xFFFF) also covers the surrogate values
// 0xD800..0xDFFF, so those pass this check - confirm that is intended.
static constexpr bool is_valid_code_point(u32 code_point, size_t byte_length)
|
||||||
|
{
|
||||||
|
for (auto const& data : utf8_encoded_byte_data) {
|
||||||
|
// The first range that contains the code point decides the result.
if (code_point >= data.first_code_point && code_point <= data.last_code_point)
|
||||||
|
return byte_length == data.byte_length;
|
||||||
|
}
|
||||||
|
|
||||||
|
// No listed range contains the code point.
return false;
|
||||||
|
}
|
||||||
|
|
||||||
StringView m_string;
|
StringView m_string;
|
||||||
mutable size_t m_length { 0 };
|
mutable size_t m_length { 0 };
|
||||||
mutable bool m_have_length { false };
|
mutable bool m_have_length { false };
|
||||||
|
|
|
@ -82,6 +82,47 @@ TEST_CASE(validate_invalid_ut8)
|
||||||
EXPECT(valid_bytes == 0);
|
EXPECT(valid_bytes == 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Checks that Utf8View::validate() rejects overlong encodings.
// Every input begins with two single-byte characters (42, 35) followed by one overlong
// sequence, so validation is expected to fail and report exactly 2 valid bytes.
TEST_CASE(validate_overlong_utf8)
|
||||||
|
{
|
||||||
|
size_t valid_bytes = 0;
|
||||||
|
|
||||||
|
// Overlong 2-byte encoding of U+002F
|
||||||
|
char invalid_utf8_1[] = { 42, 35, static_cast<char>(0xc0), static_cast<char>(0xaf) };
|
||||||
|
Utf8View utf8_1 { StringView { invalid_utf8_1, sizeof(invalid_utf8_1) } };
|
||||||
|
EXPECT(!utf8_1.validate(valid_bytes));
|
||||||
|
EXPECT(valid_bytes == 2);
|
||||||
|
|
||||||
|
// Overlong 3-byte encoding of U+002F
|
||||||
|
char invalid_utf8_2[] = { 42, 35, static_cast<char>(0xe0), static_cast<char>(0x80), static_cast<char>(0xaf) };
|
||||||
|
Utf8View utf8_2 { StringView { invalid_utf8_2, sizeof(invalid_utf8_2) } };
|
||||||
|
EXPECT(!utf8_2.validate(valid_bytes));
|
||||||
|
EXPECT(valid_bytes == 2);
|
||||||
|
|
||||||
|
// Overlong 4-byte encoding of U+002F
|
||||||
|
char invalid_utf8_3[] = { 42, 35, static_cast<char>(0xf0), static_cast<char>(0x80), static_cast<char>(0x80), static_cast<char>(0xaf) };
|
||||||
|
Utf8View utf8_3 { StringView { invalid_utf8_3, sizeof(invalid_utf8_3) } };
|
||||||
|
EXPECT(!utf8_3.validate(valid_bytes));
|
||||||
|
EXPECT(valid_bytes == 2);
|
||||||
|
|
||||||
|
// Overlong 3-byte encoding of U+00FF
|
||||||
|
char invalid_utf8_4[] = { 42, 35, static_cast<char>(0xe0), static_cast<char>(0x83), static_cast<char>(0xbf) };
|
||||||
|
Utf8View utf8_4 { StringView { invalid_utf8_4, sizeof(invalid_utf8_4) } };
|
||||||
|
EXPECT(!utf8_4.validate(valid_bytes));
|
||||||
|
EXPECT(valid_bytes == 2);
|
||||||
|
|
||||||
|
// Overlong 4-byte encoding of U+00FF
|
||||||
|
char invalid_utf8_5[] = { 42, 35, static_cast<char>(0xf0), static_cast<char>(0x80), static_cast<char>(0x83), static_cast<char>(0xbf) };
|
||||||
|
Utf8View utf8_5 { StringView { invalid_utf8_5, sizeof(invalid_utf8_5) } };
|
||||||
|
EXPECT(!utf8_5.validate(valid_bytes));
|
||||||
|
EXPECT(valid_bytes == 2);
|
||||||
|
|
||||||
|
// Overlong 4-byte encoding of U+FFFF (F0 8F BF BF carries the payload bits 0xFFFF)
|
||||||
|
char invalid_utf8_6[] = { 42, 35, static_cast<char>(0xf0), static_cast<char>(0x8f), static_cast<char>(0xbf), static_cast<char>(0xbf) };
|
||||||
|
Utf8View utf8_6 { StringView { invalid_utf8_6, sizeof(invalid_utf8_6) } };
|
||||||
|
EXPECT(!utf8_6.validate(valid_bytes));
|
||||||
|
EXPECT(valid_bytes == 2);
|
||||||
|
}
|
||||||
|
|
||||||
TEST_CASE(iterate_utf8)
|
TEST_CASE(iterate_utf8)
|
||||||
{
|
{
|
||||||
Utf8View view("Some weird characters \u00A9\u266A\uA755"sv);
|
Utf8View view("Some weird characters \u00A9\u266A\uA755"sv);
|
||||||
|
|
Loading…
Reference in a new issue