mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2024-11-25 09:00:22 +00:00
AK: Explicitly check for null data in Utf16View
The underlying CPU-specific instructions for operating on UTF-16 strings behave differently for null inputs. Add an explicit check for this state for consistency.
This commit is contained in:
parent
144452d638
commit
74d644a216
Notes:
github-actions[bot]
2024-07-21 17:58:00 +00:00
Author: https://github.com/trflynn89 Commit: https://github.com/LadybirdBrowser/ladybird/commit/74d644a2168 Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/734 Reviewed-by: https://github.com/jamierocks ✅ Reviewed-by: https://github.com/tcl3 ✅
3 changed files with 34 additions and 0 deletions
|
@@ -49,6 +49,8 @@ ErrorOr<String> String::from_utf16(Utf16View const& utf16)
|
|||
{
|
||||
if (!utf16.validate())
|
||||
return Error::from_string_literal("String::from_utf16: Input was not valid UTF-16");
|
||||
if (utf16.is_empty())
|
||||
return String {};
|
||||
|
||||
String result;
|
||||
|
||||
|
|
|
@@ -60,6 +60,8 @@ ErrorOr<Utf16Data> utf8_to_utf16(Utf8View const& utf8_view, Endianness endiannes
|
|||
// All callers want to allow lonely surrogates, which simdutf does not permit.
|
||||
if (!utf8_view.validate(Utf8View::AllowSurrogates::No)) [[unlikely]]
|
||||
return to_utf16_slow(utf8_view, endianness);
|
||||
if (utf8_view.is_empty())
|
||||
return Utf16Data {};
|
||||
|
||||
auto const* data = reinterpret_cast<char const*>(utf8_view.bytes());
|
||||
auto length = utf8_view.byte_length();
|
||||
|
@@ -85,6 +87,9 @@ ErrorOr<Utf16Data> utf8_to_utf16(Utf8View const& utf8_view, Endianness endiannes
|
|||
|
||||
ErrorOr<Utf16Data> utf32_to_utf16(Utf32View const& utf32_view, Endianness endianness)
|
||||
{
|
||||
if (utf32_view.is_empty())
|
||||
return Utf16Data {};
|
||||
|
||||
auto const* data = reinterpret_cast<char32_t const*>(utf32_view.code_points());
|
||||
auto length = utf32_view.length();
|
||||
|
||||
|
@@ -288,6 +293,10 @@ bool Utf16View::starts_with(Utf16View const& needle) const
|
|||
|
||||
bool Utf16View::validate() const
|
||||
{
|
||||
// FIXME: The CPU-specific implementations behave differently on null inputs. We treat null views as an empty string.
|
||||
if (is_empty())
|
||||
return true;
|
||||
|
||||
switch (m_endianness) {
|
||||
case Endianness::Host:
|
||||
return simdutf::validate_utf16(char_data(), length_in_code_units());
|
||||
|
@@ -301,6 +310,12 @@ bool Utf16View::validate() const
|
|||
|
||||
bool Utf16View::validate(size_t& valid_code_units) const
|
||||
{
|
||||
// FIXME: The CPU-specific implementations behave differently on null inputs. We treat null views as an empty string.
|
||||
if (is_empty()) {
|
||||
valid_code_units = 0;
|
||||
return true;
|
||||
}
|
||||
|
||||
auto result = [&]() {
|
||||
switch (m_endianness) {
|
||||
case Endianness::Host:
|
||||
|
@@ -319,6 +334,10 @@ bool Utf16View::validate(size_t& valid_code_units) const
|
|||
|
||||
size_t Utf16View::calculate_length_in_code_points() const
|
||||
{
|
||||
// FIXME: The CPU-specific implementations behave differently on null inputs. We treat null views as an empty string.
|
||||
if (is_empty())
|
||||
return 0;
|
||||
|
||||
// FIXME: simdutf's code point length method assumes valid UTF-16, whereas Utf16View uses U+FFFD as a replacement
|
||||
// for invalid code points. If we change Utf16View to only accept valid encodings as an invariant, we can
|
||||
// remove this branch.
|
||||
|
|
|
@@ -89,6 +89,19 @@ TEST_CASE(decode_utf16)
|
|||
EXPECT_EQ(i, expected.size());
|
||||
}
|
||||
|
||||
TEST_CASE(null_view)
|
||||
{
|
||||
Utf16View view;
|
||||
EXPECT(view.validate());
|
||||
EXPECT_EQ(view.length_in_code_units(), 0zu);
|
||||
EXPECT_EQ(view.length_in_code_points(), 0zu);
|
||||
EXPECT_EQ(MUST(view.to_utf8(Utf16View::AllowInvalidCodeUnits::No)), ""sv);
|
||||
EXPECT_EQ(MUST(view.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes)), ""sv);
|
||||
|
||||
for ([[maybe_unused]] auto it : view)
|
||||
FAIL("Iterating a null UTF-16 string should not produce any values");
|
||||
}
|
||||
|
||||
TEST_CASE(utf16_literal)
|
||||
{
|
||||
{
|
||||
|
|
Loading…
Reference in a new issue