AK: Explicitly check for null data in Utf16View

The underlying CPU-specific instructions for operating on UTF-16 strings
behave differently for null inputs. Add an explicit check for this state
for consistency.
This commit is contained in:
Timothy Flynn 2024-07-20 07:31:19 -04:00 committed by Andreas Kling
parent 144452d638
commit 74d644a216
Notes: github-actions[bot] 2024-07-21 17:58:00 +00:00
3 changed files with 34 additions and 0 deletions

View file

@ -49,6 +49,8 @@ ErrorOr<String> String::from_utf16(Utf16View const& utf16)
{
if (!utf16.validate())
return Error::from_string_literal("String::from_utf16: Input was not valid UTF-16");
if (utf16.is_empty())
return String {};
String result;

View file

@ -60,6 +60,8 @@ ErrorOr<Utf16Data> utf8_to_utf16(Utf8View const& utf8_view, Endianness endiannes
// All callers want to allow lonely surrogates, which simdutf does not permit.
if (!utf8_view.validate(Utf8View::AllowSurrogates::No)) [[unlikely]]
return to_utf16_slow(utf8_view, endianness);
if (utf8_view.is_empty())
return Utf16Data {};
auto const* data = reinterpret_cast<char const*>(utf8_view.bytes());
auto length = utf8_view.byte_length();
@ -85,6 +87,9 @@ ErrorOr<Utf16Data> utf8_to_utf16(Utf8View const& utf8_view, Endianness endiannes
ErrorOr<Utf16Data> utf32_to_utf16(Utf32View const& utf32_view, Endianness endianness)
{
if (utf32_view.is_empty())
return Utf16Data {};
auto const* data = reinterpret_cast<char32_t const*>(utf32_view.code_points());
auto length = utf32_view.length();
@ -288,6 +293,10 @@ bool Utf16View::starts_with(Utf16View const& needle) const
bool Utf16View::validate() const
{
// FIXME: The CPU-specific implementations behave differently on null inputs. We treat null views as an empty string.
if (is_empty())
return true;
switch (m_endianness) {
case Endianness::Host:
return simdutf::validate_utf16(char_data(), length_in_code_units());
@ -301,6 +310,12 @@ bool Utf16View::validate() const
bool Utf16View::validate(size_t& valid_code_units) const
{
// FIXME: The CPU-specific implementations behave differently on null inputs. We treat null views as an empty string.
if (is_empty()) {
valid_code_units = 0;
return true;
}
auto result = [&]() {
switch (m_endianness) {
case Endianness::Host:
@ -319,6 +334,10 @@ bool Utf16View::validate(size_t& valid_code_units) const
size_t Utf16View::calculate_length_in_code_points() const
{
// FIXME: The CPU-specific implementations behave differently on null inputs. We treat null views as an empty string.
if (is_empty())
return 0;
// FIXME: simdutf's code point length method assumes valid UTF-16, whereas Utf16View uses U+FFFD as a replacement
// for invalid code points. If we change Utf16View to only accept valid encodings as an invariant, we can
// remove this branch.

View file

@ -89,6 +89,19 @@ TEST_CASE(decode_utf16)
EXPECT_EQ(i, expected.size());
}
// A default-constructed Utf16View has no backing data. This test checks that
// such a view validates, reports zero length in both code units and code
// points, converts to an empty UTF-8 string, and yields nothing on iteration.
TEST_CASE(null_view)
{
    Utf16View null_utf16;

    // Validation of a view without data is expected to succeed.
    EXPECT(null_utf16.validate());

    // Both length queries are expected to report zero.
    EXPECT_EQ(null_utf16.length_in_code_units(), 0zu);
    EXPECT_EQ(null_utf16.length_in_code_points(), 0zu);

    // Conversion to UTF-8 should produce an empty string under either
    // invalid-code-unit policy.
    auto strict_utf8 = MUST(null_utf16.to_utf8(Utf16View::AllowInvalidCodeUnits::No));
    EXPECT_EQ(strict_utf8, ""sv);

    auto lenient_utf8 = MUST(null_utf16.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes));
    EXPECT_EQ(lenient_utf8, ""sv);

    // The loop body must never run; reaching it fails the test.
    for ([[maybe_unused]] auto code_point : null_utf16) {
        FAIL("Iterating a null UTF-16 string should not produce any values");
    }
}
TEST_CASE(utf16_literal)
{
{