ladybird/Tests/AK/TestUtf8.cpp

/*
 * Copyright (c) 2019-2020, Sergey Bugaev <bugaevc@serenityos.org>
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */

#include <LibTest/TestCase.h>

#include <AK/ByteBuffer.h>
#include <AK/Utf8View.h>

// Pure ASCII input must validate and decode to exactly one code point per character.
TEST_CASE(decode_ascii)
{
    Utf8View ascii_view { "Hello World!11"sv };
    EXPECT(ascii_view.validate());

    // Code points of "Hello World!11", in order.
    u32 wanted[] = { 72, 101, 108, 108, 111, 32, 87, 111, 114, 108, 100, 33, 49, 49 };
    size_t wanted_count = sizeof(wanted) / sizeof(wanted[0]);

    size_t seen = 0;
    for (auto it = ascii_view.begin(); it != ascii_view.end(); ++it) {
        // Abort instead of indexing past the table if the view yields too many code points.
        VERIFY(seen < wanted_count);
        u32 decoded = *it;
        EXPECT_EQ(decoded, wanted[seen]);
        ++seen;
    }

    // Too few code points is a failure as well.
    EXPECT_EQ(seen, wanted_count);
}

// Validates and decodes a string mixing 1-, 2-, 3- and 4-byte sequences, checking both the
// decoded code points and the raw bytes each code point was decoded from.
TEST_CASE(decode_utf8)
{
    Utf8View utf8 { "Привет, мир! 😀 γειά σου κόσμος こんにちは世界"sv };

    // Initialized so the comparison below never reads an indeterminate value.
    size_t valid_bytes = 0;
    EXPECT(utf8.validate(valid_bytes));
    // A fully valid string must report every byte as valid.
    EXPECT_EQ(valid_bytes, utf8.byte_length());

    u32 expected[] = { 1055, 1088, 1080, 1074, 1077, 1090, 44, 32, 1084, 1080, 1088, 33, 32, 128512, 32, 947, 949, 953, 940, 32, 963, 959, 965, 32, 954, 972, 963, 956, 959, 962, 32, 12371, 12435, 12395, 12385, 12399, 19990, 30028 };
    ByteString expected_underlying_bytes[] = { "П", "р", "и", "в", "е", "т", ",", " ", "м", "и", "р", "!", " ", "😀", " ", "γ", "ε", "ι", "ά", " ", "σ", "ο", "υ", " ", "κ", "ό", "σ", "μ", "ο", "ς", " ", "こ", "ん", "に", "ち", "は", "世", "界" };
    constexpr size_t expected_size = sizeof(expected) / sizeof(expected[0]);

    // Both tables are indexed with the same counter, so they have to be the same length.
    static_assert(sizeof(expected_underlying_bytes) / sizeof(expected_underlying_bytes[0]) == expected_size,
        "Every expected code point needs a matching expected byte sequence");

    size_t i = 0;
    for (auto it = utf8.begin(); it != utf8.end(); ++it) {
        u32 code_point = *it;
        // Abort instead of indexing past the tables if the view yields too many code points.
        VERIFY(i < expected_size);
        EXPECT_EQ(code_point, expected[i]);
        EXPECT_EQ(it.underlying_code_point_bytes(), expected_underlying_bytes[i].bytes());
        i++;
    }
    EXPECT_EQ(i, expected_size);
}

// A default-constructed view must validate, report zero lengths, and yield nothing when iterated.
TEST_CASE(null_view)
{
    Utf8View null_utf8;

    // Validation succeeds whether or not surrogates are allowed.
    EXPECT(null_utf8.validate(Utf8View::AllowSurrogates::No));
    EXPECT(null_utf8.validate(Utf8View::AllowSurrogates::Yes));

    // Neither bytes nor code points are reported.
    EXPECT_EQ(null_utf8.byte_length(), 0zu);
    EXPECT_EQ(null_utf8.length(), 0zu);

    for ([[maybe_unused]] auto code_point : null_utf8) {
        FAIL("Iterating a null UTF-8 string should not produce any values");
    }
}

// Checks that validate() rejects malformed input and reports, through its out-parameter,
// how many leading bytes were still valid.
TEST_CASE(validate_invalid_ut8)
{
    // Initialized so the expectations below never read an indeterminate value.
    size_t valid_bytes = 0;

    // Stray continuation byte (0xb6) after two ASCII bytes.
    char invalid_utf8_1[] = { 42, 35, static_cast<char>(182), 9 };
    Utf8View utf8_1 { StringView { invalid_utf8_1, sizeof(invalid_utf8_1) } };
    EXPECT(!utf8_1.validate(valid_bytes));
    EXPECT_EQ(valid_bytes, 2zu);

    // Two-byte lead (0xd0) followed by another lead byte instead of a continuation byte.
    char invalid_utf8_2[] = { 42, 35, static_cast<char>(208), static_cast<char>(208) };
    Utf8View utf8_2 { StringView { invalid_utf8_2, sizeof(invalid_utf8_2) } };
    EXPECT(!utf8_2.validate(valid_bytes));
    EXPECT_EQ(valid_bytes, 2zu);

    // Two-byte lead (0xd0) with the input ending before its continuation byte.
    char invalid_utf8_3[] = { static_cast<char>(208) };
    Utf8View utf8_3 { StringView { invalid_utf8_3, sizeof(invalid_utf8_3) } };
    EXPECT(!utf8_3.validate(valid_bytes));
    EXPECT_EQ(valid_bytes, 0zu);

    // Two-byte lead (0xd0) followed by an ASCII byte instead of a continuation byte.
    char invalid_utf8_4[] = { static_cast<char>(208), 35 };
    Utf8View utf8_4 { StringView { invalid_utf8_4, sizeof(invalid_utf8_4) } };
    EXPECT(!utf8_4.validate(valid_bytes));
    EXPECT_EQ(valid_bytes, 0zu);

    // F4 8F BF BF is U+10FFFF; bumping the last byte to 0xc0 makes it an invalid continuation byte.
    // (This is not the encoding of U+110000, which would be F4 90 80 80.)
    char invalid_utf8_5[] = { static_cast<char>(0xf4), static_cast<char>(0x8f), static_cast<char>(0xbf), static_cast<char>(0xc0) };
    Utf8View utf8_5 { StringView { invalid_utf8_5, sizeof(invalid_utf8_5) } };
    EXPECT(!utf8_5.validate(valid_bytes));
    EXPECT_EQ(valid_bytes, 0zu);

    // Structurally well-formed four-byte sequence whose value (U+121c3d) is above U+10FFFF.
    char invalid_utf8_6[] = { static_cast<char>(0xf4), static_cast<char>(0xa1), static_cast<char>(0xb0), static_cast<char>(0xbd) };
    Utf8View utf8_6 { StringView { invalid_utf8_6, sizeof(invalid_utf8_6) } };
    EXPECT(!utf8_6.validate(valid_bytes));
    EXPECT_EQ(valid_bytes, 0zu);

    // Three-byte encoding of the surrogate U+d800, rejected when surrogates are not allowed.
    char invalid_utf8_7[] = { static_cast<char>(0xed), static_cast<char>(0xa0), static_cast<char>(0x80) };
    Utf8View utf8_7 { StringView { invalid_utf8_7, sizeof(invalid_utf8_7) } };
    EXPECT(!utf8_7.validate(valid_bytes, Utf8View::AllowSurrogates::No));
    EXPECT_EQ(valid_bytes, 0zu);
}

// Overlong encodings (a code point spelled with more bytes than necessary) must fail validation.
// Every input starts with the two ASCII bytes "*#", so exactly two bytes are expected to be valid.
TEST_CASE(validate_overlong_utf8)
{
    size_t valid_bytes = 0;

    // U+002F spelled with two bytes.
    {
        char bytes[] = { '*', '#', static_cast<char>(0xc0), static_cast<char>(0xaf) };
        Utf8View view { StringView { bytes, sizeof(bytes) } };
        EXPECT(!view.validate(valid_bytes));
        EXPECT(valid_bytes == 2);
    }

    // U+002F spelled with three bytes.
    {
        char bytes[] = { '*', '#', static_cast<char>(0xe0), static_cast<char>(0x80), static_cast<char>(0xaf) };
        Utf8View view { StringView { bytes, sizeof(bytes) } };
        EXPECT(!view.validate(valid_bytes));
        EXPECT(valid_bytes == 2);
    }

    // U+002F spelled with four bytes.
    {
        char bytes[] = { '*', '#', static_cast<char>(0xf0), static_cast<char>(0x80), static_cast<char>(0x80), static_cast<char>(0xaf) };
        Utf8View view { StringView { bytes, sizeof(bytes) } };
        EXPECT(!view.validate(valid_bytes));
        EXPECT(valid_bytes == 2);
    }

    // U+00FF spelled with three bytes.
    {
        char bytes[] = { '*', '#', static_cast<char>(0xe0), static_cast<char>(0x83), static_cast<char>(0xbf) };
        Utf8View view { StringView { bytes, sizeof(bytes) } };
        EXPECT(!view.validate(valid_bytes));
        EXPECT(valid_bytes == 2);
    }

    // U+00FF spelled with four bytes.
    {
        char bytes[] = { '*', '#', static_cast<char>(0xf0), static_cast<char>(0x80), static_cast<char>(0x83), static_cast<char>(0xbf) };
        Utf8View view { StringView { bytes, sizeof(bytes) } };
        EXPECT(!view.validate(valid_bytes));
        EXPECT(valid_bytes == 2);
    }

    // U+FFFF spelled with four bytes (payload bits 001111 111111 111111).
    {
        char bytes[] = { '*', '#', static_cast<char>(0xf0), static_cast<char>(0x8f), static_cast<char>(0xbf), static_cast<char>(0xbf) };
        Utf8View view { StringView { bytes, sizeof(bytes) } };
        EXPECT(!view.validate(valid_bytes));
        EXPECT(valid_bytes == 2);
    }
}

// Steps an iterator through 22 ASCII characters followed by U+00A9, U+266A and U+A755 (25 code
// points in total), checking dereference, peek() lookahead, done(), and the crash on
// dereferencing an exhausted iterator.
TEST_CASE(iterate_utf8)
{
    Utf8View text { "Some weird characters \u00A9\u266A\uA755"sv };
    Utf8CodePointIterator cursor = text.begin();

    // At code point 0: peek() and peek(0) both report the current code point.
    EXPECT(*cursor == 'S');
    EXPECT(cursor.peek().has_value() && cursor.peek().value() == 'S');
    EXPECT(cursor.peek(0).has_value() && cursor.peek(0).value() == 'S');
    EXPECT(cursor.peek(1).has_value() && cursor.peek(1).value() == 'o');
    EXPECT(cursor.peek(22).has_value() && cursor.peek(22).value() == 0x00A9);
    EXPECT(cursor.peek(24).has_value() && cursor.peek(24).value() == 0xA755);
    // Offset 25 is one past the last code point.
    EXPECT(!cursor.peek(25).has_value());

    ++cursor;

    // At code point 1: lookahead offsets are relative to the current position.
    EXPECT(*cursor == 'o');
    EXPECT(cursor.peek(23).has_value() && cursor.peek(23).value() == 0xA755);

    // Advance 23 more times to land on the final code point (index 24).
    for (size_t remaining = 23; remaining > 0; --remaining)
        ++cursor;

    EXPECT(!cursor.done());
    EXPECT(*cursor == 0xA755);
    EXPECT(cursor.peek().has_value() && cursor.peek().value() == 0xA755);
    EXPECT(!cursor.peek(1).has_value());

    ++cursor;

    // Past the end: nothing left to peek at, and dereferencing is expected to crash.
    EXPECT(cursor.done());
    EXPECT(!cursor.peek(0).has_value());
    EXPECT_CRASH("Dereferencing Utf8CodePointIterator which is already done.", [&cursor] {
        *cursor;
        return Test::Crash::Failure::DidNotCrash;
    });
}

// Iterating over malformed input must not crash: each invalid stretch is expected to decode as
// U+FFFD REPLACEMENT CHARACTER, while underlying_code_point_bytes() still exposes the original bytes.
TEST_CASE(decode_invalid_ut8)
{
    // Test case 1 : Getting an extension byte as first byte of the code point
    {
        char raw_data[] = { 'a', 'b', static_cast<char>(0xA0), 'd' };
        Utf8View view { StringView { raw_data, sizeof(raw_data) } };
        u32 expected_characters[] = { 'a', 'b', 0xFFFD, 'd' };
        ByteString expected_underlying_bytes[] = { "a", "b", "\xA0", "d" };
        size_t expected_size = sizeof(expected_characters) / sizeof(expected_characters[0]);
        size_t i = 0;
        for (auto it = view.begin(); it != view.end(); ++it) {
            u32 code_point = *it;
            // Abort instead of indexing past the tables if the view yields too many code points.
            VERIFY(i < expected_size);
            EXPECT_EQ(code_point, expected_characters[i]);
            EXPECT_EQ(it.underlying_code_point_bytes(), expected_underlying_bytes[i].bytes());
            i++;
        }
        // Report a wrong count as a test failure rather than aborting the whole test binary.
        EXPECT_EQ(i, expected_size);
    }

    // Test case 2 : Getting a non-extension byte when an extension byte is expected
    {
        char raw_data[] = { 'a', 'b', static_cast<char>(0xC0), 'd', 'e' };
        Utf8View view { StringView { raw_data, sizeof(raw_data) } };
        u32 expected_characters[] = { 'a', 'b', 0xFFFD, 'd', 'e' };
        ByteString expected_underlying_bytes[] = { "a", "b", "\xC0", "d", "e" };
        size_t expected_size = sizeof(expected_characters) / sizeof(expected_characters[0]);
        size_t i = 0;
        for (auto it = view.begin(); it != view.end(); ++it) {
            u32 code_point = *it;
            VERIFY(i < expected_size);
            EXPECT_EQ(code_point, expected_characters[i]);
            EXPECT_EQ(it.underlying_code_point_bytes(), expected_underlying_bytes[i].bytes());
            i++;
        }
        EXPECT_EQ(i, expected_size);
    }

    // Test case 3 : Not enough bytes before the end of the string
    // NOTE(review): 0x90 matches the continuation-byte pattern 10xxxxxx, so this input is a stray
    // continuation byte like test case 1 rather than a truncated multi-byte sequence; a lead byte
    // (e.g. 0xE0) would be needed to exercise truncation. Confirm the intended input.
    {
        char raw_data[] = { 'a', 'b', static_cast<char>(0x90), 'd' };
        Utf8View view { StringView { raw_data, sizeof(raw_data) } };
        u32 expected_characters[] = { 'a', 'b', 0xFFFD, 'd' };
        ByteString expected_underlying_bytes[] = { "a", "b", "\x90", "d" };
        size_t expected_size = sizeof(expected_characters) / sizeof(expected_characters[0]);
        size_t i = 0;
        for (auto it = view.begin(); it != view.end(); ++it) {
            u32 code_point = *it;
            VERIFY(i < expected_size);
            EXPECT_EQ(code_point, expected_characters[i]);
            EXPECT_EQ(it.underlying_code_point_bytes(), expected_underlying_bytes[i].bytes());
            i++;
        }
        EXPECT_EQ(i, expected_size);
    }

    // Test case 4 : Not enough bytes at the end of the string
    // NOTE(review): as in test case 3, 0x90 is a continuation byte, not a lead byte. Confirm the
    // intended input.
    {
        char raw_data[] = { 'a', 'b', 'c', static_cast<char>(0x90) };
        Utf8View view { StringView { raw_data, sizeof(raw_data) } };
        u32 expected_characters[] = { 'a', 'b', 'c', 0xFFFD };
        ByteString expected_underlying_bytes[] = { "a", "b", "c", "\x90" };
        size_t expected_size = sizeof(expected_characters) / sizeof(expected_characters[0]);
        size_t i = 0;
        for (auto it = view.begin(); it != view.end(); ++it) {
            u32 code_point = *it;
            VERIFY(i < expected_size);
            EXPECT_EQ(code_point, expected_characters[i]);
            EXPECT_EQ(it.underlying_code_point_bytes(), expected_underlying_bytes[i].bytes());
            i++;
        }
        EXPECT_EQ(i, expected_size);
    }

    // Test case 5 : Oversized four-byte sequence (e.g. U+123456)
    {
        // Want to encode: (000)1 0010 0011 0100 0101 0110
        // Into mask: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        // Shifted:        100   100011   010001   010110
        // Result:    11110100 10100011 10010001 10010110
        char raw_data[] = { 'a', static_cast<char>(0xF4), static_cast<char>(0xA3), static_cast<char>(0x91), static_cast<char>(0x96), 'b' };
        Utf8View view { StringView { raw_data, sizeof(raw_data) } };
        // This definition seems to suggest that we should instead output multiple replacement characters:
        // https://encoding.spec.whatwg.org/#ref-for-concept-stream-prepend②
        // This is supported by the plaintext description and example collection, which annoyingly does not give an example of how to deal with this:
        // https://www.unicode.org/versions/Unicode14.0.0/ch03.pdf , section "U+FFFD Substitution of Maximal Subparts"
        // However, that would go against how we deal with several other kinds of errors, so we stick to emitting only one U+FFFD.
        u32 expected_characters[] = { 'a', 0xFFFD, 'b' };
        ByteString expected_underlying_bytes[] = { "a", "\xF4\xA3\x91\x96", "b" };
        size_t expected_size = sizeof(expected_characters) / sizeof(expected_characters[0]);
        size_t i = 0;
        for (auto it = view.begin(); it != view.end(); ++it) {
            u32 code_point = *it;
            VERIFY(i < expected_size);
            EXPECT_EQ(code_point, expected_characters[i]);
            EXPECT_EQ(it.underlying_code_point_bytes(), expected_underlying_bytes[i].bytes());
            i++;
        }
        EXPECT_EQ(i, expected_size);
    }
}

// Trimming a set of characters (here a single space) from the left, the right, or both ends.
TEST_CASE(trim)
{
    Utf8View spaces { " "sv };

    // No surrounding spaces: every mode leaves the text untouched.
    {
        Utf8View input { "word"sv };
        auto both = input.trim(spaces, TrimMode::Both).as_string();
        EXPECT_EQ(both, "word");
        auto left = input.trim(spaces, TrimMode::Left).as_string();
        EXPECT_EQ(left, "word");
        auto right = input.trim(spaces, TrimMode::Right).as_string();
        EXPECT_EQ(right, "word");
    }

    // Leading spaces only: right-trimming must keep them.
    {
        Utf8View input { "   word"sv };
        auto both = input.trim(spaces, TrimMode::Both).as_string();
        EXPECT_EQ(both, "word");
        auto left = input.trim(spaces, TrimMode::Left).as_string();
        EXPECT_EQ(left, "word");
        auto right = input.trim(spaces, TrimMode::Right).as_string();
        EXPECT_EQ(right, "   word");
    }

    // Trailing spaces only: left-trimming must keep them.
    {
        Utf8View input { "word   "sv };
        auto both = input.trim(spaces, TrimMode::Both).as_string();
        EXPECT_EQ(both, "word");
        auto left = input.trim(spaces, TrimMode::Left).as_string();
        EXPECT_EQ(left, "word   ");
        auto right = input.trim(spaces, TrimMode::Right).as_string();
        EXPECT_EQ(right, "word");
    }

    // Spaces on both sides: each one-sided mode strips only its own side.
    {
        Utf8View input { "   word   "sv };
        auto both = input.trim(spaces, TrimMode::Both).as_string();
        EXPECT_EQ(both, "word");
        auto left = input.trim(spaces, TrimMode::Left).as_string();
        EXPECT_EQ(left, "word   ");
        auto right = input.trim(spaces, TrimMode::Right).as_string();
        EXPECT_EQ(right, "   word");
    }

    // U+180E is not in the set of characters being trimmed, so it must survive every mode.
    {
        Utf8View input { "\u180E"sv };
        auto both = input.trim(spaces, TrimMode::Both).as_string();
        EXPECT_EQ(both, "\u180E");
        auto left = input.trim(spaces, TrimMode::Left).as_string();
        EXPECT_EQ(left, "\u180E");
        auto right = input.trim(spaces, TrimMode::Right).as_string();
        EXPECT_EQ(right, "\u180E");
    }
}

// Separator predicate: true only when the code point is '.'.
static bool is_period(u32 code_point)
{
    u32 const period = '.';
    return period == code_point;
}

// Splits a sentence on '.' under each SplitBehavior combination and compares the resulting pieces.
TEST_CASE(for_each_split_view)
{
    Utf8View sentence { "...Well..hello.friends!..."sv };

    // Runs the split with the given behavior and collects every reported piece, in order.
    auto split_with = [&sentence](auto behavior) {
        Vector<StringView> pieces;
        sentence.for_each_split_view(is_period, behavior, [&pieces](auto piece) {
            pieces.append(piece.as_string());
        });
        return pieces;
    };

    // Default: empty pieces are dropped and separators are not part of the pieces.
    auto plain = split_with(SplitBehavior::Nothing);
    EXPECT_EQ(plain,
        Vector({ "Well"sv, "hello"sv, "friends!"sv }));

    // KeepEmpty: the empty pieces between adjacent separators are reported too.
    auto with_empty = split_with(SplitBehavior::KeepEmpty);
    EXPECT_EQ(with_empty,
        Vector({ ""sv, ""sv, ""sv, "Well"sv, ""sv, "hello"sv, "friends!"sv, ""sv, ""sv, ""sv }));

    // KeepTrailingSeparator: each piece carries the separator that ended it.
    auto with_separator = split_with(SplitBehavior::KeepTrailingSeparator);
    EXPECT_EQ(with_separator,
        Vector({ "Well."sv, "hello."sv, "friends!."sv }));

    // Both flags combined.
    auto with_empty_and_separator = split_with(SplitBehavior::KeepEmpty | SplitBehavior::KeepTrailingSeparator);
    EXPECT_EQ(with_empty_and_separator,
        Vector({ "."sv, "."sv, "."sv, "Well."sv, "."sv, "hello."sv, "friends!."sv, "."sv, "."sv, ""sv }));
}
-												Meta: Add license header to source files

As suggested by Joshua, this commit adds the 2-clause BSD license as a
comment block to the top of every source file.

For the first pass, I've just added myself for simplicity. I encourage
everyone to add themselves as copyright holders of any file they've
added or modified in some significant way. If I've added myself in
error somewhere, feel free to replace it with the appropriate copyright
holder instead.

Going forward, all new source files should include a license header.

											
										
										
											2020-01-18 08:38:21 +00:00
+								/*
-												Meta: Claim copyright for files created by me

This changes copyright holder to myself for the source code files that I've
created or have (almost) completely rewritten. Not included are the files
that were significantly changed by others even though it was me who originally
created them (think HtmlView), or the many other files I've contributed code to.

											
										
										
											2020-01-24 13:45:29 +00:00
+								 * Copyright (c) 2019-2020, Sergey Bugaev <bugaevc@serenityos.org>
-												Meta: Add license header to source files

As suggested by Joshua, this commit adds the 2-clause BSD license as a
comment block to the top of every source file.

For the first pass, I've just added myself for simplicity. I encourage
everyone to add themselves as copyright holders of any file they've
added or modified in some significant way. If I've added myself in
error somewhere, feel free to replace it with the appropriate copyright
holder instead.

Going forward, all new source files should include a license header.

											
										
										
											2020-01-18 08:38:21 +00:00
+								 *
-												Everything: Move to SPDX license identifiers in all files.

SPDX License Identifiers are a more compact / standardized
way of representing file license information.

See: https://spdx.dev/resources/use/#identifiers

This was done with the `ambr` search and replace tool.

 ambr --no-parent-ignore --key-from-file --rep-from-file key.txt rep.txt *

											
										
										
											2021-04-22 08:24:48 +00:00
+								 * SPDX-License-Identifier: BSD-2-Clause
-												Meta: Add license header to source files

As suggested by Joshua, this commit adds the 2-clause BSD license as a
comment block to the top of every source file.

For the first pass, I've just added myself for simplicity. I encourage
everyone to add themselves as copyright holders of any file they've
added or modified in some significant way. If I've added myself in
error somewhere, feel free to replace it with the appropriate copyright
holder instead.

Going forward, all new source files should include a license header.

											
										
										
											2020-01-18 08:38:21 +00:00
+								 */
-												AK+Userland: Move AK/TestSuite.h into LibTest and rework Tests' CMake

As many macros as possible are moved to Macros.h, while the
macros to create a test case are moved to TestCase.h. TestCase is now
the only user-facing header for creating a test case. TestSuite and its
helpers have moved into a .cpp file. Instead of requiring a TEST_MAIN
macro to be instantiated into the test file, a TestMain.cpp file is
provided instead that will be linked against each test. This has the
side effect that, if we wanted to have test cases split across multiple
files, it's as simple as adding them all to the same executable.

The test main should be portable to kernel mode as well, so if
there's a set of tests that should be run in self-test mode in kernel
space, we can accommodate that.

A new serenity_test CMake function streamlines adding a new test with
arguments for the test source file, subdirectory under /usr/Tests to
install the test application and an optional list of libraries to link
against the test application. To accommodate future tests where the
provided TestMain.cpp is not suitable (e.g. test-js), a CUSTOM_MAIN
parameter can be passed to the function to not link against the
boilerplate main function.

											
										
										
											2021-04-25 05:53:23 +00:00
+								#include <LibTest/TestCase.h>
-												AK: Add a Utf8View type for iterating over UTF-8 codepoints

Utf8View wraps a StringView and implements begin() and end() that
return a Utf8CodepointIterator, which parses UTF-8-encoded Unicode
codepoints and returns them as 32-bit integers.

This is the first step towards supporting emojis in Serenity ^)
https://github.com/SerenityOS/serenity/issues/490

											
										
										
											2019-08-27 21:57:15 +00:00
-												AK: Do not VERIFY on invalid code point bytes in UTF8View

The previous behavior was to always VERIFY that the UTF-8 bytes were
valid when iterating over the code points of an UTF8View. This change
makes it so we instead output the 0xFFFD 'REPLACEMENT CHARACTER'
code point when encountering invalid bytes, and keep iterating the
view after skipping one byte.

Leaving the decision to the consumer would break symmetry with the
UTF32View API, which would in turn require heavy refactoring and/or
code duplication in generic code such as the one found in
Gfx::Painter and the Shell.

To make it easier for the consumers to detect the original bytes, we
provide a new method on the iterator that returns a Span over the
data that has been decoded. This method is immediately used in the
TextNode::compute_text_for_rendering method, which previously did
this in an ad-hoc way.

This also adds tests for the new behavior in TestUtf8.cpp, as well
as reinforcements to the existing tests to check if the underlying
bytes match up with their expected values.

											
										
										
											2021-05-30 16:52:24 +00:00
+								#include <AK/ByteBuffer.h>
-												AK: Add a Utf8View type for iterating over UTF-8 codepoints

Utf8View wraps a StringView and implements begin() and end() that
return a Utf8CodepointIterator, which parses UTF-8-encoded Unicode
codepoints and returns them as 32-bit integers.

This is the first step towards supporting emojis in Serenity ^)
https://github.com/SerenityOS/serenity/issues/490

											
										
										
											2019-08-27 21:57:15 +00:00
+								#include <AK/Utf8View.h>
 								TEST_CASE(decode_ascii)
 								{
-												AK: Make Utf8View constructors inline and remove C string constructor

Using StringView instead of C strings is basically always preferable.
The only reason to use a C string is because you are calling a C API.

											
										
										
											2021-09-18 16:02:41 +00:00
+								    Utf8View utf8 { "Hello World!11"sv };
-												AK: Add a Utf8View type for iterating over UTF-8 codepoints

Utf8View wraps a StringView and implements begin() and end() that
return a Utf8CodepointIterator, which parses UTF-8-encoded Unicode
codepoints and returns them as 32-bit integers.

This is the first step towards supporting emojis in Serenity ^)
https://github.com/SerenityOS/serenity/issues/490

											
										
										
											2019-08-27 21:57:15 +00:00
+								    EXPECT(utf8.validate());
 								    u32 expected[] = { 72, 101, 108, 108, 111, 32, 87, 111, 114, 108, 100, 33, 49, 49 };
 								    size_t expected_size = sizeof(expected) / sizeof(expected[0]);
 								    size_t i = 0;
-												Unicode: Try s/codepoint/code_point/g again

This time, without trailing 's'. Ran:

    git grep -l 'codepoint' | xargs sed -ie 's/codepoint/code_point/g

											
										
										
											2020-08-05 20:31:20 +00:00
+								    for (u32 code_point : utf8) {
-												Everywhere: Rename ASSERT => VERIFY

(...and ASSERT_NOT_REACHED => VERIFY_NOT_REACHED)

Since all of these checks are done in release builds as well,
let's rename them to VERIFY to prevent confusion, as everyone is
used to assertions being compiled out in release.

We can introduce a new ASSERT macro that is specifically for debug
checks, but I'm doing this wholesale conversion first since we've
accumulated thousands of these already, and it's not immediately
obvious which ones are suitable for ASSERT.

											
										
										
											2021-02-23 19:42:32 +00:00
+								        VERIFY(i < expected_size);
-												Unicode: Try s/codepoint/code_point/g again

This time, without trailing 's'. Ran:

    git grep -l 'codepoint' | xargs sed -ie 's/codepoint/code_point/g

											
										
										
											2020-08-05 20:31:20 +00:00
+								        EXPECT_EQ(code_point, expected[i]);
-												AK: Add a Utf8View type for iterating over UTF-8 codepoints

Utf8View wraps a StringView and implements begin() and end() that
return a Utf8CodepointIterator, which parses UTF-8-encoded Unicode
codepoints and returns them as 32-bit integers.

This is the first step towards supporting emojis in Serenity ^)
https://github.com/SerenityOS/serenity/issues/490

											
										
										
											2019-08-27 21:57:15 +00:00
+								        i++;
 								    }
 								    EXPECT_EQ(i, expected_size);
 								}
 								TEST_CASE(decode_utf8)
 								{
-												AK: Make Utf8View constructors inline and remove C string constructor

Using StringView instead of C strings is basically always preferable.
The only reason to use a C string is because you are calling a C API.

											
										
										
											2021-09-18 16:02:41 +00:00
+								    Utf8View utf8 { "Привет, мир! 😀 γειά σου κόσμος こんにちは世界"sv };
-												AK: Add a way to get the number of valid bytes in a Utf8View

											
										
										
											2020-05-18 09:15:18 +00:00
+								    size_t valid_bytes;
 								    EXPECT(utf8.validate(valid_bytes));
 								    EXPECT(valid_bytes == (size_t)utf8.byte_length());
-												AK: Add a Utf8View type for iterating over UTF-8 codepoints

Utf8View wraps a StringView and implements begin() and end() that
return a Utf8CodepointIterator, which parses UTF-8-encoded Unicode
codepoints and returns them as 32-bit integers.

This is the first step towards supporting emojis in Serenity ^)
https://github.com/SerenityOS/serenity/issues/490

											
										
										
											2019-08-27 21:57:15 +00:00
 								    u32 expected[] = { 1055, 1088, 1080, 1074, 1077, 1090, 44, 32, 1084, 1080, 1088, 33, 32, 128512, 32, 947, 949, 953, 940, 32, 963, 959, 965, 32, 954, 972, 963, 956, 959, 962, 32, 12371, 12435, 12395, 12385, 12399, 19990, 30028 };
-												Everywhere: Rename {Deprecated => Byte}String

This commit un-deprecates DeprecatedString, and repurposes it as a byte
string.
As the null state has already been removed, there are no other
particularly hairy blockers in repurposing this type as a byte string
(what it _really_ is).

This commit is auto-generated:
  $ xs=$(ack -l \bDeprecatedString\b\|deprecated_string AK Userland \
    Meta Ports Ladybird Tests Kernel)
  $ perl -pie 's/\bDeprecatedString\b/ByteString/g;
    s/deprecated_string/byte_string/g' $xs
  $ clang-format --style=file -i \
    $(git diff --name-only | grep \.cpp\|\.h)
  $ gn format $(git ls-files '*.gn' '*.gni')

											
										
										
											2023-12-16 14:19:34 +00:00
+								    ByteString expected_underlying_bytes[] = { "П", "р", "и", "в", "е", "т", ",", " ", "м", "и", "р", "!", " ", "😀", " ", "γ", "ε", "ι", "ά", " ", "σ", "ο", "υ", " ", "κ", "ό", "σ", "μ", "ο", "ς", " ", "こ", "ん", "に", "ち", "は", "世", "界" };
-												AK: Add a Utf8View type for iterating over UTF-8 codepoints

Utf8View wraps a StringView and implements begin() and end() that
return a Utf8CodepointIterator, which parses UTF-8-encoded Unicode
codepoints and returns them as 32-bit integers.

This is the first step towards supporting emojis in Serenity ^)
https://github.com/SerenityOS/serenity/issues/490

											
										
										
											2019-08-27 21:57:15 +00:00
+								    size_t expected_size = sizeof(expected) / sizeof(expected[0]);
 								    size_t i = 0;
-												AK: Do not VERIFY on invalid code point bytes in UTF8View

The previous behavior was to always VERIFY that the UTF-8 bytes were
valid when iterating over the code points of an UTF8View. This change
makes it so we instead output the 0xFFFD 'REPLACEMENT CHARACTER'
code point when encountering invalid bytes, and keep iterating the
view after skipping one byte.

Leaving the decision to the consumer would break symmetry with the
UTF32View API, which would in turn require heavy refactoring and/or
code duplication in generic code such as the one found in
Gfx::Painter and the Shell.

To make it easier for the consumers to detect the original bytes, we
provide a new method on the iterator that returns a Span over the
data that has been decoded. This method is immediately used in the
TextNode::compute_text_for_rendering method, which previously did
this in an ad-hoc way.

This also adds tests for the new behavior in TestUtf8.cpp, as well
as reinforcements to the existing tests to check if the underlying
bytes match up with their expected values.

											
										
										
											2021-05-30 16:52:24 +00:00
+								    for (auto it = utf8.begin(); it != utf8.end(); ++it) {
 								        u32 code_point = *it;
-												Everywhere: Rename ASSERT => VERIFY

(...and ASSERT_NOT_REACHED => VERIFY_NOT_REACHED)

Since all of these checks are done in release builds as well,
let's rename them to VERIFY to prevent confusion, as everyone is
used to assertions being compiled out in release.

We can introduce a new ASSERT macro that is specifically for debug
checks, but I'm doing this wholesale conversion first since we've
accumulated thousands of these already, and it's not immediately
obvious which ones are suitable for ASSERT.

											
										
										
											2021-02-23 19:42:32 +00:00
+								        VERIFY(i < expected_size);
-												Unicode: Try s/codepoint/code_point/g again

This time, without trailing 's'. Ran:

    git grep -l 'codepoint' | xargs sed -ie 's/codepoint/code_point/g

											
										
										
											2020-08-05 20:31:20 +00:00
+								        EXPECT_EQ(code_point, expected[i]);
-												AK: Do not VERIFY on invalid code point bytes in UTF8View

The previous behavior was to always VERIFY that the UTF-8 bytes were
valid when iterating over the code points of an UTF8View. This change
makes it so we instead output the 0xFFFD 'REPLACEMENT CHARACTER'
code point when encountering invalid bytes, and keep iterating the
view after skipping one byte.

Leaving the decision to the consumer would break symmetry with the
UTF32View API, which would in turn require heavy refactoring and/or
code duplication in generic code such as the one found in
Gfx::Painter and the Shell.

To make it easier for the consumers to detect the original bytes, we
provide a new method on the iterator that returns a Span over the
data that has been decoded. This method is immediately used in the
TextNode::compute_text_for_rendering method, which previously did
this in an ad-hoc way.

This also adds tests for the new behavior in TestUtf8.cpp, as well
as reinforcements to the existing tests to check if the underlying
bytes match up with their expected values.

											
										
										
											2021-05-30 16:52:24 +00:00
+								        EXPECT_EQ(it.underlying_code_point_bytes(), expected_underlying_bytes[i].bytes());
-												AK: Add a Utf8View type for iterating over UTF-8 codepoints

Utf8View wraps a StringView and implements begin() and end() that
return a Utf8CodepointIterator, which parses UTF-8-encoded Unicode
codepoints and returns them as 32-bit integers.

This is the first step towards supporting emojis in Serenity ^)
https://github.com/SerenityOS/serenity/issues/490

											
										
										
											2019-08-27 21:57:15 +00:00
+								        i++;
 								    }
 								    EXPECT_EQ(i, expected_size);
 								}
-												AK: Explicitly check for null data in Utf8View

The underlying CPU-specific instructions for operating on UTF-8 strings
behave differently for null inputs. Add an explicit check for this state
for consistency.

											
										
										
											2024-07-21 11:48:27 +00:00
+								TEST_CASE(null_view)
 								{
 								    Utf8View view;
 								    EXPECT(view.validate(Utf8View::AllowSurrogates::No));
 								    EXPECT(view.validate(Utf8View::AllowSurrogates::Yes));
 								    EXPECT_EQ(view.byte_length(), 0zu);
 								    EXPECT_EQ(view.length(), 0zu);
 								    for ([[maybe_unused]] auto it : view)
 								        FAIL("Iterating a null UTF-8 string should not produce any values");
 								}
-												AK: Add a Utf8View type for iterating over UTF-8 codepoints

Utf8View wraps a StringView and implements begin() and end() that
return a Utf8CodepointIterator, which parses UTF-8-encoded Unicode
codepoints and returns them as 32-bit integers.

This is the first step towards supporting emojis in Serenity ^)
https://github.com/SerenityOS/serenity/issues/490

											
										
										
											2019-08-27 21:57:15 +00:00
+								TEST_CASE(validate_invalid_ut8)
 								{
-												AK: Add a way to get the number of valid bytes in a Utf8View

											
										
										
											2020-05-18 09:15:18 +00:00
+								    size_t valid_bytes;
-												Everywhere: Explicitly specify the size in StringView constructors

This commit moves the length calculations out to be directly on the
StringView users. This is an important step towards the goal of removing
StringView(char const*), as it moves the responsibility of calculating
the size of the string to the user of the StringView (which will prevent
naive uses causing OOB access).

											
										
										
											2022-07-11 19:53:29 +00:00
+								    char invalid_utf8_1[] = { 42, 35, (char)182, 9 };
 								    Utf8View utf8_1 { StringView { invalid_utf8_1, 4 } };
-												AK: Add a way to get the number of valid bytes in a Utf8View

											
										
										
											2020-05-18 09:15:18 +00:00
+								    EXPECT(!utf8_1.validate(valid_bytes));
 								    EXPECT(valid_bytes == 2);
-												AK: Add a Utf8View type for iterating over UTF-8 codepoints

Utf8View wraps a StringView and implements begin() and end() that
return a Utf8CodepointIterator, which parses UTF-8-encoded Unicode
codepoints and returns them as 32-bit integers.

This is the first step towards supporting emojis in Serenity ^)
https://github.com/SerenityOS/serenity/issues/490

											
										
										
											2019-08-27 21:57:15 +00:00
-												Everywhere: Explicitly specify the size in StringView constructors

This commit moves the length calculations out to be directly on the
StringView users. This is an important step towards the goal of removing
StringView(char const*), as it moves the responsibility of calculating
the size of the string to the user of the StringView (which will prevent
naive uses causing OOB access).

											
										
										
											2022-07-11 19:53:29 +00:00
+								    char invalid_utf8_2[] = { 42, 35, (char)208, (char)208 };
 								    Utf8View utf8_2 { StringView { invalid_utf8_2, 4 } };
-												AK: Add a way to get the number of valid bytes in a Utf8View

											
										
										
											2020-05-18 09:15:18 +00:00
+								    EXPECT(!utf8_2.validate(valid_bytes));
 								    EXPECT(valid_bytes == 2);
-												AK: Add a Utf8View type for iterating over UTF-8 codepoints

Utf8View wraps a StringView and implements begin() and end() that
return a Utf8CodepointIterator, which parses UTF-8-encoded Unicode
codepoints and returns them as 32-bit integers.

This is the first step towards supporting emojis in Serenity ^)
https://github.com/SerenityOS/serenity/issues/490

											
										
										
											2019-08-27 21:57:15 +00:00
-												Everywhere: Explicitly specify the size in StringView constructors

This commit moves the length calculations out to be directly on the
StringView users. This is an important step towards the goal of removing
StringView(char const*), as it moves the responsibility of calculating
the size of the string to the user of the StringView (which will prevent
naive uses causing OOB access).

											
										
										
											2022-07-11 19:53:29 +00:00
+								    char invalid_utf8_3[] = { (char)208 };
 								    Utf8View utf8_3 { StringView { invalid_utf8_3, 1 } };
-												AK: Add a way to get the number of valid bytes in a Utf8View

											
										
										
											2020-05-18 09:15:18 +00:00
+								    EXPECT(!utf8_3.validate(valid_bytes));
 								    EXPECT(valid_bytes == 0);
-												AK: Add a Utf8View type for iterating over UTF-8 codepoints

Utf8View wraps a StringView and implements begin() and end() that
return a Utf8CodepointIterator, which parses UTF-8-encoded Unicode
codepoints and returns them as 32-bit integers.

This is the first step towards supporting emojis in Serenity ^)
https://github.com/SerenityOS/serenity/issues/490

											
										
										
											2019-08-27 21:57:15 +00:00
-												Everywhere: Explicitly specify the size in StringView constructors

This commit moves the length calculations out to be directly on the
StringView users. This is an important step towards the goal of removing
StringView(char const*), as it moves the responsibility of calculating
the size of the string to the user of the StringView (which will prevent
naive uses causing OOB access).

											
										
										
											2022-07-11 19:53:29 +00:00
+								    char invalid_utf8_4[] = { (char)208, 35 };
 								    Utf8View utf8_4 { StringView { invalid_utf8_4, 2 } };
-												AK: Add a way to get the number of valid bytes in a Utf8View

											
										
										
											2020-05-18 09:15:18 +00:00
+								    EXPECT(!utf8_4.validate(valid_bytes));
 								    EXPECT(valid_bytes == 0);
-												AK: Invalidate UTF-8 encoded code points larger than U+10ffff

On oss-fuzz, the LibJS REPL is provided a file encoded with Windows-1252
with the following contents:

    /ô¡°½/

The REPL assumes the input file is UTF-8. So in Windows-1252, the above
is represented as [0x2f 0xf4 0xa1 0xb0 0xbd 0x2f]. The inner 4 bytes are
actually a valid UTF-8 encoding if we only look at the most significant
bits to parse leading/continuation bytes. However, it decodes to the
code point U+121c3d, which is not a valid code point.

This commit adds additional validation to ensure the decoded code point
itself is also valid.

											
										
										
											2022-04-03 13:03:14 +00:00
-												Everywhere: Explicitly specify the size in StringView constructors

This commit moves the length calculations out to be directly on the
StringView users. This is an important step towards the goal of removing
StringView(char const*), as it moves the responsibility of calculating
the size of the string to the user of the StringView (which will prevent
naive uses causing OOB access).

											
										
										
											2022-07-11 19:53:29 +00:00
+								    char invalid_utf8_5[] = { (char)0xf4, (char)0x8f, (char)0xbf, (char)0xc0 }; // U+110000
 								    Utf8View utf8_5 { StringView { invalid_utf8_5, 4 } };
-												AK: Invalidate UTF-8 encoded code points larger than U+10ffff

On oss-fuzz, the LibJS REPL is provided a file encoded with Windows-1252
with the following contents:

    /ô¡°½/

The REPL assumes the input file is UTF-8. So in Windows-1252, the above
is represented as [0x2f 0xf4 0xa1 0xb0 0xbd 0x2f]. The inner 4 bytes are
actually a valid UTF-8 encoding if we only look at the most significant
bits to parse leading/continuation bytes. However, it decodes to the
code point U+121c3d, which is not a valid code point.

This commit adds additional validation to ensure the decoded code point
itself is also valid.

											
										
										
											2022-04-03 13:03:14 +00:00
+								    EXPECT(!utf8_5.validate(valid_bytes));
 								    EXPECT(valid_bytes == 0);
-												Everywhere: Explicitly specify the size in StringView constructors

This commit moves the length calculations out to be directly on the
StringView users. This is an important step towards the goal of removing
StringView(char const*), as it moves the responsibility of calculating
the size of the string to the user of the StringView (which will prevent
naive uses causing OOB access).

											
										
										
											2022-07-11 19:53:29 +00:00
+								    char invalid_utf8_6[] = { (char)0xf4, (char)0xa1, (char)0xb0, (char)0xbd }; // U+121c3d
 								    Utf8View utf8_6 { StringView { invalid_utf8_6, 4 } };
-												AK: Invalidate UTF-8 encoded code points larger than U+10ffff

On oss-fuzz, the LibJS REPL is provided a file encoded with Windows-1252
with the following contents:

    /ô¡°½/

The REPL assumes the input file is UTF-8. So in Windows-1252, the above
is represented as [0x2f 0xf4 0xa1 0xb0 0xbd 0x2f]. The inner 4 bytes are
actually a valid UTF-8 encoding if we only look at the most significant
bits to parse leading/continuation bytes. However, it decodes to the
code point U+121c3d, which is not a valid code point.

This commit adds additional validation to ensure the decoded code point
itself is also valid.

											
										
										
											2022-04-03 13:03:14 +00:00
+								    EXPECT(!utf8_6.validate(valid_bytes));
 								    EXPECT(valid_bytes == 0);
-												AK: Add `AllowSurrogates` to UTF-8 validator

The [UTF-8](https://datatracker.ietf.org/doc/html/rfc3629#page-5)
standard says to reject strings with upper or lower surrogates. However,
in many standards, ECMAScript included, unpaired surrogates (and
therefore UTF-8 surrogates) are allowed in strings. So, this commit
extends the UTF-8 validation API with `AllowSurrogates`, which will
reject upper and lower surrogate characters.

											
										
										
											2024-06-07 14:25:39 +00:00
 								    char invalid_utf8_7[] = { (char)0xed, (char)0xa0, (char)0x80 }; // U+d800
 								    Utf8View utf8_7 { StringView { invalid_utf8_7, 3 } };
 								    EXPECT(!utf8_7.validate(valid_bytes, Utf8View::AllowSurrogates::No));
 								    EXPECT(valid_bytes == 0);
-												AK: Add a Utf8View type for iterating over UTF-8 codepoints

Utf8View wraps a StringView and implements begin() and end() that
return a Utf8CodepointIterator, which parses UTF-8-encoded Unicode
codepoints and returns them as 32-bit integers.

This is the first step towards supporting emojis in Serenity ^)
https://github.com/SerenityOS/serenity/issues/490

											
										
										
											2019-08-27 21:57:15 +00:00
+								}
-												AK: Implement Utf8CodepointIterator::peek(size_t)

This adds a peek method for Utf8CodepointIterator, which enables it to
be used in some parsing cases where peeking is necessary.

peek(0) is equivalent to operator*, except that peek() does not contain
any assertions and will just return an empty Optional<u32>.

This also implements a test case for iterating UTF-8.

											
										
										
											2021-05-23 22:29:16 +00:00
-												AK: Invalidate overlong UTF-8 code point encodings

For example, the code point U+002F could be encoded as UTF-8 with the
bytes 0xC0 0xAF. This trick has historically been used to bypass
security checks.

											
										
										
											2023-03-03 13:40:12 +00:00
+								TEST_CASE(validate_overlong_utf8)
 								{
 								    // Every input below is two ASCII bytes (42, 35) followed by an overlong
 								    // encoding. Validation must fail in each case and report exactly the two
 								    // leading bytes as valid.
 								    size_t valid_bytes = 0;
 								    // Overlong 2-byte encoding of U+002F
 								    char invalid_utf8_1[] = { 42, 35, static_cast<char>(0xc0), static_cast<char>(0xaf) };
 								    Utf8View utf8_1 { StringView { invalid_utf8_1, sizeof(invalid_utf8_1) } };
 								    EXPECT(!utf8_1.validate(valid_bytes));
 								    EXPECT(valid_bytes == 2);
 								    // Overlong 3-byte encoding of U+002F
 								    char invalid_utf8_2[] = { 42, 35, static_cast<char>(0xe0), static_cast<char>(0x80), static_cast<char>(0xaf) };
 								    Utf8View utf8_2 { StringView { invalid_utf8_2, sizeof(invalid_utf8_2) } };
 								    EXPECT(!utf8_2.validate(valid_bytes));
 								    EXPECT(valid_bytes == 2);
 								    // Overlong 4-byte encoding of U+002F
 								    char invalid_utf8_3[] = { 42, 35, static_cast<char>(0xf0), static_cast<char>(0x80), static_cast<char>(0x80), static_cast<char>(0xaf) };
 								    Utf8View utf8_3 { StringView { invalid_utf8_3, sizeof(invalid_utf8_3) } };
 								    EXPECT(!utf8_3.validate(valid_bytes));
 								    EXPECT(valid_bytes == 2);
 								    // Overlong 3-byte encoding of U+00FF
 								    char invalid_utf8_4[] = { 42, 35, static_cast<char>(0xe0), static_cast<char>(0x83), static_cast<char>(0xbf) };
 								    Utf8View utf8_4 { StringView { invalid_utf8_4, sizeof(invalid_utf8_4) } };
 								    EXPECT(!utf8_4.validate(valid_bytes));
 								    EXPECT(valid_bytes == 2);
 								    // Overlong 4-byte encoding of U+00FF
 								    char invalid_utf8_5[] = { 42, 35, static_cast<char>(0xf0), static_cast<char>(0x80), static_cast<char>(0x83), static_cast<char>(0xbf) };
 								    Utf8View utf8_5 { StringView { invalid_utf8_5, sizeof(invalid_utf8_5) } };
 								    EXPECT(!utf8_5.validate(valid_bytes));
 								    EXPECT(valid_bytes == 2);
 								    // Overlong 4-byte encoding of U+FFFF (payload bits 000 001111 111111 111111)
 								    char invalid_utf8_6[] = { 42, 35, static_cast<char>(0xf0), static_cast<char>(0x8f), static_cast<char>(0xbf), static_cast<char>(0xbf) };
 								    Utf8View utf8_6 { StringView { invalid_utf8_6, sizeof(invalid_utf8_6) } };
 								    EXPECT(!utf8_6.validate(valid_bytes));
 								    EXPECT(valid_bytes == 2);
 								}
-												AK: Implement Utf8CodepointIterator::peek(size_t)

This adds a peek method for Utf8CodepointIterator, which enables it to
be used in some parsing cases where peeking is necessary.

peek(0) is equivalent to operator*, except that peek() does not contain
any assertions and will just return an empty Optional<u32>.

This also implements a test case for iterating UTF-8.

											
										
										
											2021-05-23 22:29:16 +00:00
+								TEST_CASE(iterate_utf8)
 								{
-												AK: Make Utf8View constructors inline and remove C string constructor

Using StringView instead of C strings is basically always preferable.
The only reason to use a C string is because you are calling a C API.

											
										
										
											2021-09-18 16:02:41 +00:00
+								    Utf8View view("Some weird characters \u00A9\u266A\uA755"sv);
-												AK: Rename Utf8CodepointIterator => Utf8CodePointIterator

											
										
										
											2021-06-01 07:45:52 +00:00
+								    Utf8CodePointIterator iterator = view.begin();
-												AK: Implement Utf8CodepointIterator::peek(size_t)

This adds a peek method for Utf8CodepointIterator, which enables it to
be used in some parsing cases where peeking is necessary.

peek(0) is equivalent to operator*, except that peek() does not contain
any assertions and will just return an empty Optional<u32>.

This also implements a test case for iterating UTF-8.

											
										
										
											2021-05-23 22:29:16 +00:00
 								    EXPECT(*iterator == 'S');
 								    EXPECT(iterator.peek().has_value() && iterator.peek().value() == 'S');
 								    EXPECT(iterator.peek(0).has_value() && iterator.peek(0).value() == 'S');
 								    EXPECT(iterator.peek(1).has_value() && iterator.peek(1).value() == 'o');
 								    EXPECT(iterator.peek(22).has_value() && iterator.peek(22).value() == 0x00A9);
 								    EXPECT(iterator.peek(24).has_value() && iterator.peek(24).value() == 0xA755);
 								    EXPECT(!iterator.peek(25).has_value());
 								    ++iterator;
 								    EXPECT(*iterator == 'o');
 								    EXPECT(iterator.peek(23).has_value() && iterator.peek(23).value() == 0xA755);
 								    for (size_t i = 0; i < 23; ++i)
 								        ++iterator;
 								    EXPECT(!iterator.done());
 								    EXPECT(*iterator == 0xA755);
 								    EXPECT(iterator.peek().has_value() && iterator.peek().value() == 0xA755);
 								    EXPECT(!iterator.peek(1).has_value());
 								    ++iterator;
 								    EXPECT(iterator.done());
 								    EXPECT(!iterator.peek(0).has_value());
-												AK: Rename Utf8CodepointIterator => Utf8CodePointIterator

											
										
										
											2021-06-01 07:45:52 +00:00
+								    EXPECT_CRASH("Dereferencing Utf8CodePointIterator which is already done.", [&iterator] {
-												AK: Implement Utf8CodepointIterator::peek(size_t)

This adds a peek method for Utf8CodepointIterator, which enables it to
be used in some parsing cases where peeking is necessary.

peek(0) is equivalent to operator*, except that peek() does not contain
any assertions and will just return an empty Optional<u32>.

This also implements a test case for iterating UTF-8.

											
										
										
											2021-05-23 22:29:16 +00:00
+								        *iterator;
 								        return Test::Crash::Failure::DidNotCrash;
 								    });
 								}
-												AK: Do not VERIFY on invalid code point bytes in UTF8View

The previous behavior was to always VERIFY that the UTF-8 bytes were
valid when iterating over the code points of an UTF8View. This change
makes it so we instead output the 0xFFFD 'REPLACEMENT CHARACTER'
code point when encountering invalid bytes, and keep iterating the
view after skipping one byte.

Leaving the decision to the consumer would break symmetry with the
UTF32View API, which would in turn require heavy refactoring and/or
code duplication in generic code such as the one found in
Gfx::Painter and the Shell.

To make it easier for the consumers to detect the original bytes, we
provide a new method on the iterator that returns a Span over the
data that has been decoded. This method is immediately used in the
TextNode::compute_text_for_rendering method, which previously did
this in an ad-hoc way.

This also adds tests for the new behavior in TestUtf8.cpp, as well
as reinforcements to the existing tests to check if the underlying
bytes match up with their expected values.

											
										
										
											2021-05-30 16:52:24 +00:00
 								TEST_CASE(decode_invalid_ut8)
 								{
 								    // Test case 1 : Getting an extension byte as first byte of the code point
 								    {
-												Everywhere: Explicitly specify the size in StringView constructors

This commit moves the length calculations out to be directly on the
StringView users. This is an important step towards the goal of removing
StringView(char const*), as it moves the responsibility of calculating
the size of the string to the user of the StringView (which will prevent
naive uses causing OOB access).

											
										
										
											2022-07-11 19:53:29 +00:00
+								        char raw_data[] = { 'a', 'b', (char)0xA0, 'd' };
 								        Utf8View view { StringView { raw_data, 4 } };
-												AK: Do not VERIFY on invalid code point bytes in UTF8View

The previous behavior was to always VERIFY that the UTF-8 bytes were
valid when iterating over the code points of an UTF8View. This change
makes it so we instead output the 0xFFFD 'REPLACEMENT CHARACTER'
code point when encountering invalid bytes, and keep iterating the
view after skipping one byte.

Leaving the decision to the consumer would break symmetry with the
UTF32View API, which would in turn require heavy refactoring and/or
code duplication in generic code such as the one found in
Gfx::Painter and the Shell.

To make it easier for the consumers to detect the original bytes, we
provide a new method on the iterator that returns a Span over the
data that has been decoded. This method is immediately used in the
TextNode::compute_text_for_rendering method, which previously did
this in an ad-hoc way.

This also adds tests for the new behavior in TestUtf8.cpp, as well
as reinforcements to the existing tests to check if the underlying
bytes match up with their expected values.

											
										
										
											2021-05-30 16:52:24 +00:00
+								        u32 expected_characters[] = { 'a', 'b', 0xFFFD, 'd' };
-												Everywhere: Rename {Deprecated => Byte}String

This commit un-deprecates DeprecatedString, and repurposes it as a byte
string.
As the null state has already been removed, there are no other
particularly hairy blockers in repurposing this type as a byte string
(what it _really_ is).

This commit is auto-generated:
  $ xs=$(ack -l \bDeprecatedString\b\|deprecated_string AK Userland \
    Meta Ports Ladybird Tests Kernel)
  $ perl -pie 's/\bDeprecatedString\b/ByteString/g;
    s/deprecated_string/byte_string/g' $xs
  $ clang-format --style=file -i \
    $(git diff --name-only | grep \.cpp\|\.h)
  $ gn format $(git ls-files '*.gn' '*.gni')

											
										
										
											2023-12-16 14:19:34 +00:00
+								        ByteString expected_underlying_bytes[] = { "a", "b", "\xA0", "d" };
-												AK: Do not VERIFY on invalid code point bytes in UTF8View

The previous behavior was to always VERIFY that the UTF-8 bytes were
valid when iterating over the code points of an UTF8View. This change
makes it so we instead output the 0xFFFD 'REPLACEMENT CHARACTER'
code point when encountering invalid bytes, and keep iterating the
view after skipping one byte.

Leaving the decision to the consumer would break symmetry with the
UTF32View API, which would in turn require heavy refactoring and/or
code duplication in generic code such as the one found in
Gfx::Painter and the Shell.

To make it easier for the consumers to detect the original bytes, we
provide a new method on the iterator that returns a Span over the
data that has been decoded. This method is immediately used in the
TextNode::compute_text_for_rendering method, which previously did
this in an ad-hoc way.

This also adds tests for the new behavior in TestUtf8.cpp, as well
as reinforcements to the existing tests to check if the underlying
bytes match up with their expected values.

											
										
										
											2021-05-30 16:52:24 +00:00
+								        size_t expected_size = sizeof(expected_characters) / sizeof(expected_characters[0]);
 								        size_t i = 0;
 								        for (auto it = view.begin(); it != view.end(); ++it) {
 								            u32 code_point = *it;
 								            VERIFY(i < expected_size);
 								            EXPECT_EQ(code_point, expected_characters[i]);
 								            EXPECT_EQ(it.underlying_code_point_bytes(), expected_underlying_bytes[i].bytes());
 								            i++;
 								        }
 								        VERIFY(i == expected_size);
 								    }
 								    // Test case 2 : Getting a non-extension byte when an extension byte is expected
 								    {
-												Everywhere: Explicitly specify the size in StringView constructors

This commit moves the length calculations out to be directly on the
StringView users. This is an important step towards the goal of removing
StringView(char const*), as it moves the responsibility of calculating
the size of the string to the user of the StringView (which will prevent
naive uses causing OOB access).

											
										
										
											2022-07-11 19:53:29 +00:00
+								        char raw_data[] = { 'a', 'b', (char)0xC0, 'd', 'e' };
 								        Utf8View view { StringView { raw_data, 5 } };
-												AK: Do not VERIFY on invalid code point bytes in UTF8View

The previous behavior was to always VERIFY that the UTF-8 bytes were
valid when iterating over the code points of an UTF8View. This change
makes it so we instead output the 0xFFFD 'REPLACEMENT CHARACTER'
code point when encountering invalid bytes, and keep iterating the
view after skipping one byte.

Leaving the decision to the consumer would break symmetry with the
UTF32View API, which would in turn require heavy refactoring and/or
code duplication in generic code such as the one found in
Gfx::Painter and the Shell.

To make it easier for the consumers to detect the original bytes, we
provide a new method on the iterator that returns a Span over the
data that has been decoded. This method is immediately used in the
TextNode::compute_text_for_rendering method, which previously did
this in an ad-hoc way.

This also adds tests for the new behavior in TestUtf8.cpp, as well
as reinforcements to the existing tests to check if the underlying
bytes match up with their expected values.

											
										
										
											2021-05-30 16:52:24 +00:00
+								        u32 expected_characters[] = { 'a', 'b', 0xFFFD, 'd', 'e' };
-												Everywhere: Rename {Deprecated => Byte}String

This commit un-deprecates DeprecatedString, and repurposes it as a byte
string.
As the null state has already been removed, there are no other
particularly hairy blockers in repurposing this type as a byte string
(what it _really_ is).

This commit is auto-generated:
  $ xs=$(ack -l \bDeprecatedString\b\|deprecated_string AK Userland \
    Meta Ports Ladybird Tests Kernel)
  $ perl -pie 's/\bDeprecatedString\b/ByteString/g;
    s/deprecated_string/byte_string/g' $xs
  $ clang-format --style=file -i \
    $(git diff --name-only | grep \.cpp\|\.h)
  $ gn format $(git ls-files '*.gn' '*.gni')

											
										
										
											2023-12-16 14:19:34 +00:00
+								        ByteString expected_underlying_bytes[] = { "a", "b", "\xC0", "d", "e" };
-												AK: Do not VERIFY on invalid code point bytes in UTF8View

The previous behavior was to always VERIFY that the UTF-8 bytes were
valid when iterating over the code points of an UTF8View. This change
makes it so we instead output the 0xFFFD 'REPLACEMENT CHARACTER'
code point when encountering invalid bytes, and keep iterating the
view after skipping one byte.

Leaving the decision to the consumer would break symmetry with the
UTF32View API, which would in turn require heavy refactoring and/or
code duplication in generic code such as the one found in
Gfx::Painter and the Shell.

To make it easier for the consumers to detect the original bytes, we
provide a new method on the iterator that returns a Span over the
data that has been decoded. This method is immediately used in the
TextNode::compute_text_for_rendering method, which previously did
this in an ad-hoc way.

This also adds tests for the new behavior in TestUtf8.cpp, as well
as reinforcements to the existing tests to check if the underlying
bytes match up with their expected values.

											
										
										
											2021-05-30 16:52:24 +00:00
+								        size_t expected_size = sizeof(expected_characters) / sizeof(expected_characters[0]);
 								        size_t i = 0;
 								        for (auto it = view.begin(); it != view.end(); ++it) {
 								            u32 code_point = *it;
 								            VERIFY(i < expected_size);
 								            EXPECT_EQ(code_point, expected_characters[i]);
 								            EXPECT_EQ(it.underlying_code_point_bytes(), expected_underlying_bytes[i].bytes());
 								            i++;
 								        }
 								        VERIFY(i == expected_size);
 								    }
 								    // Test case 3 : Not enough bytes before the end of the string
 								    {
-												Everywhere: Explicitly specify the size in StringView constructors

This commit moves the length calculations out to be directly on the
StringView users. This is an important step towards the goal of removing
StringView(char const*), as it moves the responsibility of calculating
the size of the string to the user of the StringView (which will prevent
naive uses causing OOB access).

											
										
										
											2022-07-11 19:53:29 +00:00
+								        char raw_data[] = { 'a', 'b', (char)0x90, 'd' };
 								        Utf8View view { StringView { raw_data, 4 } };
-												AK: Do not VERIFY on invalid code point bytes in UTF8View

The previous behavior was to always VERIFY that the UTF-8 bytes were
valid when iterating over the code points of an UTF8View. This change
makes it so we instead output the 0xFFFD 'REPLACEMENT CHARACTER'
code point when encountering invalid bytes, and keep iterating the
view after skipping one byte.

Leaving the decision to the consumer would break symmetry with the
UTF32View API, which would in turn require heavy refactoring and/or
code duplication in generic code such as the one found in
Gfx::Painter and the Shell.

To make it easier for the consumers to detect the original bytes, we
provide a new method on the iterator that returns a Span over the
data that has been decoded. This method is immediately used in the
TextNode::compute_text_for_rendering method, which previously did
this in an ad-hoc way.

This also adds tests for the new behavior in TestUtf8.cpp, as well
as reinforcements to the existing tests to check if the underlying
bytes match up with their expected values.

											
										
										
											2021-05-30 16:52:24 +00:00
+								        u32 expected_characters[] = { 'a', 'b', 0xFFFD, 'd' };
-												Everywhere: Rename {Deprecated => Byte}String

This commit un-deprecates DeprecatedString, and repurposes it as a byte
string.
As the null state has already been removed, there are no other
particularly hairy blockers in repurposing this type as a byte string
(what it _really_ is).

This commit is auto-generated:
  $ xs=$(ack -l \bDeprecatedString\b\|deprecated_string AK Userland \
    Meta Ports Ladybird Tests Kernel)
  $ perl -pie 's/\bDeprecatedString\b/ByteString/g;
    s/deprecated_string/byte_string/g' $xs
  $ clang-format --style=file -i \
    $(git diff --name-only | grep \.cpp\|\.h)
  $ gn format $(git ls-files '*.gn' '*.gni')

											
										
										
											2023-12-16 14:19:34 +00:00
+								        ByteString expected_underlying_bytes[] = { "a", "b", "\x90", "d" };
-												AK: Do not VERIFY on invalid code point bytes in UTF8View

The previous behavior was to always VERIFY that the UTF-8 bytes were
valid when iterating over the code points of an UTF8View. This change
makes it so we instead output the 0xFFFD 'REPLACEMENT CHARACTER'
code point when encountering invalid bytes, and keep iterating the
view after skipping one byte.

Leaving the decision to the consumer would break symmetry with the
UTF32View API, which would in turn require heavy refactoring and/or
code duplication in generic code such as the one found in
Gfx::Painter and the Shell.

To make it easier for the consumers to detect the original bytes, we
provide a new method on the iterator that returns a Span over the
data that has been decoded. This method is immediately used in the
TextNode::compute_text_for_rendering method, which previously did
this in an ad-hoc way.

This also adds tests for the new behavior in TestUtf8.cpp, as well
as reinforcements to the existing tests to check if the underlying
bytes match up with their expected values.

											
										
										
											2021-05-30 16:52:24 +00:00
+								        size_t expected_size = sizeof(expected_characters) / sizeof(expected_characters[0]);
 								        size_t i = 0;
 								        for (auto it = view.begin(); it != view.end(); ++it) {
 								            u32 code_point = *it;
 								            VERIFY(i < expected_size);
 								            EXPECT_EQ(code_point, expected_characters[i]);
 								            EXPECT_EQ(it.underlying_code_point_bytes(), expected_underlying_bytes[i].bytes());
 								            i++;
 								        }
 								        VERIFY(i == expected_size);
 								    }
 								    // Test case 4 : Not enough bytes at the end of the string
 								    {
-												Everywhere: Explicitly specify the size in StringView constructors

This commit moves the length calculations out to be directly on the
StringView users. This is an important step towards the goal of removing
StringView(char const*), as it moves the responsibility of calculating
the size of the string to the user of the StringView (which will prevent
naive uses causing OOB access).

											
										
										
											2022-07-11 19:53:29 +00:00
+								        char raw_data[] = { 'a', 'b', 'c', (char)0x90 };
 								        Utf8View view { StringView { raw_data, 4 } };
-												AK: Do not VERIFY on invalid code point bytes in UTF8View

The previous behavior was to always VERIFY that the UTF-8 bytes were
valid when iterating over the code points of an UTF8View. This change
makes it so we instead output the 0xFFFD 'REPLACEMENT CHARACTER'
code point when encountering invalid bytes, and keep iterating the
view after skipping one byte.

Leaving the decision to the consumer would break symmetry with the
UTF32View API, which would in turn require heavy refactoring and/or
code duplication in generic code such as the one found in
Gfx::Painter and the Shell.

To make it easier for the consumers to detect the original bytes, we
provide a new method on the iterator that returns a Span over the
data that has been decoded. This method is immediately used in the
TextNode::compute_text_for_rendering method, which previously did
this in an ad-hoc way.

This also adds tests for the new behavior in TestUtf8.cpp, as well
as reinforcements to the existing tests to check if the underlying
bytes match up with their expected values.

											
										
										
											2021-05-30 16:52:24 +00:00
+								        u32 expected_characters[] = { 'a', 'b', 'c', 0xFFFD };
-												Everywhere: Rename {Deprecated => Byte}String

This commit un-deprecates DeprecatedString, and repurposes it as a byte
string.
As the null state has already been removed, there are no other
particularly hairy blockers in repurposing this type as a byte string
(what it _really_ is).

This commit is auto-generated:
  $ xs=$(ack -l \bDeprecatedString\b\|deprecated_string AK Userland \
    Meta Ports Ladybird Tests Kernel)
  $ perl -pie 's/\bDeprecatedString\b/ByteString/g;
    s/deprecated_string/byte_string/g' $xs
  $ clang-format --style=file -i \
    $(git diff --name-only | grep \.cpp\|\.h)
  $ gn format $(git ls-files '*.gn' '*.gni')

											
										
										
											2023-12-16 14:19:34 +00:00
+								        ByteString expected_underlying_bytes[] = { "a", "b", "c", "\x90" };
-												AK: Do not VERIFY on invalid code point bytes in UTF8View

The previous behavior was to always VERIFY that the UTF-8 bytes were
valid when iterating over the code points of an UTF8View. This change
makes it so we instead output the 0xFFFD 'REPLACEMENT CHARACTER'
code point when encountering invalid bytes, and keep iterating the
view after skipping one byte.

Leaving the decision to the consumer would break symmetry with the
UTF32View API, which would in turn require heavy refactoring and/or
code duplication in generic code such as the one found in
Gfx::Painter and the Shell.

To make it easier for the consumers to detect the original bytes, we
provide a new method on the iterator that returns a Span over the
data that has been decoded. This method is immediately used in the
TextNode::compute_text_for_rendering method, which previously did
this in an ad-hoc way.

This also adds tests for the new behavior in TestUtf8.cpp, as well
as reinforcements to the existing tests to check if the underlying
bytes match up with their expected values.

											
										
										
											2021-05-30 16:52:24 +00:00
+								        size_t expected_size = sizeof(expected_characters) / sizeof(expected_characters[0]);
 								        size_t i = 0;
 								        for (auto it = view.begin(); it != view.end(); ++it) {
 								            u32 code_point = *it;
 								            VERIFY(i < expected_size);
 								            EXPECT_EQ(code_point, expected_characters[i]);
 								            EXPECT_EQ(it.underlying_code_point_bytes(), expected_underlying_bytes[i].bytes());
 								            i++;
 								        }
 								        VERIFY(i == expected_size);
 								    }
-												AK+Tests: Avoid creating invalid code points from malformed UTF-8

Instead of doing anything reasonable, Utf8CodePointIterator returned
invalid code points, for example U+123456. However, many callers of this
iterator assume that a code point is always at most 0x10FFFF.

In fact, this is one of two reasons for the following OSS Fuzz issue:
https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=49184
This is probably a very old bug.

In the particular case of URLParser, AK::is_url_code_point got confused:
    return /* ... */ || code_point >= 0xA0;
If code_point is a "code point" beyond 0x10FFFF, this violates the
condition given in the preceding comment, but satisfies the given
condition, which eventually causes URLParser to crash.

This commit fixes *only* the erroneous UTF-8 decoding, and does not
fully resolve OSS-Fuzz#49184.

											
										
										
											2022-09-12 16:25:20 +00:00
 								    // Test case 5 : Oversized four-byte sequence (e.g. U+123456)
 								    {
 								        // Want to encode: (000)1 0010 0011 0100 0101 0110
 								        // Into mask: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 								        // Shifted:        100   100011   010001   010110
 								        // Result:    11110100 10100011 10010001 10010110
 								        char raw_data[] = { 'a', (char)0xF4, (char)0xA3, (char)0x91, (char)0x96, 'b' };
 								        Utf8View view { StringView { raw_data, 6 } };
 								        // This definition seems to suggest that we should instead output multiple replacement characters:
 								        // https://encoding.spec.whatwg.org/#ref-for-concept-stream-prepend②
 								        // This is supported by the plaintext description and example collection, which annoyingly does not give an example of how to deal with this:
 								        // https://www.unicode.org/versions/Unicode14.0.0/ch03.pdf , section "U+FFFD Substitution of Maximal Subparts"
 								        // However, that would go against how we deal with several other kinds of errors, so we stick to emitting only one U+FFFD.
 								        u32 expected_characters[] = { 'a', 0xFFFD, 'b' };
-												Everywhere: Rename {Deprecated => Byte}String

This commit un-deprecates DeprecatedString, and repurposes it as a byte
string.
As the null state has already been removed, there are no other
particularly hairy blockers in repurposing this type as a byte string
(what it _really_ is).

This commit is auto-generated:
  $ xs=$(ack -l \bDeprecatedString\b\|deprecated_string AK Userland \
    Meta Ports Ladybird Tests Kernel)
  $ perl -pie 's/\bDeprecatedString\b/ByteString/g;
    s/deprecated_string/byte_string/g' $xs
  $ clang-format --style=file -i \
    $(git diff --name-only | grep \.cpp\|\.h)
  $ gn format $(git ls-files '*.gn' '*.gni')

											
										
										
											2023-12-16 14:19:34 +00:00
+								        ByteString expected_underlying_bytes[] = { "a", "\xF4\xA3\x91\x96", "b" };
-												AK+Tests: Avoid creating invalid code points from malformed UTF-8

Instead of doing anything reasonable, Utf8CodePointIterator returned
invalid code points, for example U+123456. However, many callers of this
iterator assume that a code point is always at most 0x10FFFF.

In fact, this is one of two reasons for the following OSS Fuzz issue:
https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=49184
This is probably a very old bug.

In the particular case of URLParser, AK::is_url_code_point got confused:
    return /* ... */ || code_point >= 0xA0;
If code_point is a "code point" beyond 0x10FFFF, this violates the
condition given in the preceding comment, but satisfies the given
condition, which eventually causes URLParser to crash.

This commit fixes *only* the erroneous UTF-8 decoding, and does not
fully resolve OSS-Fuzz#49184.

											
										
										
											2022-09-12 16:25:20 +00:00
+								        size_t expected_size = sizeof(expected_characters) / sizeof(expected_characters[0]);
 								        size_t i = 0;
 								        for (auto it = view.begin(); it != view.end(); ++it) {
 								            u32 code_point = *it;
 								            VERIFY(i < expected_size);
 								            EXPECT_EQ(code_point, expected_characters[i]);
 								            EXPECT_EQ(it.underlying_code_point_bytes(), expected_underlying_bytes[i].bytes());
 								            i++;
 								        }
 								        VERIFY(i == expected_size);
 								    }
-												AK: Do not VERIFY on invalid code point bytes in UTF8View

The previous behavior was to always VERIFY that the UTF-8 bytes were
valid when iterating over the code points of an UTF8View. This change
makes it so we instead output the 0xFFFD 'REPLACEMENT CHARACTER'
code point when encountering invalid bytes, and keep iterating the
view after skipping one byte.

Leaving the decision to the consumer would break symmetry with the
UTF32View API, which would in turn require heavy refactoring and/or
code duplication in generic code such as the one found in
Gfx::Painter and the Shell.

To make it easier for the consumers to detect the original bytes, we
provide a new method on the iterator that returns a Span over the
data that has been decoded. This method is immediately used in the
TextNode::compute_text_for_rendering method, which previously did
this in an ad-hoc way.

This also adds tests for the new behavior in TestUtf8.cpp, as well
as reinforcements to the existing tests to check if the underlying
bytes match up with their expected values.

											
										
										
											2021-05-30 16:52:24 +00:00
+								}
-												AK: Track byte length, rather than code point length, in Utf8View::trim

Utf8View::trim uses Utf8View::substring_view to return its result, which
requires the input to be a byte offset/length rather than code point
length.

											
										
										
											2021-07-16 16:40:46 +00:00
 								TEST_CASE(trim)
 								{
-												AK: Make Utf8View constructors inline and remove C string constructor

Using StringView instead of C strings is basically always preferable.
The only reason to use a C string is because you are calling a C API.

											
										
										
											2021-09-18 16:02:41 +00:00
+								    Utf8View whitespace { " "sv };
-												AK: Track byte length, rather than code point length, in Utf8View::trim

Utf8View::trim uses Utf8View::substring_view to return its result, which
requires the input to be a byte offset/length rather than code point
length.

											
										
										
											2021-07-16 16:40:46 +00:00
+								    {
-												AK: Make Utf8View constructors inline and remove C string constructor

Using StringView instead of C strings is basically always preferable.
The only reason to use a C string is because you are calling a C API.

											
										
										
											2021-09-18 16:02:41 +00:00
+								        Utf8View view { "word"sv };
-												AK: Track byte length, rather than code point length, in Utf8View::trim

Utf8View::trim uses Utf8View::substring_view to return its result, which
requires the input to be a byte offset/length rather than code point
length.

											
										
										
											2021-07-16 16:40:46 +00:00
+								        EXPECT_EQ(view.trim(whitespace, TrimMode::Both).as_string(), "word");
 								        EXPECT_EQ(view.trim(whitespace, TrimMode::Left).as_string(), "word");
 								        EXPECT_EQ(view.trim(whitespace, TrimMode::Right).as_string(), "word");
 								    }
 								    {
-												AK: Make Utf8View constructors inline and remove C string constructor

Using StringView instead of C strings is basically always preferable.
The only reason to use a C string is because you are calling a C API.

											
										
										
											2021-09-18 16:02:41 +00:00
+								        Utf8View view { "   word"sv };
-												AK: Track byte length, rather than code point length, in Utf8View::trim

Utf8View::trim uses Utf8View::substring_view to return its result, which
requires the input to be a byte offset/length rather than code point
length.

											
										
										
											2021-07-16 16:40:46 +00:00
+								        EXPECT_EQ(view.trim(whitespace, TrimMode::Both).as_string(), "word");
 								        EXPECT_EQ(view.trim(whitespace, TrimMode::Left).as_string(), "word");
 								        EXPECT_EQ(view.trim(whitespace, TrimMode::Right).as_string(), "   word");
 								    }
 								    {
-												AK: Make Utf8View constructors inline and remove C string constructor

Using StringView instead of C strings is basically always preferable.
The only reason to use a C string is because you are calling a C API.

											
										
										
											2021-09-18 16:02:41 +00:00
+								        Utf8View view { "word   "sv };
-												AK: Track byte length, rather than code point length, in Utf8View::trim

Utf8View::trim uses Utf8View::substring_view to return its result, which
requires the input to be a byte offset/length rather than code point
length.

											
										
										
											2021-07-16 16:40:46 +00:00
+								        EXPECT_EQ(view.trim(whitespace, TrimMode::Both).as_string(), "word");
 								        EXPECT_EQ(view.trim(whitespace, TrimMode::Left).as_string(), "word   ");
 								        EXPECT_EQ(view.trim(whitespace, TrimMode::Right).as_string(), "word");
 								    }
 								    {
-												AK: Make Utf8View constructors inline and remove C string constructor

Using StringView instead of C strings is basically always preferable.
The only reason to use a C string is because you are calling a C API.

											
										
										
											2021-09-18 16:02:41 +00:00
+								        Utf8View view { "   word   "sv };
-												AK: Track byte length, rather than code point length, in Utf8View::trim

Utf8View::trim uses Utf8View::substring_view to return its result, which
requires the input to be a byte offset/length rather than code point
length.

											
										
										
											2021-07-16 16:40:46 +00:00
+								        EXPECT_EQ(view.trim(whitespace, TrimMode::Both).as_string(), "word");
 								        EXPECT_EQ(view.trim(whitespace, TrimMode::Left).as_string(), "word   ");
 								        EXPECT_EQ(view.trim(whitespace, TrimMode::Right).as_string(), "   word");
 								    }
 								    {
-												AK: Make Utf8View constructors inline and remove C string constructor

Using StringView instead of C strings is basically always preferable.
The only reason to use a C string is because you are calling a C API.

											
										
										
											2021-09-18 16:02:41 +00:00
+								        Utf8View view { "\u180E"sv };
-												AK: Track byte length, rather than code point length, in Utf8View::trim

Utf8View::trim uses Utf8View::substring_view to return its result, which
requires the input to be a byte offset/length rather than code point
length.

											
										
										
											2021-07-16 16:40:46 +00:00
+								        EXPECT_EQ(view.trim(whitespace, TrimMode::Both).as_string(), "\u180E");
 								        EXPECT_EQ(view.trim(whitespace, TrimMode::Left).as_string(), "\u180E");
 								        EXPECT_EQ(view.trim(whitespace, TrimMode::Right).as_string(), "\u180E");
 								    }
 								}
-												AK: Add Utf8View::for_each_split_view() method

Returns one Utf8View at a time, using a callback function to identify
code points to split on.

											
										
										
											2024-11-15 15:43:59 +00:00
 								// Split predicate handed to Utf8View::for_each_split_view below:
 								// reports whether the given code point is an ASCII full stop ('.').
 								static bool is_period(u32 code_point)
 								{
 								    return code_point == '.';
 								}
 								// Splits one fixed string on '.' under each SplitBehavior combination and
 								// compares the produced pieces against the expected sequences.
 								TEST_CASE(for_each_split_view)
 								{
 								    Utf8View view { "...Well..hello.friends!..."sv };
 								    // Runs the split with the requested behavior and returns every piece,
 								    // in callback order, as a StringView over the original bytes.
 								    auto split_with = [&view](auto behavior) {
 								        Vector<StringView> pieces;
 								        view.for_each_split_view(is_period, behavior, [&pieces](auto const& piece) {
 								            pieces.append(piece.as_string());
 								        });
 								        return pieces;
 								    };
 								    // No flags: only the non-empty pieces are expected, without separators.
 								    auto const without_flags = split_with(SplitBehavior::Nothing);
 								    EXPECT_EQ(without_flags,
 								        Vector({ "Well"sv, "hello"sv, "friends!"sv }));
 								    // KeepEmpty: empty pieces around adjacent/leading/trailing separators are expected too.
 								    auto const with_empty = split_with(SplitBehavior::KeepEmpty);
 								    EXPECT_EQ(with_empty,
 								        Vector({ ""sv, ""sv, ""sv, "Well"sv, ""sv, "hello"sv, "friends!"sv, ""sv, ""sv, ""sv }));
 								    // KeepTrailingSeparator: each non-empty piece is expected with its following '.'.
 								    auto const with_separator = split_with(SplitBehavior::KeepTrailingSeparator);
 								    EXPECT_EQ(with_separator,
 								        Vector({ "Well."sv, "hello."sv, "friends!."sv }));
 								    // Both flags combined.
 								    auto const with_empty_and_separator = split_with(SplitBehavior::KeepEmpty | SplitBehavior::KeepTrailingSeparator);
 								    EXPECT_EQ(with_empty_and_separator,
 								        Vector({ "."sv, "."sv, "."sv, "Well."sv, "."sv, "hello."sv, "friends!."sv, "."sv, "."sv, ""sv }));
 								}