mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2024-11-21 23:20:20 +00:00
AK: Add a Utf8View type for iterating over UTF-8 codepoints
Utf8View wraps a StringView and implements begin() and end(), which return a Utf8CodepointIterator that parses UTF-8-encoded Unicode codepoints and returns them as 32-bit integers. This is the first step towards supporting emojis in Serenity :^) See https://github.com/SerenityOS/serenity/issues/490
This commit is contained in:
parent
970e0147f7
commit
5d3696174b
Notes:
sideshowbarker
2024-07-19 12:29:00 +09:00
Author: https://github.com/bugaevc Commit: https://github.com/SerenityOS/serenity/commit/5d3696174be Pull-request: https://github.com/SerenityOS/serenity/pull/494 Reviewed-by: https://github.com/awesomekling
4 changed files with 241 additions and 1 deletions
|
@ -1,4 +1,4 @@
|
|||
PROGRAMS = TestString TestQueue TestVector TestHashMap TestJSON TestWeakPtr TestNonnullRefPtr TestRefPtr TestFixedArray TestFileSystemPath TestURL TestStringView
|
||||
PROGRAMS = TestString TestQueue TestVector TestHashMap TestJSON TestWeakPtr TestNonnullRefPtr TestRefPtr TestFixedArray TestFileSystemPath TestURL TestStringView TestUtf8
|
||||
|
||||
CXXFLAGS = -std=c++17 -Wall -Wextra -ggdb3 -O2 -I../ -I../../
|
||||
|
||||
|
@ -14,6 +14,7 @@ SHARED_TEST_OBJS = \
|
|||
../JsonParser.o \
|
||||
../FileSystemPath.o \
|
||||
../URL.o \
|
||||
../Utf8View.o \
|
||||
|
||||
.cpp.o:
|
||||
@echo "HOST_CXX $<"; $(PRE_CXX) $(CXX) $(CXXFLAGS) -o $@ -c $<
|
||||
|
@ -65,6 +66,9 @@ TestURL: TestURL.o $(SHARED_TEST_OBJS)
|
|||
TestStringView: TestStringView.o $(SHARED_TEST_OBJS)
|
||||
$(PRE_CXX) $(CXX) $(CXXFLAGS) -o $@ TestStringView.o $(SHARED_TEST_OBJS)
|
||||
|
||||
TestUtf8: TestUtf8.o $(SHARED_TEST_OBJS)
|
||||
$(PRE_CXX) $(CXX) $(CXXFLAGS) -o $@ TestUtf8.o $(SHARED_TEST_OBJS)
|
||||
|
||||
clean:
|
||||
rm -f $(SHARED_TEST_OBJS)
|
||||
rm -f $(PROGRAMS)
|
||||
|
|
58
AK/Tests/TestUtf8.cpp
Normal file
58
AK/Tests/TestUtf8.cpp
Normal file
|
@ -0,0 +1,58 @@
|
|||
#include <AK/TestSuite.h>
|
||||
|
||||
#include <AK/Utf8View.h>
|
||||
|
||||
// Iterating a pure-ASCII string must yield exactly one codepoint per byte,
// each equal to the byte's value.
TEST_CASE(decode_ascii)
{
    Utf8View utf8 { "Hello World!11" };
    EXPECT(utf8.validate());

    // "Hello World!11" spelled out as numeric codepoints.
    u32 expected[] = { 72, 101, 108, 108, 111, 32, 87, 111, 114, 108, 100, 33, 49, 49 };
    size_t expected_size = sizeof(expected) / sizeof(u32);

    size_t i = 0;
    for (auto it = utf8.begin(); it != utf8.end(); ++it) {
        u32 codepoint = *it;
        // Guard the array access below: the view must not produce more
        // codepoints than we expect.
        ASSERT(i < expected_size);
        EXPECT_EQ(codepoint, expected[i]);
        ++i;
    }

    // ...and it must not produce fewer either.
    EXPECT_EQ(i, expected_size);
}
|
||||
|
||||
// Iterating a string that mixes 1-, 2-, 3- and 4-byte sequences (Latin,
// Cyrillic, Greek, Japanese and an emoji) must yield the expected codepoints.
TEST_CASE(decode_utf8)
{
    Utf8View utf8 { "Привет, мир! 😀 γειά σου κόσμος こんにちは世界" };
    EXPECT(utf8.validate());

    // The codepoints of the string above, in order.
    u32 expected[] = { 1055, 1088, 1080, 1074, 1077, 1090, 44, 32, 1084, 1080, 1088, 33, 32, 128512, 32, 947, 949, 953, 940, 32, 963, 959, 965, 32, 954, 972, 963, 956, 959, 962, 32, 12371, 12435, 12395, 12385, 12399, 19990, 30028 };
    size_t expected_size = sizeof(expected) / sizeof(u32);

    size_t i = 0;
    for (auto it = utf8.begin(); it != utf8.end(); ++it) {
        u32 codepoint = *it;
        // Guard the array access below: the view must not produce more
        // codepoints than we expect.
        ASSERT(i < expected_size);
        EXPECT_EQ(codepoint, expected[i]);
        ++i;
    }

    // ...and it must not produce fewer either.
    EXPECT_EQ(i, expected_size);
}
|
||||
|
||||
// validate() must reject byte sequences that are not structurally valid UTF-8.
TEST_CASE(validate_invalid_ut8)
{
    // A lone continuation byte (0xB6) where a lead byte is required.
    char invalid_utf8_1[] = { '*', '#', '\xB6', '\t', '\0' };
    Utf8View utf8_1 { invalid_utf8_1 };
    EXPECT(!utf8_1.validate());

    // A 2-byte lead (0xD0) followed by another lead byte instead of a
    // continuation byte.
    char invalid_utf8_2[] = { '*', '#', '\xD0', '\xD0', '\0' };
    Utf8View utf8_2 { invalid_utf8_2 };
    EXPECT(!utf8_2.validate());

    // A 2-byte lead (0xD0) with the string ending before its continuation byte.
    char invalid_utf8_3[] = { '\xD0', '\0' };
    Utf8View utf8_3 { invalid_utf8_3 };
    EXPECT(!utf8_3.validate());

    // A 2-byte lead (0xD0) followed by an ASCII byte instead of a
    // continuation byte.
    char invalid_utf8_4[] = { '\xD0', '#', '\0' };
    Utf8View utf8_4 { invalid_utf8_4 };
    EXPECT(!utf8_4.validate());
}
|
||||
|
||||
TEST_MAIN(UTF8)
|
130
AK/Utf8View.cpp
Normal file
130
AK/Utf8View.cpp
Normal file
|
@ -0,0 +1,130 @@
|
|||
#include <AK/Utf8View.h>
|
||||
|
||||
namespace AK {
|
||||
|
||||
Utf8View::Utf8View(const StringView& string)
|
||||
: m_string(string)
|
||||
{
|
||||
}
|
||||
|
||||
const unsigned char* Utf8View::begin_ptr() const
|
||||
{
|
||||
return (const unsigned char*)m_string.characters_without_null_termination();
|
||||
}
|
||||
|
||||
const unsigned char* Utf8View::end_ptr() const
|
||||
{
|
||||
return (const unsigned char*)m_string.characters_without_null_termination() + m_string.length();
|
||||
}
|
||||
|
||||
// Returns an iterator positioned at the first byte of the string, with the
// whole string length still remaining.
Utf8CodepointIterator Utf8View::begin() const
{
    Utf8CodepointIterator first { begin_ptr(), m_string.length() };
    return first;
}
|
||||
|
||||
// Returns the past-the-end iterator: it points one past the last byte and has
// zero bytes remaining, which is the state a begin() iterator reaches after
// consuming the whole string.
Utf8CodepointIterator Utf8View::end() const
{
    Utf8CodepointIterator past_the_end { end_ptr(), 0 };
    return past_the_end;
}
|
||||
|
||||
static inline bool decode_first_byte(
|
||||
unsigned char byte,
|
||||
int& out_codepoint_length_in_bytes,
|
||||
u32& out_value)
|
||||
{
|
||||
if ((byte & 128) == 0) {
|
||||
out_value = byte;
|
||||
out_codepoint_length_in_bytes = 1;
|
||||
return true;
|
||||
}
|
||||
if ((byte & 64) == 0) {
|
||||
return false;
|
||||
}
|
||||
if ((byte & 32) == 0) {
|
||||
out_value = byte & 31;
|
||||
out_codepoint_length_in_bytes = 2;
|
||||
return true;
|
||||
}
|
||||
if ((byte & 16) == 0) {
|
||||
out_value = byte & 15;
|
||||
out_codepoint_length_in_bytes = 3;
|
||||
return true;
|
||||
}
|
||||
if ((byte & 8) == 0) {
|
||||
out_value = byte & 7;
|
||||
out_codepoint_length_in_bytes = 4;
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
bool Utf8View::validate() const
|
||||
{
|
||||
for (auto ptr = begin_ptr(); ptr < end_ptr(); ptr++) {
|
||||
int codepoint_length_in_bytes;
|
||||
u32 value;
|
||||
bool first_byte_makes_sense = decode_first_byte(*ptr, codepoint_length_in_bytes, value);
|
||||
if (!first_byte_makes_sense)
|
||||
return false;
|
||||
|
||||
for (int i = 1; i < codepoint_length_in_bytes; i++) {
|
||||
ptr++;
|
||||
if (ptr >= end_ptr())
|
||||
return false;
|
||||
if (*ptr >> 6 != 2)
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
Utf8CodepointIterator::Utf8CodepointIterator(const unsigned char* ptr, int length)
|
||||
: m_ptr(ptr)
|
||||
, m_length(length)
|
||||
{
|
||||
}
|
||||
|
||||
bool Utf8CodepointIterator::operator==(const Utf8CodepointIterator& other) const
|
||||
{
|
||||
return m_ptr == other.m_ptr && m_length == other.m_length;
|
||||
}
|
||||
|
||||
bool Utf8CodepointIterator::operator!=(const Utf8CodepointIterator& other) const
|
||||
{
|
||||
return !(*this == other);
|
||||
}
|
||||
|
||||
// Advances the iterator to the first byte of the next codepoint.
//
// Steps over the current byte and then over every continuation byte
// (bit pattern 10xxxxxx) that follows it. The scan stops as soon as no bytes
// remain, so the byte one past the end of the underlying buffer is never
// read; the iterator then compares equal to Utf8View::end().
Utf8CodepointIterator& Utf8CodepointIterator::operator++()
{
    // Incrementing an exhausted (end) iterator is a programming error.
    ASSERT(m_length > 0);

    do {
        m_length--;
        m_ptr++;
        // Test the remaining length first: m_ptr[0] may only be read while
        // it still lies inside the string (the buffer need not be
        // null-terminated).
    } while (m_length > 0 && m_ptr[0] >> 6 == 2);

    return *this;
}
|
||||
|
||||
// Decodes and returns the codepoint at the current position.
//
// The iterator must not be at the end, and the bytes at the current position
// must form a complete, well-formed sequence within the remaining length;
// each of these preconditions is enforced with ASSERT.
u32 Utf8CodepointIterator::operator*() const
{
    ASSERT(m_length > 0);

    int codepoint_length_in_bytes;
    u32 decoded;

    // The lead byte supplies the sequence length and the most significant
    // payload bits.
    const bool first_byte_makes_sense = decode_first_byte(*m_ptr, codepoint_length_in_bytes, decoded);
    ASSERT(first_byte_makes_sense);
    ASSERT(codepoint_length_in_bytes <= m_length);

    // Each continuation byte contributes its low 6 bits.
    int offset = 1;
    while (offset < codepoint_length_in_bytes) {
        ASSERT(m_ptr[offset] >> 6 == 2);
        decoded = (decoded << 6) | (m_ptr[offset] & 0x3F);
        ++offset;
    }

    return decoded;
}
|
||||
|
||||
}
|
48
AK/Utf8View.h
Normal file
48
AK/Utf8View.h
Normal file
|
@ -0,0 +1,48 @@
|
|||
#pragma once
|
||||
|
||||
#include <AK/StringView.h>
|
||||
#include <AK/Types.h>
|
||||
|
||||
namespace AK {
|
||||
|
||||
class Utf8View;
|
||||
|
||||
class Utf8CodepointIterator {
|
||||
friend class Utf8View;
|
||||
|
||||
public:
|
||||
~Utf8CodepointIterator() {}
|
||||
|
||||
bool operator==(const Utf8CodepointIterator&) const;
|
||||
bool operator!=(const Utf8CodepointIterator&) const;
|
||||
Utf8CodepointIterator& operator++();
|
||||
u32 operator*() const;
|
||||
|
||||
private:
|
||||
Utf8CodepointIterator(const unsigned char*, int);
|
||||
const unsigned char* m_ptr { nullptr };
|
||||
int m_length { -1 };
|
||||
};
|
||||
|
||||
class Utf8View {
|
||||
public:
|
||||
explicit Utf8View(const StringView&);
|
||||
~Utf8View() {}
|
||||
|
||||
const StringView& as_string() const { return m_string; }
|
||||
|
||||
Utf8CodepointIterator begin() const;
|
||||
Utf8CodepointIterator end() const;
|
||||
|
||||
bool validate() const;
|
||||
|
||||
private:
|
||||
const unsigned char* begin_ptr() const;
|
||||
const unsigned char* end_ptr() const;
|
||||
|
||||
StringView m_string;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
using AK::Utf8View;
|
Loading…
Reference in a new issue