11 months ago · a2bcb2ab8d
--- a/AK/Utf8View.cpp
+++ b/AK/Utf8View.cpp
@@ -5,11 +5,15 @@
 
				  * SPDX-License-Identifier: BSD-2-Clause
			
 
				  */
			
 
				 
			
 
				+#define AK_DONT_REPLACE_STD
			
 
				+
			
 
				 #include <AK/Assertions.h>
			
 
				 #include <AK/Debug.h>
			
 
				 #include <AK/Format.h>
			
 
				 #include <AK/Utf8View.h>
			
 
				 
			
 
				+#include <simdutf.h>
			
 
				+
			
 
				 namespace AK {
			
 
				 
			
 
				 Utf8CodePointIterator Utf8View::iterator_at_byte_offset(size_t byte_offset) const
			
@@ -72,6 +76,12 @@ Utf8View Utf8View::unicode_substring_view(size_t code_point_offset, size_t code_
 
				 
			
 
				 size_t Utf8View::calculate_length() const
			
 
				 {
			
 
				+    // FIXME: simdutf's code point length method assumes valid UTF-8, whereas Utf8View uses U+FFFD as a replacement
			
 
				+    //        for invalid code points. If we change Utf8View to only accept valid encodings as an invariant, we can
			
 
				+    //        remove this branch.
			
 
				+    if (validate()) [[likely]]
			
 
				+        return simdutf::count_utf8(m_string.characters_without_null_termination(), m_string.length());
			
 
				+
			
 
				     size_t length = 0;
			
 
				 
			
 
				     for (size_t i = 0; i < m_string.length(); ++length) {
			
@@ -143,6 +153,24 @@ Utf8View Utf8View::trim(Utf8View const& characters, TrimMode mode) const
 
				     return substring_view(substring_start, substring_length);
			
 
				 }
			
 
				 
			
 
				+// Checks whether this view holds well-formed UTF-8.
				+//
				+// On return, `valid_bytes` holds the number of leading bytes that were accepted: the full length of the view
				+// on success, or the byte offset of the first rejected sequence on failure.
				+//
				+// simdutf reports an encoded surrogate as an error. When `allow_surrogates` is AllowSurrogates::Yes, each
				+// surrogate is stepped over and validation resumes right behind it. This is done in a loop rather than by
				+// recursing on the remaining substring, so the stack depth does not grow with the number of surrogates in
				+// the input.
				+bool Utf8View::validate(size_t& valid_bytes, AllowSurrogates allow_surrogates) const
				+{
				+    // All surrogates have a UTF-8 byte length of 3.
				+    static constexpr size_t surrogate_byte_length = 3;
				+
				+    auto const* bytes = m_string.characters_without_null_termination();
				+    auto const length = m_string.length();
				+
				+    valid_bytes = 0;
				+
				+    for (;;) {
				+        // `result.count` is relative to the start of the range handed to simdutf, so it is accumulated.
				+        auto result = simdutf::validate_utf8_with_errors(bytes + valid_bytes, length - valid_bytes);
				+        valid_bytes += result.count;
				+
				+        if (result.error == simdutf::SUCCESS)
				+            return true;
				+
				+        // Any error other than a tolerated surrogate is final.
				+        if (result.error != simdutf::SURROGATE || allow_surrogates != AllowSurrogates::Yes)
				+            return false;
				+
				+        valid_bytes += surrogate_byte_length;
				+
				+        // Guards the pointer arithmetic of the next iteration against stepping past the end of the view.
				+        VERIFY(valid_bytes <= length);
				+    }
				+}
			
 
				+
			
 
				 Utf8CodePointIterator& Utf8CodePointIterator::operator++()
			
 
				 {
			
 
				     VERIFY(m_length > 0);
			
--- a/AK/Utf8View.h
+++ b/AK/Utf8View.h
@@ -121,41 +121,13 @@ public:
 
				         return m_length;
			
 
				     }
			
 
				 
			
 
				-    constexpr bool validate(AllowSurrogates surrogates = AllowSurrogates::Yes) const
			
 
				+    bool validate(AllowSurrogates allow_surrogates = AllowSurrogates::Yes) const // Yes/no overload for callers that do not need a byte count; surrogates are allowed by default.
			
 
				     {
			
 
				         size_t valid_bytes = 0;
			
 
				-        return validate(valid_bytes, surrogates);
			
 
				+        return validate(valid_bytes, allow_surrogates); // Delegates to the two-argument overload; the byte count it writes to the local is discarded.
			
 
				     }
			
 
				 
			
 
				-    constexpr bool validate(size_t& valid_bytes, AllowSurrogates surrogates = AllowSurrogates::Yes) const
			
 
				-    {
			
 
				-        valid_bytes = 0;
			
 
				-
			
 
				-        for (auto it = m_string.begin(); it != m_string.end(); ++it) {
			
 
				-            auto [byte_length, code_point, is_valid] = decode_leading_byte(static_cast<u8>(*it));
			
 
				-            if (!is_valid)
			
 
				-                return false;
			
 
				-
			
 
				-            for (size_t i = 1; i < byte_length; ++i) {
			
 
				-                if (++it == m_string.end())
			
 
				-                    return false;
			
 
				-
			
 
				-                auto [code_point_bits, is_valid] = decode_continuation_byte(static_cast<u8>(*it));
			
 
				-                if (!is_valid)
			
 
				-                    return false;
			
 
				-
			
 
				-                code_point <<= 6;
			
 
				-                code_point |= code_point_bits;
			
 
				-            }
			
 
				-
			
 
				-            if (!is_valid_code_point(code_point, byte_length, surrogates))
			
 
				-                return false;
			
 
				-
			
 
				-            valid_bytes += byte_length;
			
 
				-        }
			
 
				-
			
 
				-        return true;
			
 
				-    }
			
 
				+    bool validate(size_t& valid_bytes, AllowSurrogates allow_surrogates = AllowSurrogates::Yes) const;
			
 
				 
			
 
				 private:
			
 
				     friend class Utf8CodePointIterator;
			
@@ -198,36 +170,6 @@ private:
 
				         return { .is_valid = false };
			
 
				     }
			
 
				 
			
 
				-    struct ContinuationByte {
			
 
				-        u32 code_point_bits { 0 };
			
 
				-        bool is_valid { false };
			
 
				-    };
			
 
				-
			
 
				-    static constexpr ContinuationByte decode_continuation_byte(u8 byte)
			
 
				-    {
			
 
				-        constexpr u8 continuation_byte_encoding_bits = 0b1000'0000;
			
 
				-        constexpr u8 continuation_byte_encoding_mask = 0b1100'0000;
			
 
				-
			
 
				-        if ((byte & continuation_byte_encoding_mask) == continuation_byte_encoding_bits) {
			
 
				-            byte &= ~continuation_byte_encoding_mask;
			
 
				-            return { byte, true };
			
 
				-        }
			
 
				-
			
 
				-        return { .is_valid = false };
			
 
				-    }
			
 
				-
			
 
				-    static constexpr bool is_valid_code_point(u32 code_point, size_t byte_length, AllowSurrogates surrogates = AllowSurrogates::Yes)
			
 
				-    {
			
 
				-        if (surrogates == AllowSurrogates::No && byte_length == 3 && code_point >= 0xD800 && code_point <= 0xDFFF)
			
 
				-            return false;
			
 
				-        for (auto const& data : utf8_encoded_byte_data) {
			
 
				-            if (code_point >= data.first_code_point && code_point <= data.last_code_point)
			
 
				-                return byte_length == data.byte_length;
			
 
				-        }
			
 
				-
			
 
				-        return false;
			
 
				-    }
			
 
				-
			
 
				     StringView m_string;
			
 
				     mutable size_t m_length { 0 };
			
 
				     mutable bool m_have_length { false };