mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2024-12-02 12:30:31 +00:00
AK: Compute UTF-8 code point lengths using only leading bytes
We don't need to decode the entire code point to know its length. This reduces the runtime of decoding a string containing 5 million instances of U+10FFFF from over 4 seconds to 0.9 seconds.
This commit is contained in:
parent
516d2f4892
commit
1d5b45f7d9
Notes:
sideshowbarker
2024-07-16 22:59:11 +09:00
Author: https://github.com/trflynn89 Commit: https://github.com/SerenityOS/serenity/commit/1d5b45f7d9 Pull-request: https://github.com/SerenityOS/serenity/pull/17838 Reviewed-by: https://github.com/kleinesfilmroellchen ✅
1 changed file with 7 additions and 2 deletions
|
@ -73,9 +73,14 @@ Utf8View Utf8View::unicode_substring_view(size_t code_point_offset, size_t code_
|
|||
size_t Utf8View::calculate_length() const
|
||||
{
|
||||
size_t length = 0;
|
||||
for ([[maybe_unused]] auto code_point : *this) {
|
||||
++length;
|
||||
|
||||
for (size_t i = 0; i < m_string.length(); ++length) {
|
||||
auto [byte_length, code_point, is_valid] = decode_leading_byte(static_cast<u8>(m_string[i]));
|
||||
|
||||
// Similar to Utf8CodePointIterator::operator++, if the byte is invalid, try the next byte.
|
||||
i += is_valid ? byte_length : 1;
|
||||
}
|
||||
|
||||
return length;
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in a new issue