2020-01-18 08:38:21 +00:00
/*
2020-01-24 13:45:29 +00:00
* Copyright ( c ) 2019 - 2020 , Sergey Bugaev < bugaevc @ serenityos . org >
2021-05-17 19:04:37 +00:00
* Copyright ( c ) 2021 , Max Wipfli < mail @ maxwipfli . ch >
2020-01-18 08:38:21 +00:00
*
2021-04-22 08:24:48 +00:00
* SPDX - License - Identifier : BSD - 2 - Clause
2020-01-18 08:38:21 +00:00
*/
2019-08-27 21:57:15 +00:00
# pragma once
2024-06-17 22:12:53 +00:00
# include <AK/ByteString.h>
2024-10-14 05:48:08 +00:00
# include <AK/Debug.h>
2023-02-20 13:08:40 +00:00
# include <AK/Format.h>
2019-08-27 21:57:15 +00:00
# include <AK/StringView.h>
# include <AK/Types.h>
namespace AK {
class Utf8View ;
2021-06-01 07:45:52 +00:00
// Iterator over the code points encoded in a run of UTF-8 bytes.
// The state is a pointer to the current byte (m_ptr) plus the number of bytes that remain (m_length).
// Only Utf8View (a friend) can create a positioned iterator, through the private constructor below;
// a default-constructed iterator has a null pointer and zero remaining bytes, so done() is true for it.
class Utf8CodePointIterator {
2019-08-27 21:57:15 +00:00
friend class Utf8View ;
public :
2021-06-01 07:45:52 +00:00
Utf8CodePointIterator ( ) = default ;
~ Utf8CodePointIterator ( ) = default ;
2019-08-27 21:57:15 +00:00
2021-09-18 16:19:21 +00:00
// Defaulted comparisons: two iterators are equal when both m_ptr and m_length are equal.
bool operator = = ( Utf8CodePointIterator const & ) const = default ;
bool operator ! = ( Utf8CodePointIterator const & ) const = default ;
2021-06-01 07:45:52 +00:00
// Advances past the current code point. VERIFYs that at least one byte remains.
Utf8CodePointIterator & operator + + ( ) ;
2019-08-27 21:57:15 +00:00
// Decodes the code point at the current position. VERIFYs that at least one byte remains.
// Sequences that cannot be decoded yield 0xFFFD (REPLACEMENT CHARACTER).
u32 operator * ( ) const ;
2021-05-23 22:29:16 +00:00
// NOTE: This returns {} if the peek is at or past EOF.
Optional < u32 > peek ( size_t offset = 0 ) const ;
2019-08-27 21:57:15 +00:00
2022-04-01 17:58:27 +00:00
// Signed distance between the two iterators' positions, measured in bytes (it is a raw pointer difference),
// not in code points.
ssize_t operator - ( Utf8CodePointIterator const & other ) const
2020-10-20 15:47:34 +00:00
{
return m_ptr - other . m_ptr ;
}
2023-03-22 15:02:15 +00:00
// Pointer to the byte the iterator currently points at.
u8 const * ptr ( ) const { return m_ptr ; }
2021-05-30 16:52:24 +00:00
// NOTE: These methods return the information about the underlying UTF-8 bytes.
// If the UTF-8 string encoding is not valid at the iterator's position, then the underlying bytes might be different from the
// decoded character's re-encoded bytes (which will be an `0xFFFD REPLACEMENT CHARACTER` with an UTF-8 length of three bytes).
// If your code relies on the decoded character being equivalent to the re-encoded character, use the `Utf8View::validate()`
// method on the view prior to using its iterator.
size_t underlying_code_point_length_in_bytes ( ) const ;
2024-10-14 05:48:08 +00:00
// The bytes starting at the current position, underlying_code_point_length_in_bytes() long.
ReadonlyBytes underlying_code_point_bytes ( ) const { return { m_ptr , underlying_code_point_length_in_bytes ( ) } ; }
2021-01-01 23:28:24 +00:00
// True once no bytes remain. operator++ and operator* must not be called in that state (they VERIFY m_length > 0).
bool done ( ) const { return m_length = = 0 ; }
2019-10-18 20:49:23 +00:00
2019-08-27 21:57:15 +00:00
private :
2021-09-18 16:19:21 +00:00
// Positions the iterator at `ptr` with `length` bytes remaining. Used by Utf8View.
Utf8CodePointIterator ( u8 const * ptr , size_t length )
: m_ptr ( ptr )
, m_length ( length )
{
}
// Current position within the byte sequence.
u8 const * m_ptr { nullptr } ;
// Number of bytes from m_ptr to the end of the sequence.
size_t m_length { 0 } ;
2019-08-27 21:57:15 +00:00
} ;
// A view that interprets the bytes of a StringView (m_string) as UTF-8 and exposes them code point by code point.
// The view stores only the StringView plus a lazily computed, cached length.
class Utf8View {
public :
2021-06-01 07:45:52 +00:00
using Iterator = Utf8CodePointIterator ;
2020-10-20 15:47:34 +00:00
2021-01-10 23:29:28 +00:00
Utf8View ( ) = default ;
2021-09-18 16:02:41 +00:00
2022-01-16 18:33:16 +00:00
explicit constexpr Utf8View ( StringView string )
2021-09-18 16:02:41 +00:00
: m_string ( string )
{
}
2023-12-16 14:19:34 +00:00
// Views the bytes of an existing ByteString through its view().
explicit Utf8View ( ByteString & string )
2023-03-03 14:18:14 +00:00
: m_string ( string . view ( ) )
{
}
2019-08-27 21:57:15 +00:00
2023-12-16 14:19:34 +00:00
// NOTE(review): presumably deleted so that a view cannot be created over a temporary ByteString - confirm intent.
explicit Utf8View ( ByteString & & ) = delete ;
2023-03-03 14:18:14 +00:00
2024-06-07 14:25:39 +00:00
// Option passed to validate().
// NOTE(review): presumably selects whether surrogate code points are accepted; the implementation is not in this header.
enum class AllowSurrogates {
Yes ,
No ,
} ;
2023-03-03 14:18:14 +00:00
~ Utf8View ( ) = default ;
2021-09-04 14:53:43 +00:00
2021-11-10 23:55:02 +00:00
// The underlying bytes as a StringView.
StringView as_string ( ) const { return m_string ; }
2019-08-27 21:57:15 +00:00
2021-09-18 16:19:21 +00:00
// begin(): iterator at the first byte with the whole byte length remaining.
// end(): iterator one past the last byte with zero bytes remaining.
Utf8CodePointIterator begin ( ) const { return { begin_ptr ( ) , m_string . length ( ) } ; }
Utf8CodePointIterator end ( ) const { return { end_ptr ( ) , 0 } ; }
2021-06-01 07:45:52 +00:00
Utf8CodePointIterator iterator_at_byte_offset ( size_t ) const ;
2019-08-27 21:57:15 +00:00
2022-11-24 13:57:20 +00:00
Utf8CodePointIterator iterator_at_byte_offset_without_validation ( size_t ) const ;
2022-04-01 17:58:27 +00:00
// Pointer to the first byte of the view.
unsigned char const * bytes ( ) const { return begin_ptr ( ) ; }
2021-05-17 18:59:15 +00:00
// Size of the view in bytes (not code points).
size_t byte_length ( ) const { return m_string . length ( ) ; }
2024-10-14 05:48:08 +00:00
// Byte offset of `it` from the start of this view.
// VERIFYs that the iterator's position lies within [begin_ptr(), end_ptr()].
[ [ nodiscard ] ] size_t byte_offset_of ( Utf8CodePointIterator const & it ) const
{
VERIFY ( it . m_ptr > = begin_ptr ( ) ) ;
VERIFY ( it . m_ptr < = end_ptr ( ) ) ;
return it . m_ptr - begin_ptr ( ) ;
}
2021-08-16 14:27:26 +00:00
size_t byte_offset_of ( size_t code_point_offset ) const ;
2021-05-17 19:04:37 +00:00
2021-09-18 16:19:21 +00:00
// Sub-views addressed in bytes. The one-argument overload extends to the end of the view.
Utf8View substring_view ( size_t byte_offset , size_t byte_length ) const { return Utf8View { m_string . substring_view ( byte_offset , byte_length ) } ; }
2021-05-17 19:04:37 +00:00
Utf8View substring_view ( size_t byte_offset ) const { return substring_view ( byte_offset , byte_length ( ) - byte_offset ) ; }
2021-06-01 08:01:11 +00:00
// Sub-views addressed in code points. The one-argument overload passes length() - code_point_offset as the length.
Utf8View unicode_substring_view ( size_t code_point_offset , size_t code_point_length ) const ;
Utf8View unicode_substring_view ( size_t code_point_offset ) const { return unicode_substring_view ( code_point_offset , length ( ) - code_point_offset ) ; }
2019-09-04 20:40:36 +00:00
2021-05-17 19:04:37 +00:00
bool is_empty ( ) const { return m_string . is_empty ( ) ; }
2021-07-18 00:31:18 +00:00
bool is_null ( ) const { return m_string . is_null ( ) ; }
2022-04-01 17:58:27 +00:00
bool starts_with ( Utf8View const & ) const ;
2021-06-16 11:17:03 +00:00
bool contains ( u32 ) const ;
2022-04-01 17:58:27 +00:00
Utf8View trim ( Utf8View const & characters , TrimMode mode = TrimMode : : Both ) const ;
2021-03-21 20:31:15 +00:00
2022-04-01 17:58:27 +00:00
// Same as byte_offset_of(it).
size_t iterator_offset ( Utf8CodePointIterator const & it ) const
2020-12-28 22:51:24 +00:00
{
return byte_offset_of ( it ) ;
}
2020-10-20 15:47:34 +00:00
// Returns the value of calculate_length(), computed on first use and cached in the mutable members below.
// NOTE(review): presumably the number of code points (it is used as such by unicode_substring_view) - confirm in calculate_length().
size_t length ( ) const
{
if ( ! m_have_length ) {
m_length = calculate_length ( ) ;
m_have_length = true ;
}
return m_length ;
}
2020-05-17 11:02:27 +00:00
2024-07-16 16:54:56 +00:00
// Convenience overload of validate() for callers that do not need the valid byte count.
bool validate ( AllowSurrogates allow_surrogates = AllowSurrogates : : Yes ) const
2023-03-03 12:56:24 +00:00
{
size_t valid_bytes = 0 ;
2024-07-16 16:54:56 +00:00
return validate ( valid_bytes , allow_surrogates ) ;
2023-03-03 12:56:24 +00:00
}
2024-07-16 16:54:56 +00:00
bool validate ( size_t & valid_bytes , AllowSurrogates allow_surrogates = AllowSurrogates : : Yes ) const ;
2023-03-03 12:56:24 +00:00
2019-08-27 21:57:15 +00:00
private :
2023-03-03 12:56:24 +00:00
friend class Utf8CodePointIterator ;
2023-03-07 14:28:21 +00:00
// First byte of the view, and one past its last byte.
u8 const * begin_ptr ( ) const { return reinterpret_cast < u8 const * > ( m_string . characters_without_null_termination ( ) ) ; }
2021-09-18 16:19:21 +00:00
u8 const * end_ptr ( ) const { return begin_ptr ( ) + m_string . length ( ) ; }
2020-10-20 15:47:34 +00:00
size_t calculate_length ( ) const ;
2019-08-27 21:57:15 +00:00
2023-03-03 12:56:24 +00:00
// Describes one UTF-8 sequence length: the marker bits of its leading byte, the mask that selects
// those marker bits, and the first/last code point listed for that length.
struct Utf8EncodedByteData {
size_t byte_length { 0 } ;
u8 encoding_bits { 0 } ;
u8 encoding_mask { 0 } ;
2023-03-03 13:40:12 +00:00
u32 first_code_point { 0 } ;
u32 last_code_point { 0 } ;
2023-03-03 12:56:24 +00:00
} ;
// One row per sequence length, from 1 byte to 4 bytes.
static constexpr Array < Utf8EncodedByteData , 4 > utf8_encoded_byte_data { {
2023-03-03 13:40:12 +00:00
{ 1 , 0 b0000 ' 0000 , 0 b1000 ' 0000 , 0x0000 , 0x007F } ,
{ 2 , 0 b1100 ' 0000 , 0 b1110 ' 0000 , 0x0080 , 0x07FF } ,
{ 3 , 0 b1110 ' 0000 , 0 b1111 ' 0000 , 0x0800 , 0xFFFF } ,
{ 4 , 0 b1111 ' 0000 , 0 b1111 ' 1000 , 0x10000 , 0x10FFFF } ,
2023-03-03 12:56:24 +00:00
} } ;
// Result of decode_leading_byte(): the sequence length announced by the leading byte, the payload bits
// carried by the leading byte itself, and whether the byte matched any row of the table.
struct LeadingByte {
size_t byte_length { 0 } ;
u32 code_point_bits { 0 } ;
bool is_valid { false } ;
} ;
// Classifies `byte` as the first byte of a UTF-8 sequence.
// The table rows are tried in order; for the first row whose marker bits match, the marker bits are cleared
// and the remaining payload bits are returned together with the row's byte_length.
// A byte that matches no row (for example 0b10xx'xxxx or 0b1111'1xxx) yields is_valid = false.
static constexpr LeadingByte decode_leading_byte ( u8 byte )
{
for ( auto const & data : utf8_encoded_byte_data ) {
if ( ( byte & data . encoding_mask ) ! = data . encoding_bits )
continue ;
byte & = ~ data . encoding_mask ;
return { data . byte_length , byte , true } ;
}
return { . is_valid = false } ;
}
2019-08-27 21:57:15 +00:00
// The bytes being viewed.
StringView m_string ;
2020-10-20 15:47:34 +00:00
// Cache for length(); mutable so that the const accessor can fill it in on first use.
mutable size_t m_length { 0 } ;
mutable bool m_have_length { false } ;
2019-08-27 21:57:15 +00:00
} ;
2023-01-27 15:26:57 +00:00
// Code point iterator that stores its own ByteString (m_string) together with a Utf8CodePointIterator (m_it)
// positioned inside that string.
class DeprecatedStringCodePointIterator {
public :
// Returns the code point at the current position and advances past it, or {} once the iterator is done.
Optional < u32 > next ( )
{
if ( m_it . done ( ) )
return { } ;
auto value = * m_it ;
+ + m_it ;
return value ;
}
2023-01-28 20:07:48 +00:00
// Returns the code point at the current position without advancing, or {} once the iterator is done.
[ [ nodiscard ] ] Optional < u32 > peek ( ) const
{
if ( m_it . done ( ) )
return { } ;
return * m_it ;
}
// Byte offset of the current position from the start of m_string.
[ [ nodiscard ] ] size_t byte_offset ( ) const
{
return Utf8View ( m_string ) . byte_offset_of ( m_it ) ;
}
2023-12-16 14:19:34 +00:00
// Takes the string by value and moves it into m_string, then positions m_it at its beginning.
// m_it is initialized from m_string, which relies on m_string being declared before m_it below.
DeprecatedStringCodePointIterator ( ByteString string )
2023-01-27 15:26:57 +00:00
: m_string ( move ( string ) )
, m_it ( Utf8View ( m_string ) . begin ( ) )
{
}
private :
2023-12-16 14:19:34 +00:00
ByteString m_string ;
2023-01-27 15:26:57 +00:00
Utf8CodePointIterator m_it ;
} ;
2023-02-20 13:08:40 +00:00
// Formatter specialization for Utf8View, derived from Formatter<StringView>.
// format() is only declared here; its definition is not visible in this header.
template < >
struct Formatter < Utf8View > : Formatter < StringView > {
ErrorOr < void > format ( FormatBuilder & , Utf8View const & ) ;
} ;
2024-10-14 05:48:08 +00:00
// Advances the iterator past the code point at the current position and returns *this.
// The iterator must not be done: at least one byte has to remain (enforced with VERIFY).
// Malformed input never stalls the iteration, because a byte that does not start a complete,
// well-formed sequence is skipped on its own.
inline Utf8CodePointIterator& Utf8CodePointIterator::operator++()
{
    VERIFY(m_length > 0);

    // OPTIMIZATION: Fast path for ASCII characters.
    if (*m_ptr <= 0x7F) {
        m_ptr += 1;
        m_length -= 1;
        return *this;
    }

    // underlying_code_point_length_in_bytes() already accounts for truncated and malformed sequences:
    // it reports 1 for those and otherwise a length it has checked against m_length. The result can
    // therefore never exceed the remaining bytes, so no separate "not enough data" fallback is needed here.
    size_t const code_point_length_in_bytes = underlying_code_point_length_in_bytes();
    VERIFY(code_point_length_in_bytes <= m_length);

    m_ptr += code_point_length_in_bytes;
    m_length -= code_point_length_in_bytes;
    return *this;
}
// Returns how many bytes the code point at the current position occupies in the underlying data.
// The iterator must not be done (enforced with VERIFY).
// A byte that does not begin a complete, well-formed sequence counts as a sequence of length 1;
// dereferencing the iterator there produces a replacement character for that single byte.
inline size_t Utf8CodePointIterator::underlying_code_point_length_in_bytes() const
{
    VERIFY(m_length > 0);

    auto const leading_byte = Utf8View::decode_leading_byte(*m_ptr);

    // Not a leading byte at all, or the sequence it announces would run past the end of the data.
    if (!leading_byte.is_valid || leading_byte.byte_length > m_length)
        return 1;

    // Every byte after the first has to be a continuation byte, i.e. of the form 0b10xx'xxxx.
    u8 const* const sequence_end = m_ptr + leading_byte.byte_length;
    for (u8 const* continuation = m_ptr + 1; continuation < sequence_end; ++continuation) {
        if ((*continuation & 0b1100'0000) != 0b1000'0000)
            return 1;
    }

    return leading_byte.byte_length;
}
// Decodes and returns the code point at the current position without advancing.
// The iterator must not be done (enforced with VERIFY).
// U+FFFD (REPLACEMENT CHARACTER) is returned when the first byte is not a valid leading byte, when fewer
// bytes remain than the leading byte announces, when a following byte is not a continuation byte, or when
// the decoded value is greater than 0x10FFFF.
inline u32 Utf8CodePointIterator::operator*() const
{
    VERIFY(m_length > 0);

    constexpr u32 replacement_character = 0xFFFD;
    u8 const first_byte = m_ptr[0];

    // OPTIMIZATION: Fast path for ASCII characters.
    if (first_byte < 0x80)
        return first_byte;

    auto const leading_byte = Utf8View::decode_leading_byte(first_byte);
    size_t const sequence_length = leading_byte.byte_length;

    if (!leading_byte.is_valid) {
        // The first byte cannot start any UTF-8 sequence.
        dbgln_if(UTF8_DEBUG, " First byte doesn't make sense: {:#02x}. ", first_byte);
        return replacement_character;
    }

    if (sequence_length > m_length) {
        // The data ends before the announced sequence is complete.
        dbgln_if(UTF8_DEBUG, " Not enough bytes (need {}, have {}), first byte is: {:#02x}. ", sequence_length, m_length, first_byte);
        return replacement_character;
    }

    // Start from the payload bits of the leading byte and append six bits per continuation byte.
    u32 code_point = leading_byte.code_point_bits;
    for (size_t offset = 1; offset < sequence_length; ++offset) {
        u8 const extension_byte = m_ptr[offset];
        if ((extension_byte & 0b1100'0000) != 0b1000'0000) {
            // Continuation bytes must have the form 0b10xx'xxxx.
            dbgln_if(UTF8_DEBUG, " Extension byte {:#02x} in {} position after first byte {:#02x} doesn't make sense. ", extension_byte, offset, first_byte);
            return replacement_character;
        }
        code_point = (code_point << 6) | (extension_byte & 0b0011'1111);
    }

    if (code_point <= 0x10FFFF)
        return code_point;

    dbgln_if(UTF8_DEBUG, " Multi-byte sequence is otherwise valid, but code point {:#x} is not permissible. ", code_point);
    return replacement_character;
}
2019-08-27 21:57:15 +00:00
}
2022-11-26 11:18:30 +00:00
# if USING_AK_GLOBALLY
2023-01-27 15:26:57 +00:00
using AK : : DeprecatedStringCodePointIterator ;
2021-06-01 07:45:52 +00:00
using AK : : Utf8CodePointIterator ;
2019-08-27 21:57:15 +00:00
using AK : : Utf8View ;
2022-11-26 11:18:30 +00:00
# endif