Utf8View.cpp 8.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289
  1. /*
  2. * Copyright (c) 2019-2020, Sergey Bugaev <bugaevc@serenityos.org>
  3. * Copyright (c) 2021, Max Wipfli <mail@maxwipfli.ch>
  4. *
  5. * SPDX-License-Identifier: BSD-2-Clause
  6. */
  7. #include <AK/Assertions.h>
  8. #include <AK/Format.h>
  9. #include <AK/Utf8View.h>
  10. namespace AK {
  11. Utf8CodePointIterator Utf8View::iterator_at_byte_offset(size_t byte_offset) const
  12. {
  13. size_t current_offset = 0;
  14. for (auto iterator = begin(); !iterator.done(); ++iterator) {
  15. if (current_offset >= byte_offset)
  16. return iterator;
  17. current_offset += iterator.underlying_code_point_length_in_bytes();
  18. }
  19. return end();
  20. }
  21. size_t Utf8View::byte_offset_of(const Utf8CodePointIterator& it) const
  22. {
  23. VERIFY(it.m_ptr >= begin_ptr());
  24. VERIFY(it.m_ptr <= end_ptr());
  25. return it.m_ptr - begin_ptr();
  26. }
  27. size_t Utf8View::byte_offset_of(size_t code_point_offset) const
  28. {
  29. size_t byte_offset = 0;
  30. for (auto it = begin(); !it.done(); ++it) {
  31. if (code_point_offset == 0)
  32. return byte_offset;
  33. byte_offset += it.underlying_code_point_length_in_bytes();
  34. --code_point_offset;
  35. }
  36. return byte_offset;
  37. }
  38. Utf8View Utf8View::unicode_substring_view(size_t code_point_offset, size_t code_point_length) const
  39. {
  40. if (code_point_length == 0)
  41. return {};
  42. size_t code_point_index = 0, offset_in_bytes = 0;
  43. for (auto iterator = begin(); !iterator.done(); ++iterator) {
  44. if (code_point_index == code_point_offset)
  45. offset_in_bytes = byte_offset_of(iterator);
  46. if (code_point_index == code_point_offset + code_point_length - 1) {
  47. size_t length_in_bytes = byte_offset_of(++iterator) - offset_in_bytes;
  48. return substring_view(offset_in_bytes, length_in_bytes);
  49. }
  50. ++code_point_index;
  51. }
  52. VERIFY_NOT_REACHED();
  53. }
  54. static inline bool decode_first_byte(
  55. unsigned char byte,
  56. size_t& out_code_point_length_in_bytes,
  57. u32& out_value)
  58. {
  59. if ((byte & 128) == 0) {
  60. out_value = byte;
  61. out_code_point_length_in_bytes = 1;
  62. return true;
  63. }
  64. if ((byte & 64) == 0) {
  65. return false;
  66. }
  67. if ((byte & 32) == 0) {
  68. out_value = byte & 31;
  69. out_code_point_length_in_bytes = 2;
  70. return true;
  71. }
  72. if ((byte & 16) == 0) {
  73. out_value = byte & 15;
  74. out_code_point_length_in_bytes = 3;
  75. return true;
  76. }
  77. if ((byte & 8) == 0) {
  78. out_value = byte & 7;
  79. out_code_point_length_in_bytes = 4;
  80. return true;
  81. }
  82. return false;
  83. }
  84. bool Utf8View::validate(size_t& valid_bytes) const
  85. {
  86. valid_bytes = 0;
  87. for (auto ptr = begin_ptr(); ptr < end_ptr(); ptr++) {
  88. size_t code_point_length_in_bytes;
  89. u32 value;
  90. bool first_byte_makes_sense = decode_first_byte(*ptr, code_point_length_in_bytes, value);
  91. if (!first_byte_makes_sense)
  92. return false;
  93. for (size_t i = 1; i < code_point_length_in_bytes; i++) {
  94. ptr++;
  95. if (ptr >= end_ptr())
  96. return false;
  97. if (*ptr >> 6 != 2)
  98. return false;
  99. }
  100. valid_bytes += code_point_length_in_bytes;
  101. }
  102. return true;
  103. }
  104. size_t Utf8View::calculate_length() const
  105. {
  106. size_t length = 0;
  107. for ([[maybe_unused]] auto code_point : *this) {
  108. ++length;
  109. }
  110. return length;
  111. }
  112. bool Utf8View::starts_with(const Utf8View& start) const
  113. {
  114. if (start.is_empty())
  115. return true;
  116. if (is_empty())
  117. return false;
  118. if (start.length() > length())
  119. return false;
  120. if (begin_ptr() == start.begin_ptr())
  121. return true;
  122. for (auto k = begin(), l = start.begin(); l != start.end(); ++k, ++l) {
  123. if (*k != *l)
  124. return false;
  125. }
  126. return true;
  127. }
  128. bool Utf8View::contains(u32 needle) const
  129. {
  130. for (u32 code_point : *this) {
  131. if (code_point == needle)
  132. return true;
  133. }
  134. return false;
  135. }
  136. Utf8View Utf8View::trim(const Utf8View& characters, TrimMode mode) const
  137. {
  138. size_t substring_start = 0;
  139. size_t substring_length = byte_length();
  140. if (mode == TrimMode::Left || mode == TrimMode::Both) {
  141. for (auto code_point = begin(); code_point != end(); ++code_point) {
  142. if (substring_length == 0)
  143. return {};
  144. if (!characters.contains(*code_point))
  145. break;
  146. substring_start += code_point.underlying_code_point_length_in_bytes();
  147. substring_length -= code_point.underlying_code_point_length_in_bytes();
  148. }
  149. }
  150. if (mode == TrimMode::Right || mode == TrimMode::Both) {
  151. size_t seen_whitespace_length = 0;
  152. for (auto code_point = begin(); code_point != end(); ++code_point) {
  153. if (characters.contains(*code_point))
  154. seen_whitespace_length += code_point.underlying_code_point_length_in_bytes();
  155. else
  156. seen_whitespace_length = 0;
  157. }
  158. if (seen_whitespace_length >= substring_length)
  159. return {};
  160. substring_length -= seen_whitespace_length;
  161. }
  162. return substring_view(substring_start, substring_length);
  163. }
  164. Utf8CodePointIterator& Utf8CodePointIterator::operator++()
  165. {
  166. VERIFY(m_length > 0);
  167. size_t code_point_length_in_bytes = underlying_code_point_length_in_bytes();
  168. if (code_point_length_in_bytes > m_length) {
  169. // We don't have enough data for the next code point. Skip one character and try again.
  170. // The rest of the code will output replacement characters as needed for any eventual extension bytes we might encounter afterwards.
  171. dbgln("Expected code point size {} is too big for the remaining length {}. Moving forward one byte.", code_point_length_in_bytes, m_length);
  172. m_ptr += 1;
  173. m_length -= 1;
  174. return *this;
  175. }
  176. m_ptr += code_point_length_in_bytes;
  177. m_length -= code_point_length_in_bytes;
  178. return *this;
  179. }
  180. size_t Utf8CodePointIterator::underlying_code_point_length_in_bytes() const
  181. {
  182. VERIFY(m_length > 0);
  183. size_t code_point_length_in_bytes = 0;
  184. u32 value;
  185. bool first_byte_makes_sense = decode_first_byte(*m_ptr, code_point_length_in_bytes, value);
  186. // If any of these tests fail, we will output a replacement character for this byte and treat it as a code point of size 1.
  187. if (!first_byte_makes_sense)
  188. return 1;
  189. if (code_point_length_in_bytes > m_length)
  190. return 1;
  191. for (size_t offset = 1; offset < code_point_length_in_bytes; offset++) {
  192. if (m_ptr[offset] >> 6 != 2)
  193. return 1;
  194. }
  195. return code_point_length_in_bytes;
  196. }
  197. ReadonlyBytes Utf8CodePointIterator::underlying_code_point_bytes() const
  198. {
  199. return { m_ptr, underlying_code_point_length_in_bytes() };
  200. }
  201. u32 Utf8CodePointIterator::operator*() const
  202. {
  203. VERIFY(m_length > 0);
  204. u32 code_point_value_so_far = 0;
  205. size_t code_point_length_in_bytes = 0;
  206. bool first_byte_makes_sense = decode_first_byte(m_ptr[0], code_point_length_in_bytes, code_point_value_so_far);
  207. if (!first_byte_makes_sense) {
  208. // The first byte of the code point doesn't make sense: output a replacement character
  209. dbgln("First byte doesn't make sense: {:#02x}.", m_ptr[0]);
  210. return 0xFFFD;
  211. }
  212. if (code_point_length_in_bytes > m_length) {
  213. // There is not enough data left for the full code point: output a replacement character
  214. dbgln("Not enough bytes (need {}, have {}), first byte is: {:#02x}.", code_point_length_in_bytes, m_length, m_ptr[0]);
  215. return 0xFFFD;
  216. }
  217. for (size_t offset = 1; offset < code_point_length_in_bytes; offset++) {
  218. if (m_ptr[offset] >> 6 != 2) {
  219. // One of the extension bytes of the code point doesn't make sense: output a replacement character
  220. dbgln("Extension byte {:#02x} in {} position after first byte {:#02x} doesn't make sense.", m_ptr[offset], offset, m_ptr[0]);
  221. return 0xFFFD;
  222. }
  223. code_point_value_so_far <<= 6;
  224. code_point_value_so_far |= m_ptr[offset] & 63;
  225. }
  226. return code_point_value_so_far;
  227. }
  228. Optional<u32> Utf8CodePointIterator::peek(size_t offset) const
  229. {
  230. if (offset == 0) {
  231. if (this->done())
  232. return {};
  233. return this->operator*();
  234. }
  235. auto new_iterator = *this;
  236. for (size_t index = 0; index < offset; ++index) {
  237. ++new_iterator;
  238. if (new_iterator.done())
  239. return {};
  240. }
  241. return *new_iterator;
  242. }
  243. }