TestUtf8.cpp 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297
  1. /*
  2. * Copyright (c) 2019-2020, Sergey Bugaev <bugaevc@serenityos.org>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <LibTest/TestCase.h>
  7. #include <AK/ByteBuffer.h>
  8. #include <AK/Utf8View.h>
  9. TEST_CASE(decode_ascii)
  10. {
  11. Utf8View utf8 { "Hello World!11"sv };
  12. EXPECT(utf8.validate());
  13. u32 expected[] = { 72, 101, 108, 108, 111, 32, 87, 111, 114, 108, 100, 33, 49, 49 };
  14. size_t expected_size = sizeof(expected) / sizeof(expected[0]);
  15. size_t i = 0;
  16. for (u32 code_point : utf8) {
  17. VERIFY(i < expected_size);
  18. EXPECT_EQ(code_point, expected[i]);
  19. i++;
  20. }
  21. EXPECT_EQ(i, expected_size);
  22. }
  23. TEST_CASE(decode_utf8)
  24. {
  25. Utf8View utf8 { "Привет, мир! 😀 γειά σου κόσμος こんにちは世界"sv };
  26. size_t valid_bytes;
  27. EXPECT(utf8.validate(valid_bytes));
  28. EXPECT(valid_bytes == (size_t)utf8.byte_length());
  29. u32 expected[] = { 1055, 1088, 1080, 1074, 1077, 1090, 44, 32, 1084, 1080, 1088, 33, 32, 128512, 32, 947, 949, 953, 940, 32, 963, 959, 965, 32, 954, 972, 963, 956, 959, 962, 32, 12371, 12435, 12395, 12385, 12399, 19990, 30028 };
  30. DeprecatedString expected_underlying_bytes[] = { "П", "р", "и", "в", "е", "т", ",", " ", "м", "и", "р", "!", " ", "😀", " ", "γ", "ε", "ι", "ά", " ", "σ", "ο", "υ", " ", "κ", "ό", "σ", "μ", "ο", "ς", " ", "こ", "ん", "に", "ち", "は", "世", "界" };
  31. size_t expected_size = sizeof(expected) / sizeof(expected[0]);
  32. size_t i = 0;
  33. for (auto it = utf8.begin(); it != utf8.end(); ++it) {
  34. u32 code_point = *it;
  35. VERIFY(i < expected_size);
  36. EXPECT_EQ(code_point, expected[i]);
  37. EXPECT_EQ(it.underlying_code_point_bytes(), expected_underlying_bytes[i].bytes());
  38. i++;
  39. }
  40. EXPECT_EQ(i, expected_size);
  41. }
  42. TEST_CASE(validate_invalid_ut8)
  43. {
  44. size_t valid_bytes;
  45. char invalid_utf8_1[] = { 42, 35, (char)182, 9 };
  46. Utf8View utf8_1 { StringView { invalid_utf8_1, 4 } };
  47. EXPECT(!utf8_1.validate(valid_bytes));
  48. EXPECT(valid_bytes == 2);
  49. char invalid_utf8_2[] = { 42, 35, (char)208, (char)208 };
  50. Utf8View utf8_2 { StringView { invalid_utf8_2, 4 } };
  51. EXPECT(!utf8_2.validate(valid_bytes));
  52. EXPECT(valid_bytes == 2);
  53. char invalid_utf8_3[] = { (char)208 };
  54. Utf8View utf8_3 { StringView { invalid_utf8_3, 1 } };
  55. EXPECT(!utf8_3.validate(valid_bytes));
  56. EXPECT(valid_bytes == 0);
  57. char invalid_utf8_4[] = { (char)208, 35 };
  58. Utf8View utf8_4 { StringView { invalid_utf8_4, 2 } };
  59. EXPECT(!utf8_4.validate(valid_bytes));
  60. EXPECT(valid_bytes == 0);
  61. char invalid_utf8_5[] = { (char)0xf4, (char)0x8f, (char)0xbf, (char)0xc0 }; // U+110000
  62. Utf8View utf8_5 { StringView { invalid_utf8_5, 4 } };
  63. EXPECT(!utf8_5.validate(valid_bytes));
  64. EXPECT(valid_bytes == 0);
  65. char invalid_utf8_6[] = { (char)0xf4, (char)0xa1, (char)0xb0, (char)0xbd }; // U+121c3d
  66. Utf8View utf8_6 { StringView { invalid_utf8_6, 4 } };
  67. EXPECT(!utf8_6.validate(valid_bytes));
  68. EXPECT(valid_bytes == 0);
  69. }
  70. TEST_CASE(validate_overlong_utf8)
  71. {
  72. size_t valid_bytes = 0;
  73. // Overlong 2-byte encoding of U+002F
  74. char invalid_utf8_1[] = { 42, 35, static_cast<char>(0xc0), static_cast<char>(0xaf) };
  75. Utf8View utf8_1 { StringView { invalid_utf8_1, sizeof(invalid_utf8_1) } };
  76. EXPECT(!utf8_1.validate(valid_bytes));
  77. EXPECT(valid_bytes == 2);
  78. // Overlong 3-byte encoding of U+002F
  79. char invalid_utf8_2[] = { 42, 35, static_cast<char>(0xe0), static_cast<char>(0x80), static_cast<char>(0xaf) };
  80. Utf8View utf8_2 { StringView { invalid_utf8_2, sizeof(invalid_utf8_2) } };
  81. EXPECT(!utf8_2.validate(valid_bytes));
  82. EXPECT(valid_bytes == 2);
  83. // Overlong 4-byte encoding of U+002F
  84. char invalid_utf8_3[] = { 42, 35, static_cast<char>(0xf0), static_cast<char>(0x80), static_cast<char>(0x80), static_cast<char>(0xaf) };
  85. Utf8View utf8_3 { StringView { invalid_utf8_3, sizeof(invalid_utf8_3) } };
  86. EXPECT(!utf8_3.validate(valid_bytes));
  87. EXPECT(valid_bytes == 2);
  88. // Overlong 3-byte encoding of U+00FF
  89. char invalid_utf8_4[] = { 42, 35, static_cast<char>(0xe0), static_cast<char>(0x83), static_cast<char>(0xbf) };
  90. Utf8View utf8_4 { StringView { invalid_utf8_4, sizeof(invalid_utf8_4) } };
  91. EXPECT(!utf8_4.validate(valid_bytes));
  92. EXPECT(valid_bytes == 2);
  93. // Overlong 4-byte encoding of U+00FF
  94. char invalid_utf8_5[] = { 42, 35, static_cast<char>(0xf0), static_cast<char>(0x80), static_cast<char>(0x83), static_cast<char>(0xbf) };
  95. Utf8View utf8_5 { StringView { invalid_utf8_5, sizeof(invalid_utf8_5) } };
  96. EXPECT(!utf8_5.validate(valid_bytes));
  97. EXPECT(valid_bytes == 2);
  98. // Overlong 4-byte encoding of U+0FFF
  99. char invalid_utf8_6[] = { 42, 35, static_cast<char>(0xf0), static_cast<char>(0x8f), static_cast<char>(0xbf), static_cast<char>(0xbf) };
  100. Utf8View utf8_6 { StringView { invalid_utf8_6, sizeof(invalid_utf8_6) } };
  101. EXPECT(!utf8_6.validate(valid_bytes));
  102. EXPECT(valid_bytes == 2);
  103. }
  104. TEST_CASE(iterate_utf8)
  105. {
  106. Utf8View view("Some weird characters \u00A9\u266A\uA755"sv);
  107. Utf8CodePointIterator iterator = view.begin();
  108. EXPECT(*iterator == 'S');
  109. EXPECT(iterator.peek().has_value() && iterator.peek().value() == 'S');
  110. EXPECT(iterator.peek(0).has_value() && iterator.peek(0).value() == 'S');
  111. EXPECT(iterator.peek(1).has_value() && iterator.peek(1).value() == 'o');
  112. EXPECT(iterator.peek(22).has_value() && iterator.peek(22).value() == 0x00A9);
  113. EXPECT(iterator.peek(24).has_value() && iterator.peek(24).value() == 0xA755);
  114. EXPECT(!iterator.peek(25).has_value());
  115. ++iterator;
  116. EXPECT(*iterator == 'o');
  117. EXPECT(iterator.peek(23).has_value() && iterator.peek(23).value() == 0xA755);
  118. for (size_t i = 0; i < 23; ++i)
  119. ++iterator;
  120. EXPECT(!iterator.done());
  121. EXPECT(*iterator == 0xA755);
  122. EXPECT(iterator.peek().has_value() && iterator.peek().value() == 0xA755);
  123. EXPECT(!iterator.peek(1).has_value());
  124. ++iterator;
  125. EXPECT(iterator.done());
  126. EXPECT(!iterator.peek(0).has_value());
  127. EXPECT_CRASH("Dereferencing Utf8CodePointIterator which is already done.", [&iterator] {
  128. *iterator;
  129. return Test::Crash::Failure::DidNotCrash;
  130. });
  131. }
  132. TEST_CASE(decode_invalid_ut8)
  133. {
  134. // Test case 1 : Getting an extension byte as first byte of the code point
  135. {
  136. char raw_data[] = { 'a', 'b', (char)0xA0, 'd' };
  137. Utf8View view { StringView { raw_data, 4 } };
  138. u32 expected_characters[] = { 'a', 'b', 0xFFFD, 'd' };
  139. DeprecatedString expected_underlying_bytes[] = { "a", "b", "\xA0", "d" };
  140. size_t expected_size = sizeof(expected_characters) / sizeof(expected_characters[0]);
  141. size_t i = 0;
  142. for (auto it = view.begin(); it != view.end(); ++it) {
  143. u32 code_point = *it;
  144. VERIFY(i < expected_size);
  145. EXPECT_EQ(code_point, expected_characters[i]);
  146. EXPECT_EQ(it.underlying_code_point_bytes(), expected_underlying_bytes[i].bytes());
  147. i++;
  148. }
  149. VERIFY(i == expected_size);
  150. }
  151. // Test case 2 : Getting a non-extension byte when an extension byte is expected
  152. {
  153. char raw_data[] = { 'a', 'b', (char)0xC0, 'd', 'e' };
  154. Utf8View view { StringView { raw_data, 5 } };
  155. u32 expected_characters[] = { 'a', 'b', 0xFFFD, 'd', 'e' };
  156. DeprecatedString expected_underlying_bytes[] = { "a", "b", "\xC0", "d", "e" };
  157. size_t expected_size = sizeof(expected_characters) / sizeof(expected_characters[0]);
  158. size_t i = 0;
  159. for (auto it = view.begin(); it != view.end(); ++it) {
  160. u32 code_point = *it;
  161. VERIFY(i < expected_size);
  162. EXPECT_EQ(code_point, expected_characters[i]);
  163. EXPECT_EQ(it.underlying_code_point_bytes(), expected_underlying_bytes[i].bytes());
  164. i++;
  165. }
  166. VERIFY(i == expected_size);
  167. }
  168. // Test case 3 : Not enough bytes before the end of the string
  169. {
  170. char raw_data[] = { 'a', 'b', (char)0x90, 'd' };
  171. Utf8View view { StringView { raw_data, 4 } };
  172. u32 expected_characters[] = { 'a', 'b', 0xFFFD, 'd' };
  173. DeprecatedString expected_underlying_bytes[] = { "a", "b", "\x90", "d" };
  174. size_t expected_size = sizeof(expected_characters) / sizeof(expected_characters[0]);
  175. size_t i = 0;
  176. for (auto it = view.begin(); it != view.end(); ++it) {
  177. u32 code_point = *it;
  178. VERIFY(i < expected_size);
  179. EXPECT_EQ(code_point, expected_characters[i]);
  180. EXPECT_EQ(it.underlying_code_point_bytes(), expected_underlying_bytes[i].bytes());
  181. i++;
  182. }
  183. VERIFY(i == expected_size);
  184. }
  185. // Test case 4 : Not enough bytes at the end of the string
  186. {
  187. char raw_data[] = { 'a', 'b', 'c', (char)0x90 };
  188. Utf8View view { StringView { raw_data, 4 } };
  189. u32 expected_characters[] = { 'a', 'b', 'c', 0xFFFD };
  190. DeprecatedString expected_underlying_bytes[] = { "a", "b", "c", "\x90" };
  191. size_t expected_size = sizeof(expected_characters) / sizeof(expected_characters[0]);
  192. size_t i = 0;
  193. for (auto it = view.begin(); it != view.end(); ++it) {
  194. u32 code_point = *it;
  195. VERIFY(i < expected_size);
  196. EXPECT_EQ(code_point, expected_characters[i]);
  197. EXPECT_EQ(it.underlying_code_point_bytes(), expected_underlying_bytes[i].bytes());
  198. i++;
  199. }
  200. VERIFY(i == expected_size);
  201. }
  202. // Test case 5 : Oversized four-byte sequence (e.g. U+123456)
  203. {
  204. // Want to encode: (000)1 0010 0011 0100 0101 0110
  205. // Into mask: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
  206. // Shifted: 100 100011 010001 010110
  207. // Result: 11110100 10100011 10010001 10010110
  208. char raw_data[] = { 'a', (char)0xF4, (char)0xA3, (char)0x91, (char)0x96, 'b' };
  209. Utf8View view { StringView { raw_data, 6 } };
  210. // This definition seems to suggest that we should instead output multiple replacement characters:
  211. // https://encoding.spec.whatwg.org/#ref-for-concept-stream-prepend②
  212. // This is supported by the plaintext description and example collection, which annoyingly does not give an example of how to deal with this:
  213. // https://www.unicode.org/versions/Unicode14.0.0/ch03.pdf , section "U+FFFD Substitution of Maximal Subparts"
  214. // However, that would go against how we deal with several other kinds of errors, so we stick to emitting only one U+FFFD.
  215. u32 expected_characters[] = { 'a', 0xFFFD, 'b' };
  216. DeprecatedString expected_underlying_bytes[] = { "a", "\xF4\xA3\x91\x96", "b" };
  217. size_t expected_size = sizeof(expected_characters) / sizeof(expected_characters[0]);
  218. size_t i = 0;
  219. for (auto it = view.begin(); it != view.end(); ++it) {
  220. u32 code_point = *it;
  221. VERIFY(i < expected_size);
  222. EXPECT_EQ(code_point, expected_characters[i]);
  223. EXPECT_EQ(it.underlying_code_point_bytes(), expected_underlying_bytes[i].bytes());
  224. i++;
  225. }
  226. VERIFY(i == expected_size);
  227. }
  228. }
  229. TEST_CASE(trim)
  230. {
  231. Utf8View whitespace { " "sv };
  232. {
  233. Utf8View view { "word"sv };
  234. EXPECT_EQ(view.trim(whitespace, TrimMode::Both).as_string(), "word");
  235. EXPECT_EQ(view.trim(whitespace, TrimMode::Left).as_string(), "word");
  236. EXPECT_EQ(view.trim(whitespace, TrimMode::Right).as_string(), "word");
  237. }
  238. {
  239. Utf8View view { " word"sv };
  240. EXPECT_EQ(view.trim(whitespace, TrimMode::Both).as_string(), "word");
  241. EXPECT_EQ(view.trim(whitespace, TrimMode::Left).as_string(), "word");
  242. EXPECT_EQ(view.trim(whitespace, TrimMode::Right).as_string(), " word");
  243. }
  244. {
  245. Utf8View view { "word "sv };
  246. EXPECT_EQ(view.trim(whitespace, TrimMode::Both).as_string(), "word");
  247. EXPECT_EQ(view.trim(whitespace, TrimMode::Left).as_string(), "word ");
  248. EXPECT_EQ(view.trim(whitespace, TrimMode::Right).as_string(), "word");
  249. }
  250. {
  251. Utf8View view { " word "sv };
  252. EXPECT_EQ(view.trim(whitespace, TrimMode::Both).as_string(), "word");
  253. EXPECT_EQ(view.trim(whitespace, TrimMode::Left).as_string(), "word ");
  254. EXPECT_EQ(view.trim(whitespace, TrimMode::Right).as_string(), " word");
  255. }
  256. {
  257. Utf8View view { "\u180E"sv };
  258. EXPECT_EQ(view.trim(whitespace, TrimMode::Both).as_string(), "\u180E");
  259. EXPECT_EQ(view.trim(whitespace, TrimMode::Left).as_string(), "\u180E");
  260. EXPECT_EQ(view.trim(whitespace, TrimMode::Right).as_string(), "\u180E");
  261. }
  262. }