TestUnicodeCharacterTypes.cpp 34 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818
  1. /*
  2. * Copyright (c) 2021, Tim Flynn <trflynn89@serenityos.org>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <LibTest/TestCase.h>
  7. #include <AK/StringView.h>
  8. #include <LibUnicode/CharacterTypes.h>
  9. #include <ctype.h>
  10. static void compare_to_ascii(auto& old_function, auto& new_function)
  11. {
  12. i64 result1 = 0;
  13. i64 result2 = 0;
  14. for (u32 i = 0; i < 0x80; ++i) {
  15. EXPECT_EQ(result1 = old_function(i), result2 = new_function(i));
  16. if (result1 != result2)
  17. dbgln("Function input value was {}.", i);
  18. }
  19. }
  20. TEST_CASE(to_unicode_lowercase)
  21. {
  22. compare_to_ascii(tolower, Unicode::to_unicode_lowercase);
  23. EXPECT_EQ(Unicode::to_unicode_lowercase(0x03c9u), 0x03c9u); // "ω" to "ω"
  24. EXPECT_EQ(Unicode::to_unicode_lowercase(0x03a9u), 0x03c9u); // "Ω" to "ω"
  25. // Code points encoded by ranges in UnicodeData.txt
  26. EXPECT_EQ(Unicode::to_unicode_lowercase(0x3400u), 0x3400u);
  27. EXPECT_EQ(Unicode::to_unicode_lowercase(0x3401u), 0x3401u);
  28. EXPECT_EQ(Unicode::to_unicode_lowercase(0x3402u), 0x3402u);
  29. EXPECT_EQ(Unicode::to_unicode_lowercase(0x4dbfu), 0x4dbfu);
  30. }
  31. TEST_CASE(to_unicode_uppercase)
  32. {
  33. compare_to_ascii(toupper, Unicode::to_unicode_uppercase);
  34. EXPECT_EQ(Unicode::to_unicode_uppercase(0x03c9u), 0x03a9u); // "ω" to "Ω"
  35. EXPECT_EQ(Unicode::to_unicode_uppercase(0x03a9u), 0x03a9u); // "Ω" to "Ω"
  36. // Code points encoded by ranges in UnicodeData.txt
  37. EXPECT_EQ(Unicode::to_unicode_uppercase(0x3400u), 0x3400u);
  38. EXPECT_EQ(Unicode::to_unicode_uppercase(0x3401u), 0x3401u);
  39. EXPECT_EQ(Unicode::to_unicode_uppercase(0x3402u), 0x3402u);
  40. EXPECT_EQ(Unicode::to_unicode_uppercase(0x4dbfu), 0x4dbfu);
  41. // Code points whose uppercase and titlecase mappings actually differ.
  42. EXPECT_EQ(Unicode::to_unicode_uppercase(0x01c6u), 0x01c4u); // "dž" to "DŽ"
  43. EXPECT_EQ(Unicode::to_unicode_uppercase(0x01c9u), 0x01c7u); // "lj" to "LJ"
  44. EXPECT_EQ(Unicode::to_unicode_uppercase(0x01ccu), 0x01cau); // "nj" to "NJ"
  45. EXPECT_EQ(Unicode::to_unicode_uppercase(0x01f3u), 0x01f1u); // "dz" to "DZ"
  46. }
  47. TEST_CASE(to_unicode_titlecase)
  48. {
  49. compare_to_ascii(toupper, Unicode::to_unicode_titlecase);
  50. EXPECT_EQ(Unicode::to_unicode_titlecase(0x03c9u), 0x03a9u); // "ω" to "Ω"
  51. EXPECT_EQ(Unicode::to_unicode_titlecase(0x03a9u), 0x03a9u); // "Ω" to "Ω"
  52. // Code points encoded by ranges in UnicodeData.txt
  53. EXPECT_EQ(Unicode::to_unicode_titlecase(0x3400u), 0x3400u);
  54. EXPECT_EQ(Unicode::to_unicode_titlecase(0x3401u), 0x3401u);
  55. EXPECT_EQ(Unicode::to_unicode_titlecase(0x3402u), 0x3402u);
  56. EXPECT_EQ(Unicode::to_unicode_titlecase(0x4dbfu), 0x4dbfu);
  57. // Code points whose uppercase and titlecase mappings actually differ.
  58. EXPECT_EQ(Unicode::to_unicode_titlecase(0x01c6u), 0x01c5u); // "dž" to "Dž"
  59. EXPECT_EQ(Unicode::to_unicode_titlecase(0x01c9u), 0x01c8u); // "lj" to "Lj"
  60. EXPECT_EQ(Unicode::to_unicode_titlecase(0x01ccu), 0x01cbu); // "nj" to "Nj"
  61. EXPECT_EQ(Unicode::to_unicode_titlecase(0x01f3u), 0x01f2u); // "dz" to "Dz"
  62. EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full(""sv)), ""sv);
  63. EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full(" "sv)), " "sv);
  64. EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full(" - "sv)), " - "sv);
  65. EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("a"sv)), "A"sv);
  66. EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("A"sv)), "A"sv);
  67. EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full(" a"sv)), " A"sv);
  68. EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("a "sv)), "A "sv);
  69. EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("ab"sv)), "Ab"sv);
  70. EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("Ab"sv)), "Ab"sv);
  71. EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("aB"sv)), "Ab"sv);
  72. EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("AB"sv)), "Ab"sv);
  73. EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full(" ab"sv)), " Ab"sv);
  74. EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("ab "sv)), "Ab "sv);
  75. EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("foo bar baz"sv)), "Foo Bar Baz"sv);
  76. EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("foo \n \r bar \t baz"sv)), "Foo \n \r Bar \t Baz"sv);
  77. EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("f\"oo\" b'ar'"sv)), "F\"Oo\" B'ar'"sv);
  78. EXPECT_EQ(MUST(Unicode::to_unicode_titlecase_full("123dollars"sv)), "123Dollars"sv);
  79. }
  80. TEST_CASE(to_unicode_casefold)
  81. {
  82. for (u8 code_point = 0; code_point < 0x80; ++code_point) {
  83. auto ascii = tolower(code_point);
  84. auto unicode = MUST(Unicode::to_unicode_casefold_full({ reinterpret_cast<char const*>(&code_point), 1 }));
  85. EXPECT_EQ(unicode.bytes_as_string_view().length(), 1u);
  86. EXPECT_EQ(unicode.bytes_as_string_view()[0], ascii);
  87. }
  88. // LATIN SMALL LETTER SHARP S
  89. auto result = MUST(Unicode::to_unicode_casefold_full("\u00DF"sv));
  90. EXPECT_EQ(result, "\u0073\u0073"sv);
  91. // GREEK SMALL LETTER ALPHA WITH YPOGEGRAMMENI
  92. result = MUST(Unicode::to_unicode_casefold_full("\u1FB3"sv));
  93. EXPECT_EQ(result, "\u03B1\u03B9"sv);
  94. // GREEK SMALL LETTER ALPHA WITH PERISPOMENI
  95. result = MUST(Unicode::to_unicode_casefold_full("\u1FB6"sv));
  96. EXPECT_EQ(result, "\u03B1\u0342"sv);
  97. // GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI
  98. result = MUST(Unicode::to_unicode_casefold_full("\u1FB7"sv));
  99. EXPECT_EQ(result, "\u03B1\u0342\u03B9"sv);
  100. }
  101. TEST_CASE(to_unicode_lowercase_unconditional_special_casing)
  102. {
  103. // LATIN SMALL LETTER SHARP S
  104. auto result = MUST(Unicode::to_unicode_lowercase_full("\u00DF"sv));
  105. EXPECT_EQ(result, "\u00DF");
  106. // LATIN CAPITAL LETTER I WITH DOT ABOVE
  107. result = MUST(Unicode::to_unicode_lowercase_full("\u0130"sv));
  108. EXPECT_EQ(result, "\u0069\u0307");
  109. // LATIN SMALL LIGATURE FF
  110. result = MUST(Unicode::to_unicode_lowercase_full("\uFB00"sv));
  111. EXPECT_EQ(result, "\uFB00");
  112. // LATIN SMALL LIGATURE FI
  113. result = MUST(Unicode::to_unicode_lowercase_full("\uFB01"sv));
  114. EXPECT_EQ(result, "\uFB01");
  115. // LATIN SMALL LIGATURE FL
  116. result = MUST(Unicode::to_unicode_lowercase_full("\uFB02"sv));
  117. EXPECT_EQ(result, "\uFB02");
  118. // LATIN SMALL LIGATURE FFI
  119. result = MUST(Unicode::to_unicode_lowercase_full("\uFB03"sv));
  120. EXPECT_EQ(result, "\uFB03");
  121. // LATIN SMALL LIGATURE FFL
  122. result = MUST(Unicode::to_unicode_lowercase_full("\uFB04"sv));
  123. EXPECT_EQ(result, "\uFB04");
  124. // LATIN SMALL LIGATURE LONG S T
  125. result = MUST(Unicode::to_unicode_lowercase_full("\uFB05"sv));
  126. EXPECT_EQ(result, "\uFB05");
  127. // LATIN SMALL LIGATURE ST
  128. result = MUST(Unicode::to_unicode_lowercase_full("\uFB06"sv));
  129. EXPECT_EQ(result, "\uFB06");
  130. // GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI
  131. result = MUST(Unicode::to_unicode_lowercase_full("\u1FB7"sv));
  132. EXPECT_EQ(result, "\u1FB7");
  133. // GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI
  134. result = MUST(Unicode::to_unicode_lowercase_full("\u1FC7"sv));
  135. EXPECT_EQ(result, "\u1FC7");
  136. // GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI
  137. result = MUST(Unicode::to_unicode_lowercase_full("\u1FF7"sv));
  138. EXPECT_EQ(result, "\u1FF7");
  139. }
  140. TEST_CASE(to_unicode_lowercase_special_casing_sigma)
  141. {
  142. auto result = MUST(Unicode::to_unicode_lowercase_full("ABCI"sv));
  143. EXPECT_EQ(result, "abci");
  144. // Sigma preceded by A
  145. result = MUST(Unicode::to_unicode_lowercase_full("A\u03A3"sv));
  146. EXPECT_EQ(result, "a\u03C2");
  147. // Sigma preceded by FEMININE ORDINAL INDICATOR
  148. result = MUST(Unicode::to_unicode_lowercase_full("\u00AA\u03A3"sv));
  149. EXPECT_EQ(result, "\u00AA\u03C2");
  150. // Sigma preceded by ROMAN NUMERAL ONE
  151. result = MUST(Unicode::to_unicode_lowercase_full("\u2160\u03A3"sv));
  152. EXPECT_EQ(result, "\u2170\u03C2");
  153. // Sigma preceded by COMBINING GREEK YPOGEGRAMMENI
  154. result = MUST(Unicode::to_unicode_lowercase_full("\u0345\u03A3"sv));
  155. EXPECT_EQ(result, "\u0345\u03C3");
  156. // Sigma preceded by A and FULL STOP
  157. result = MUST(Unicode::to_unicode_lowercase_full("A.\u03A3"sv));
  158. EXPECT_EQ(result, "a.\u03C2");
  159. // Sigma preceded by A and MONGOLIAN VOWEL SEPARATOR
  160. result = MUST(Unicode::to_unicode_lowercase_full("A\u180E\u03A3"sv));
  161. EXPECT_EQ(result, "a\u180E\u03C2");
  162. // Sigma preceded by A and MONGOLIAN VOWEL SEPARATOR, followed by B
  163. result = MUST(Unicode::to_unicode_lowercase_full("A\u180E\u03A3B"sv));
  164. EXPECT_EQ(result, "a\u180E\u03C3b");
  165. // Sigma followed by A
  166. result = MUST(Unicode::to_unicode_lowercase_full("\u03A3A"sv));
  167. EXPECT_EQ(result, "\u03C3a");
  168. // Sigma preceded by A, followed by MONGOLIAN VOWEL SEPARATOR
  169. result = MUST(Unicode::to_unicode_lowercase_full("A\u03A3\u180E"sv));
  170. EXPECT_EQ(result, "a\u03C2\u180E");
  171. // Sigma preceded by A, followed by MONGOLIAN VOWEL SEPARATOR and B
  172. result = MUST(Unicode::to_unicode_lowercase_full("A\u03A3\u180EB"sv));
  173. EXPECT_EQ(result, "a\u03C3\u180Eb");
  174. // Sigma preceded by A and MONGOLIAN VOWEL SEPARATOR, followed by MONGOLIAN VOWEL SEPARATOR
  175. result = MUST(Unicode::to_unicode_lowercase_full("A\u180E\u03A3\u180E"sv));
  176. EXPECT_EQ(result, "a\u180E\u03C2\u180E");
  177. // Sigma preceded by A and MONGOLIAN VOWEL SEPARATOR, followed by MONGOLIAN VOWEL SEPARATOR and B
  178. result = MUST(Unicode::to_unicode_lowercase_full("A\u180E\u03A3\u180EB"sv));
  179. EXPECT_EQ(result, "a\u180E\u03C3\u180Eb");
  180. }
  181. TEST_CASE(to_unicode_lowercase_special_casing_i)
  182. {
  183. // LATIN CAPITAL LETTER I
  184. auto result = MUST(Unicode::to_unicode_lowercase_full("I"sv, "en"sv));
  185. EXPECT_EQ(result, "i"sv);
  186. result = MUST(Unicode::to_unicode_lowercase_full("I"sv, "az"sv));
  187. EXPECT_EQ(result, "\u0131"sv);
  188. result = MUST(Unicode::to_unicode_lowercase_full("I"sv, "tr"sv));
  189. EXPECT_EQ(result, "\u0131"sv);
  190. // LATIN CAPITAL LETTER I WITH DOT ABOVE
  191. result = MUST(Unicode::to_unicode_lowercase_full("\u0130"sv, "en"sv));
  192. EXPECT_EQ(result, "\u0069\u0307"sv);
  193. result = MUST(Unicode::to_unicode_lowercase_full("\u0130"sv, "az"sv));
  194. EXPECT_EQ(result, "i"sv);
  195. result = MUST(Unicode::to_unicode_lowercase_full("\u0130"sv, "tr"sv));
  196. EXPECT_EQ(result, "i"sv);
  197. // LATIN CAPITAL LETTER I followed by COMBINING DOT ABOVE
  198. result = MUST(Unicode::to_unicode_lowercase_full("I\u0307"sv, "en"sv));
  199. EXPECT_EQ(result, "i\u0307"sv);
  200. result = MUST(Unicode::to_unicode_lowercase_full("I\u0307"sv, "az"sv));
  201. EXPECT_EQ(result, "i"sv);
  202. result = MUST(Unicode::to_unicode_lowercase_full("I\u0307"sv, "tr"sv));
  203. EXPECT_EQ(result, "i"sv);
  204. // LATIN CAPITAL LETTER I followed by combining class 0 and COMBINING DOT ABOVE
  205. result = MUST(Unicode::to_unicode_lowercase_full("IA\u0307"sv, "en"sv));
  206. EXPECT_EQ(result, "ia\u0307"sv);
  207. result = MUST(Unicode::to_unicode_lowercase_full("IA\u0307"sv, "az"sv));
  208. EXPECT_EQ(result, "\u0131a\u0307"sv);
  209. result = MUST(Unicode::to_unicode_lowercase_full("IA\u0307"sv, "tr"sv));
  210. EXPECT_EQ(result, "\u0131a\u0307"sv);
  211. }
  212. TEST_CASE(to_unicode_lowercase_special_casing_more_above)
  213. {
  214. // LATIN CAPITAL LETTER I
  215. auto result = MUST(Unicode::to_unicode_lowercase_full("I"sv, "en"sv));
  216. EXPECT_EQ(result, "i"sv);
  217. result = MUST(Unicode::to_unicode_lowercase_full("I"sv, "lt"sv));
  218. EXPECT_EQ(result, "i"sv);
  219. // LATIN CAPITAL LETTER J
  220. result = MUST(Unicode::to_unicode_lowercase_full("J"sv, "en"sv));
  221. EXPECT_EQ(result, "j"sv);
  222. result = MUST(Unicode::to_unicode_lowercase_full("J"sv, "lt"sv));
  223. EXPECT_EQ(result, "j"sv);
  224. // LATIN CAPITAL LETTER I WITH OGONEK
  225. result = MUST(Unicode::to_unicode_lowercase_full("\u012e"sv, "en"sv));
  226. EXPECT_EQ(result, "\u012f"sv);
  227. result = MUST(Unicode::to_unicode_lowercase_full("\u012e"sv, "lt"sv));
  228. EXPECT_EQ(result, "\u012f"sv);
  229. // LATIN CAPITAL LETTER I followed by COMBINING GRAVE ACCENT
  230. result = MUST(Unicode::to_unicode_lowercase_full("I\u0300"sv, "en"sv));
  231. EXPECT_EQ(result, "i\u0300"sv);
  232. result = MUST(Unicode::to_unicode_lowercase_full("I\u0300"sv, "lt"sv));
  233. EXPECT_EQ(result, "i\u0307\u0300"sv);
  234. // LATIN CAPITAL LETTER J followed by COMBINING GRAVE ACCENT
  235. result = MUST(Unicode::to_unicode_lowercase_full("J\u0300"sv, "en"sv));
  236. EXPECT_EQ(result, "j\u0300"sv);
  237. result = MUST(Unicode::to_unicode_lowercase_full("J\u0300"sv, "lt"sv));
  238. EXPECT_EQ(result, "j\u0307\u0300"sv);
  239. // LATIN CAPITAL LETTER I WITH OGONEK followed by COMBINING GRAVE ACCENT
  240. result = MUST(Unicode::to_unicode_lowercase_full("\u012e\u0300"sv, "en"sv));
  241. EXPECT_EQ(result, "\u012f\u0300"sv);
  242. result = MUST(Unicode::to_unicode_lowercase_full("\u012e\u0300"sv, "lt"sv));
  243. EXPECT_EQ(result, "\u012f\u0307\u0300"sv);
  244. }
  245. TEST_CASE(to_unicode_lowercase_special_casing_not_before_dot)
  246. {
  247. // LATIN CAPITAL LETTER I
  248. auto result = MUST(Unicode::to_unicode_lowercase_full("I"sv, "en"sv));
  249. EXPECT_EQ(result, "i"sv);
  250. result = MUST(Unicode::to_unicode_lowercase_full("I"sv, "az"sv));
  251. EXPECT_EQ(result, "\u0131"sv);
  252. result = MUST(Unicode::to_unicode_lowercase_full("I"sv, "tr"sv));
  253. EXPECT_EQ(result, "\u0131"sv);
  254. // LATIN CAPITAL LETTER I followed by COMBINING DOT ABOVE
  255. result = MUST(Unicode::to_unicode_lowercase_full("I\u0307"sv, "en"sv));
  256. EXPECT_EQ(result, "i\u0307"sv);
  257. result = MUST(Unicode::to_unicode_lowercase_full("I\u0307"sv, "az"sv));
  258. EXPECT_EQ(result, "i"sv);
  259. result = MUST(Unicode::to_unicode_lowercase_full("I\u0307"sv, "tr"sv));
  260. EXPECT_EQ(result, "i"sv);
  261. }
  262. TEST_CASE(to_unicode_uppercase_unconditional_special_casing)
  263. {
  264. // LATIN SMALL LETTER SHARP S
  265. auto result = MUST(Unicode::to_unicode_uppercase_full("\u00DF"sv));
  266. EXPECT_EQ(result, "\u0053\u0053");
  267. // LATIN CAPITAL LETTER I WITH DOT ABOVE
  268. result = MUST(Unicode::to_unicode_uppercase_full("\u0130"sv));
  269. EXPECT_EQ(result, "\u0130");
  270. // LATIN SMALL LIGATURE FF
  271. result = MUST(Unicode::to_unicode_uppercase_full("\uFB00"sv));
  272. EXPECT_EQ(result, "\u0046\u0046");
  273. // LATIN SMALL LIGATURE FI
  274. result = MUST(Unicode::to_unicode_uppercase_full("\uFB01"sv));
  275. EXPECT_EQ(result, "\u0046\u0049");
  276. // LATIN SMALL LIGATURE FL
  277. result = MUST(Unicode::to_unicode_uppercase_full("\uFB02"sv));
  278. EXPECT_EQ(result, "\u0046\u004C");
  279. // LATIN SMALL LIGATURE FFI
  280. result = MUST(Unicode::to_unicode_uppercase_full("\uFB03"sv));
  281. EXPECT_EQ(result, "\u0046\u0046\u0049");
  282. // LATIN SMALL LIGATURE FFL
  283. result = MUST(Unicode::to_unicode_uppercase_full("\uFB04"sv));
  284. EXPECT_EQ(result, "\u0046\u0046\u004C");
  285. // LATIN SMALL LIGATURE LONG S T
  286. result = MUST(Unicode::to_unicode_uppercase_full("\uFB05"sv));
  287. EXPECT_EQ(result, "\u0053\u0054");
  288. // LATIN SMALL LIGATURE ST
  289. result = MUST(Unicode::to_unicode_uppercase_full("\uFB06"sv));
  290. EXPECT_EQ(result, "\u0053\u0054");
  291. // GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
  292. result = MUST(Unicode::to_unicode_uppercase_full("\u0390"sv));
  293. EXPECT_EQ(result, "\u0399\u0308\u0301");
  294. // GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
  295. result = MUST(Unicode::to_unicode_uppercase_full("\u03B0"sv));
  296. EXPECT_EQ(result, "\u03A5\u0308\u0301");
  297. // GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI
  298. result = MUST(Unicode::to_unicode_uppercase_full("\u1FB7"sv));
  299. EXPECT_EQ(result, "\u0391\u0342\u0399");
  300. // GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI
  301. result = MUST(Unicode::to_unicode_uppercase_full("\u1FC7"sv));
  302. EXPECT_EQ(result, "\u0397\u0342\u0399");
  303. // GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI
  304. result = MUST(Unicode::to_unicode_uppercase_full("\u1FF7"sv));
  305. EXPECT_EQ(result, "\u03A9\u0342\u0399");
  306. }
  307. TEST_CASE(to_unicode_uppercase_special_casing_soft_dotted)
  308. {
  309. // LATIN SMALL LETTER I
  310. auto result = MUST(Unicode::to_unicode_uppercase_full("i"sv, "en"sv));
  311. EXPECT_EQ(result, "I"sv);
  312. result = MUST(Unicode::to_unicode_uppercase_full("i"sv, "lt"sv));
  313. EXPECT_EQ(result, "I"sv);
  314. // LATIN SMALL LETTER J
  315. result = MUST(Unicode::to_unicode_uppercase_full("j"sv, "en"sv));
  316. EXPECT_EQ(result, "J"sv);
  317. result = MUST(Unicode::to_unicode_uppercase_full("j"sv, "lt"sv));
  318. EXPECT_EQ(result, "J"sv);
  319. // LATIN SMALL LETTER I followed by COMBINING DOT ABOVE
  320. result = MUST(Unicode::to_unicode_uppercase_full("i\u0307"sv, "en"sv));
  321. EXPECT_EQ(result, "I\u0307"sv);
  322. result = MUST(Unicode::to_unicode_uppercase_full("i\u0307"sv, "lt"sv));
  323. EXPECT_EQ(result, "I"sv);
  324. // LATIN SMALL LETTER J followed by COMBINING DOT ABOVE
  325. result = MUST(Unicode::to_unicode_uppercase_full("j\u0307"sv, "en"sv));
  326. EXPECT_EQ(result, "J\u0307"sv);
  327. result = MUST(Unicode::to_unicode_uppercase_full("j\u0307"sv, "lt"sv));
  328. EXPECT_EQ(result, "J"sv);
  329. }
  330. TEST_CASE(to_unicode_titlecase_unconditional_special_casing)
  331. {
  332. // LATIN SMALL LETTER SHARP S
  333. auto result = MUST(Unicode::to_unicode_titlecase_full("\u00DF"sv));
  334. EXPECT_EQ(result, "\u0053\u0073"sv);
  335. // LATIN CAPITAL LETTER I WITH DOT ABOVE
  336. result = MUST(Unicode::to_unicode_titlecase_full("\u0130"sv));
  337. EXPECT_EQ(result, "\u0130"sv);
  338. // LATIN SMALL LIGATURE FF
  339. result = MUST(Unicode::to_unicode_titlecase_full("\uFB00"sv));
  340. EXPECT_EQ(result, "\u0046\u0066"sv);
  341. // LATIN SMALL LIGATURE FI
  342. result = MUST(Unicode::to_unicode_titlecase_full("\uFB01"sv));
  343. EXPECT_EQ(result, "\u0046\u0069"sv);
  344. // LATIN SMALL LIGATURE FL
  345. result = MUST(Unicode::to_unicode_titlecase_full("\uFB02"sv));
  346. EXPECT_EQ(result, "\u0046\u006C"sv);
  347. // LATIN SMALL LIGATURE FFI
  348. result = MUST(Unicode::to_unicode_titlecase_full("\uFB03"sv));
  349. EXPECT_EQ(result, "\u0046\u0066\u0069"sv);
  350. // LATIN SMALL LIGATURE FFL
  351. result = MUST(Unicode::to_unicode_titlecase_full("\uFB04"sv));
  352. EXPECT_EQ(result, "\u0046\u0066\u006C"sv);
  353. // LATIN SMALL LIGATURE LONG S T
  354. result = MUST(Unicode::to_unicode_titlecase_full("\uFB05"sv));
  355. EXPECT_EQ(result, "\u0053\u0074"sv);
  356. // LATIN SMALL LIGATURE ST
  357. result = MUST(Unicode::to_unicode_titlecase_full("\uFB06"sv));
  358. EXPECT_EQ(result, "\u0053\u0074"sv);
  359. // GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
  360. result = MUST(Unicode::to_unicode_titlecase_full("\u0390"sv));
  361. EXPECT_EQ(result, "\u0399\u0308\u0301"sv);
  362. // GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
  363. result = MUST(Unicode::to_unicode_titlecase_full("\u03B0"sv));
  364. EXPECT_EQ(result, "\u03A5\u0308\u0301"sv);
  365. // GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI
  366. result = MUST(Unicode::to_unicode_titlecase_full("\u1FB7"sv));
  367. EXPECT_EQ(result, "\u0391\u0342\u0345"sv);
  368. // GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI
  369. result = MUST(Unicode::to_unicode_titlecase_full("\u1FC7"sv));
  370. EXPECT_EQ(result, "\u0397\u0342\u0345"sv);
  371. // GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI
  372. result = MUST(Unicode::to_unicode_titlecase_full("\u1FF7"sv));
  373. EXPECT_EQ(result, "\u03A9\u0342\u0345"sv);
  374. }
  375. TEST_CASE(to_unicode_titlecase_special_casing_i)
  376. {
  377. // LATIN SMALL LETTER I
  378. auto result = MUST(Unicode::to_unicode_titlecase_full("i"sv, "en"sv));
  379. EXPECT_EQ(result, "I"sv);
  380. result = MUST(Unicode::to_unicode_titlecase_full("i"sv, "az"sv));
  381. EXPECT_EQ(result, "\u0130"sv);
  382. result = MUST(Unicode::to_unicode_titlecase_full("i"sv, "tr"sv));
  383. EXPECT_EQ(result, "\u0130"sv);
  384. }
  385. TEST_CASE(general_category)
  386. {
  387. auto general_category = [](StringView name) {
  388. auto general_category = Unicode::general_category_from_string(name);
  389. VERIFY(general_category.has_value());
  390. return *general_category;
  391. };
  392. auto general_category_c = general_category("C"sv);
  393. auto general_category_other = general_category("Other"sv);
  394. EXPECT_EQ(general_category_c, general_category_other);
  395. auto general_category_cc = general_category("Cc"sv);
  396. auto general_category_control = general_category("Control"sv);
  397. EXPECT_EQ(general_category_cc, general_category_control);
  398. auto general_category_co = general_category("Co"sv);
  399. auto general_category_private_use = general_category("Private_Use"sv);
  400. EXPECT_EQ(general_category_co, general_category_private_use);
  401. auto general_category_cn = general_category("Cn"sv);
  402. auto general_category_unassigned = general_category("Unassigned"sv);
  403. EXPECT_EQ(general_category_cn, general_category_unassigned);
  404. auto general_category_lc = general_category("LC"sv);
  405. auto general_category_cased_letter = general_category("Cased_Letter"sv);
  406. EXPECT_EQ(general_category_lc, general_category_cased_letter);
  407. auto general_category_ll = general_category("Ll"sv);
  408. auto general_category_lowercase_letter = general_category("Lowercase_Letter"sv);
  409. EXPECT_EQ(general_category_ll, general_category_lowercase_letter);
  410. auto general_category_lu = general_category("Lu"sv);
  411. auto general_category_uppercase_letter = general_category("Uppercase_Letter"sv);
  412. EXPECT_EQ(general_category_lu, general_category_uppercase_letter);
  413. for (u32 code_point = 0; code_point <= 0x1f; ++code_point) {
  414. EXPECT(Unicode::code_point_has_general_category(code_point, general_category_c));
  415. EXPECT(Unicode::code_point_has_general_category(code_point, general_category_cc));
  416. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_co));
  417. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_cn));
  418. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_lc));
  419. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_ll));
  420. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_lu));
  421. }
  422. for (u32 code_point = 0xe000; code_point <= 0xe100; ++code_point) {
  423. EXPECT(Unicode::code_point_has_general_category(code_point, general_category_c));
  424. EXPECT(Unicode::code_point_has_general_category(code_point, general_category_co));
  425. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_cc));
  426. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_cn));
  427. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_lc));
  428. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_ll));
  429. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_lu));
  430. }
  431. for (u32 code_point = 0x101fe; code_point <= 0x1027f; ++code_point) {
  432. EXPECT(Unicode::code_point_has_general_category(code_point, general_category_c));
  433. EXPECT(Unicode::code_point_has_general_category(code_point, general_category_cn));
  434. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_cc));
  435. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_co));
  436. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_lc));
  437. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_ll));
  438. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_lu));
  439. }
  440. for (u32 code_point = 0x61; code_point <= 0x7a; ++code_point) {
  441. EXPECT(Unicode::code_point_has_general_category(code_point, general_category_lc));
  442. EXPECT(Unicode::code_point_has_general_category(code_point, general_category_ll));
  443. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_c));
  444. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_cc));
  445. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_co));
  446. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_cn));
  447. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_lu));
  448. }
  449. for (u32 code_point = 0x41; code_point <= 0x5a; ++code_point) {
  450. EXPECT(Unicode::code_point_has_general_category(code_point, general_category_lc));
  451. EXPECT(Unicode::code_point_has_general_category(code_point, general_category_lu));
  452. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_c));
  453. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_cc));
  454. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_co));
  455. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_cn));
  456. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_ll));
  457. }
  458. }
  459. TEST_CASE(property)
  460. {
  461. auto property = [](StringView name) {
  462. auto property = Unicode::property_from_string(name);
  463. VERIFY(property.has_value());
  464. return *property;
  465. };
  466. auto property_any = property("Any"sv);
  467. auto property_assigned = property("Assigned"sv);
  468. auto property_ascii = property("ASCII"sv);
  469. auto property_white_space = property("White_Space"sv);
  470. auto property_wspace = property("WSpace"sv);
  471. auto property_space = property("space"sv);
  472. EXPECT_EQ(property_white_space, property_wspace);
  473. EXPECT_EQ(property_white_space, property_space);
  474. auto property_emoji_presentation = property("Emoji_Presentation"sv);
  475. auto property_epres = property("EPres"sv);
  476. EXPECT_EQ(property_emoji_presentation, property_epres);
  477. for (u32 code_point = 0; code_point <= 0x10ffff; code_point += 1000)
  478. EXPECT(Unicode::code_point_has_property(code_point, property_any));
  479. for (u32 code_point = 0x101d0; code_point <= 0x101fd; ++code_point) {
  480. EXPECT(Unicode::code_point_has_property(code_point, property_any));
  481. EXPECT(Unicode::code_point_has_property(code_point, property_assigned));
  482. EXPECT(!Unicode::code_point_has_property(code_point, property_ascii));
  483. EXPECT(!Unicode::code_point_has_property(code_point, property_white_space));
  484. EXPECT(!Unicode::code_point_has_property(code_point, property_emoji_presentation));
  485. }
  486. for (u32 code_point = 0x101fe; code_point <= 0x1027f; ++code_point) {
  487. EXPECT(Unicode::code_point_has_property(code_point, property_any));
  488. EXPECT(!Unicode::code_point_has_property(code_point, property_assigned));
  489. EXPECT(!Unicode::code_point_has_property(code_point, property_ascii));
  490. EXPECT(!Unicode::code_point_has_property(code_point, property_white_space));
  491. EXPECT(!Unicode::code_point_has_property(code_point, property_emoji_presentation));
  492. }
  493. for (u32 code_point = 0; code_point <= 0x7f; ++code_point) {
  494. EXPECT(Unicode::code_point_has_property(code_point, property_any));
  495. EXPECT(Unicode::code_point_has_property(code_point, property_assigned));
  496. EXPECT(Unicode::code_point_has_property(code_point, property_ascii));
  497. EXPECT(!Unicode::code_point_has_property(code_point, property_emoji_presentation));
  498. }
  499. for (u32 code_point = 0x9; code_point <= 0xd; ++code_point) {
  500. EXPECT(Unicode::code_point_has_property(code_point, property_any));
  501. EXPECT(Unicode::code_point_has_property(code_point, property_assigned));
  502. EXPECT(Unicode::code_point_has_property(code_point, property_ascii));
  503. EXPECT(Unicode::code_point_has_property(code_point, property_white_space));
  504. EXPECT(!Unicode::code_point_has_property(code_point, property_emoji_presentation));
  505. }
  506. for (u32 code_point = 0x1f3e5; code_point <= 0x1f3f0; ++code_point) {
  507. EXPECT(Unicode::code_point_has_property(code_point, property_any));
  508. EXPECT(Unicode::code_point_has_property(code_point, property_assigned));
  509. EXPECT(Unicode::code_point_has_property(code_point, property_emoji_presentation));
  510. EXPECT(!Unicode::code_point_has_property(code_point, property_ascii));
  511. EXPECT(!Unicode::code_point_has_property(code_point, property_white_space));
  512. }
  513. }
  514. TEST_CASE(script)
  515. {
  516. auto script = [](StringView name) {
  517. auto script = Unicode::script_from_string(name);
  518. VERIFY(script.has_value());
  519. return *script;
  520. };
  521. auto script_latin = script("Latin"sv);
  522. auto script_latn = script("Latn"sv);
  523. EXPECT_EQ(script_latin, script_latn);
  524. auto script_cyrillic = script("Cyrillic"sv);
  525. auto script_cyrl = script("Cyrl"sv);
  526. EXPECT_EQ(script_cyrillic, script_cyrl);
  527. auto script_greek = script("Greek"sv);
  528. auto script_grek = script("Grek"sv);
  529. EXPECT_EQ(script_greek, script_grek);
  530. for (u32 code_point = 0x41; code_point <= 0x5a; ++code_point) {
  531. EXPECT(Unicode::code_point_has_script(code_point, script_latin));
  532. EXPECT(Unicode::code_point_has_script_extension(code_point, script_latin));
  533. EXPECT(!Unicode::code_point_has_script(code_point, script_cyrillic));
  534. EXPECT(!Unicode::code_point_has_script(code_point, script_greek));
  535. }
  536. for (u32 code_point = 0x61; code_point <= 0x7a; ++code_point) {
  537. EXPECT(Unicode::code_point_has_script(code_point, script_latin));
  538. EXPECT(Unicode::code_point_has_script_extension(code_point, script_latin));
  539. EXPECT(!Unicode::code_point_has_script(code_point, script_cyrillic));
  540. EXPECT(!Unicode::code_point_has_script(code_point, script_greek));
  541. }
  542. for (u32 code_point = 0x400; code_point <= 0x481; ++code_point) {
  543. EXPECT(Unicode::code_point_has_script(code_point, script_cyrillic));
  544. EXPECT(Unicode::code_point_has_script_extension(code_point, script_cyrillic));
  545. EXPECT(!Unicode::code_point_has_script(code_point, script_latin));
  546. EXPECT(!Unicode::code_point_has_script(code_point, script_greek));
  547. }
  548. for (u32 code_point = 0x400; code_point <= 0x481; ++code_point) {
  549. EXPECT(Unicode::code_point_has_script(code_point, script_cyrillic));
  550. EXPECT(Unicode::code_point_has_script_extension(code_point, script_cyrillic));
  551. EXPECT(!Unicode::code_point_has_script(code_point, script_latin));
  552. EXPECT(!Unicode::code_point_has_script(code_point, script_greek));
  553. }
  554. for (u32 code_point = 0x1f80; code_point <= 0x1fb4; ++code_point) {
  555. EXPECT(Unicode::code_point_has_script(code_point, script_greek));
  556. EXPECT(Unicode::code_point_has_script_extension(code_point, script_greek));
  557. EXPECT(!Unicode::code_point_has_script(code_point, script_latin));
  558. EXPECT(!Unicode::code_point_has_script(code_point, script_cyrillic));
  559. }
  560. }
  561. TEST_CASE(block)
  562. {
  563. for (u32 code_point = 0x0000; code_point <= 0x007F; ++code_point)
  564. EXPECT_EQ("Basic Latin"sv, Unicode::code_point_block_display_name(code_point).value());
  565. for (u32 code_point = 0x0370; code_point <= 0x03FF; ++code_point)
  566. EXPECT_EQ("Greek and Coptic"sv, Unicode::code_point_block_display_name(code_point).value());
  567. }
  568. TEST_CASE(script_extension)
  569. {
  570. auto script = [](StringView name) {
  571. auto script = Unicode::script_from_string(name);
  572. VERIFY(script.has_value());
  573. return *script;
  574. };
  575. auto script_latin = script("Latin"sv);
  576. auto script_greek = script("Greek"sv);
  577. for (u32 code_point = 0x363; code_point <= 0x36f; ++code_point) {
  578. EXPECT(!Unicode::code_point_has_script(code_point, script_latin));
  579. EXPECT(Unicode::code_point_has_script_extension(code_point, script_latin));
  580. }
  581. EXPECT(!Unicode::code_point_has_script(0x342, script_greek));
  582. EXPECT(Unicode::code_point_has_script_extension(0x342, script_greek));
  583. EXPECT(!Unicode::code_point_has_script(0x345, script_greek));
  584. EXPECT(Unicode::code_point_has_script_extension(0x345, script_greek));
  585. EXPECT(!Unicode::code_point_has_script(0x1dc0, script_greek));
  586. EXPECT(Unicode::code_point_has_script_extension(0x1dc0, script_greek));
  587. EXPECT(!Unicode::code_point_has_script(0x1dc1, script_greek));
  588. EXPECT(Unicode::code_point_has_script_extension(0x1dc1, script_greek));
  589. auto script_common = script("Common"sv);
  590. auto script_zyyy = script("Zyyy"sv);
  591. EXPECT_EQ(script_common, script_zyyy);
  592. EXPECT(Unicode::code_point_has_script(0x202f, script_common));
  593. EXPECT(!Unicode::code_point_has_script_extension(0x202f, script_common));
  594. EXPECT(Unicode::code_point_has_script(0x3000, script_common));
  595. EXPECT(Unicode::code_point_has_script_extension(0x3000, script_common));
  596. auto script_inherited = script("Inherited"sv);
  597. auto script_qaai = script("Qaai"sv);
  598. auto script_zinh = script("Zinh"sv);
  599. EXPECT_EQ(script_inherited, script_qaai);
  600. EXPECT_EQ(script_inherited, script_zinh);
  601. EXPECT(Unicode::code_point_has_script(0x1ced, script_inherited));
  602. EXPECT(!Unicode::code_point_has_script_extension(0x1ced, script_inherited));
  603. EXPECT(Unicode::code_point_has_script(0x101fd, script_inherited));
  604. EXPECT(Unicode::code_point_has_script_extension(0x101fd, script_inherited));
  605. }
  606. TEST_CASE(code_point_display_name)
  607. {
  608. auto code_point_display_name = [](u32 code_point) {
  609. auto name = Unicode::code_point_display_name(code_point);
  610. VERIFY(name.has_value());
  611. return name.release_value();
  612. };
  613. // Control code points.
  614. EXPECT_EQ(code_point_display_name(0), "NULL"sv);
  615. EXPECT_EQ(code_point_display_name(1), "START OF HEADING"sv);
  616. EXPECT_EQ(code_point_display_name(0xa), "LINE FEED"sv);
  617. // Ideographic code points (which already appeared in a range in UnicodeData.txt).
  618. EXPECT_EQ(code_point_display_name(0x3400), "CJK UNIFIED IDEOGRAPH-3400"sv);
  619. EXPECT_EQ(code_point_display_name(0x3401), "CJK UNIFIED IDEOGRAPH-3401"sv);
  620. EXPECT_EQ(code_point_display_name(0x3402), "CJK UNIFIED IDEOGRAPH-3402"sv);
  621. EXPECT_EQ(code_point_display_name(0x4dbf), "CJK UNIFIED IDEOGRAPH-4DBF"sv);
  622. EXPECT_EQ(code_point_display_name(0x20000), "CJK UNIFIED IDEOGRAPH-20000"sv);
  623. EXPECT_EQ(code_point_display_name(0x20001), "CJK UNIFIED IDEOGRAPH-20001"sv);
  624. EXPECT_EQ(code_point_display_name(0x20002), "CJK UNIFIED IDEOGRAPH-20002"sv);
  625. EXPECT_EQ(code_point_display_name(0x2a6df), "CJK UNIFIED IDEOGRAPH-2A6DF"sv);
  626. EXPECT(!Unicode::code_point_display_name(0x2a6e0).has_value());
  627. // Ideographic code points (which appeared individually in UnicodeData.txt and were coalesced into a range).
  628. EXPECT_EQ(code_point_display_name(0x2f800), "CJK COMPATIBILITY IDEOGRAPH-2F800"sv);
  629. EXPECT_EQ(code_point_display_name(0x2f801), "CJK COMPATIBILITY IDEOGRAPH-2F801"sv);
  630. EXPECT_EQ(code_point_display_name(0x2f802), "CJK COMPATIBILITY IDEOGRAPH-2F802"sv);
  631. EXPECT_EQ(code_point_display_name(0x2fa1d), "CJK COMPATIBILITY IDEOGRAPH-2FA1D"sv);
  632. }