/*
 * Copyright (c) 2021, Tim Flynn <trflynn89@serenityos.org>
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */
  6. #include <LibTest/TestCase.h>
  7. #include <AK/StringView.h>
  8. #include <LibUnicode/CharacterTypes.h>
  9. #include <ctype.h>
  10. static void compare_to_ascii(auto& old_function, auto& new_function)
  11. {
  12. i64 result1 = 0;
  13. i64 result2 = 0;
  14. for (u32 i = 0; i < 0x80; ++i) {
  15. EXPECT_EQ(result1 = old_function(i), result2 = new_function(i));
  16. if (result1 != result2)
  17. dbgln("Function input value was {}.", i);
  18. }
  19. }
  20. TEST_CASE(to_unicode_lowercase)
  21. {
  22. compare_to_ascii(tolower, Unicode::to_unicode_lowercase);
  23. EXPECT_EQ(Unicode::to_unicode_lowercase(0x03c9u), 0x03c9u); // "ω" to "ω"
  24. EXPECT_EQ(Unicode::to_unicode_lowercase(0x03a9u), 0x03c9u); // "Ω" to "ω"
  25. // Code points encoded by ranges in UnicodeData.txt
  26. EXPECT_EQ(Unicode::to_unicode_lowercase(0x3400u), 0x3400u);
  27. EXPECT_EQ(Unicode::to_unicode_lowercase(0x3401u), 0x3401u);
  28. EXPECT_EQ(Unicode::to_unicode_lowercase(0x3402u), 0x3402u);
  29. EXPECT_EQ(Unicode::to_unicode_lowercase(0x4dbfu), 0x4dbfu);
  30. }
  31. TEST_CASE(to_unicode_uppercase)
  32. {
  33. compare_to_ascii(toupper, Unicode::to_unicode_uppercase);
  34. EXPECT_EQ(Unicode::to_unicode_uppercase(0x03c9u), 0x03a9u); // "ω" to "Ω"
  35. EXPECT_EQ(Unicode::to_unicode_uppercase(0x03a9u), 0x03a9u); // "Ω" to "Ω"
  36. // Code points encoded by ranges in UnicodeData.txt
  37. EXPECT_EQ(Unicode::to_unicode_uppercase(0x3400u), 0x3400u);
  38. EXPECT_EQ(Unicode::to_unicode_uppercase(0x3401u), 0x3401u);
  39. EXPECT_EQ(Unicode::to_unicode_uppercase(0x3402u), 0x3402u);
  40. EXPECT_EQ(Unicode::to_unicode_uppercase(0x4dbfu), 0x4dbfu);
  41. }
  42. TEST_CASE(to_unicode_lowercase_unconditional_special_casing)
  43. {
  44. // LATIN SMALL LETTER SHARP S
  45. auto result = Unicode::to_unicode_lowercase_full("\u00DF"sv);
  46. EXPECT_EQ(result, "\u00DF");
  47. // LATIN CAPITAL LETTER I WITH DOT ABOVE
  48. result = Unicode::to_unicode_lowercase_full("\u0130"sv);
  49. EXPECT_EQ(result, "\u0069\u0307");
  50. // LATIN SMALL LIGATURE FF
  51. result = Unicode::to_unicode_lowercase_full("\uFB00"sv);
  52. EXPECT_EQ(result, "\uFB00");
  53. // LATIN SMALL LIGATURE FI
  54. result = Unicode::to_unicode_lowercase_full("\uFB01"sv);
  55. EXPECT_EQ(result, "\uFB01");
  56. // LATIN SMALL LIGATURE FL
  57. result = Unicode::to_unicode_lowercase_full("\uFB02"sv);
  58. EXPECT_EQ(result, "\uFB02");
  59. // LATIN SMALL LIGATURE FFI
  60. result = Unicode::to_unicode_lowercase_full("\uFB03"sv);
  61. EXPECT_EQ(result, "\uFB03");
  62. // LATIN SMALL LIGATURE FFL
  63. result = Unicode::to_unicode_lowercase_full("\uFB04"sv);
  64. EXPECT_EQ(result, "\uFB04");
  65. // LATIN SMALL LIGATURE LONG S T
  66. result = Unicode::to_unicode_lowercase_full("\uFB05"sv);
  67. EXPECT_EQ(result, "\uFB05");
  68. // LATIN SMALL LIGATURE ST
  69. result = Unicode::to_unicode_lowercase_full("\uFB06"sv);
  70. EXPECT_EQ(result, "\uFB06");
  71. // GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI
  72. result = Unicode::to_unicode_lowercase_full("\u1FB7"sv);
  73. EXPECT_EQ(result, "\u1FB7");
  74. // GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI
  75. result = Unicode::to_unicode_lowercase_full("\u1FC7"sv);
  76. EXPECT_EQ(result, "\u1FC7");
  77. // GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI
  78. result = Unicode::to_unicode_lowercase_full("\u1FF7"sv);
  79. EXPECT_EQ(result, "\u1FF7");
  80. }
  81. TEST_CASE(to_unicode_lowercase_special_casing_sigma)
  82. {
  83. auto result = Unicode::to_unicode_lowercase_full("ABCI"sv);
  84. EXPECT_EQ(result, "abci");
  85. // Sigma preceded by A
  86. result = Unicode::to_unicode_lowercase_full("A\u03A3"sv);
  87. EXPECT_EQ(result, "a\u03C2");
  88. // Sigma preceded by FEMININE ORDINAL INDICATOR
  89. result = Unicode::to_unicode_lowercase_full("\u00AA\u03A3"sv);
  90. EXPECT_EQ(result, "\u00AA\u03C2");
  91. // Sigma preceded by ROMAN NUMERAL ONE
  92. result = Unicode::to_unicode_lowercase_full("\u2160\u03A3"sv);
  93. EXPECT_EQ(result, "\u2170\u03C2");
  94. // Sigma preceded by COMBINING GREEK YPOGEGRAMMENI
  95. result = Unicode::to_unicode_lowercase_full("\u0345\u03A3"sv);
  96. EXPECT_EQ(result, "\u0345\u03C3");
  97. // Sigma preceded by A and FULL STOP
  98. result = Unicode::to_unicode_lowercase_full("A.\u03A3"sv);
  99. EXPECT_EQ(result, "a.\u03C2");
  100. // Sigma preceded by A and MONGOLIAN VOWEL SEPARATOR
  101. result = Unicode::to_unicode_lowercase_full("A\u180E\u03A3"sv);
  102. EXPECT_EQ(result, "a\u180E\u03C2");
  103. // Sigma preceded by A and MONGOLIAN VOWEL SEPARATOR, followed by B
  104. result = Unicode::to_unicode_lowercase_full("A\u180E\u03A3B"sv);
  105. EXPECT_EQ(result, "a\u180E\u03C3b");
  106. // Sigma followed by A
  107. result = Unicode::to_unicode_lowercase_full("\u03A3A"sv);
  108. EXPECT_EQ(result, "\u03C3a");
  109. // Sigma preceded by A, followed by MONGOLIAN VOWEL SEPARATOR
  110. result = Unicode::to_unicode_lowercase_full("A\u03A3\u180E"sv);
  111. EXPECT_EQ(result, "a\u03C2\u180E");
  112. // Sigma preceded by A, followed by MONGOLIAN VOWEL SEPARATOR and B
  113. result = Unicode::to_unicode_lowercase_full("A\u03A3\u180EB"sv);
  114. EXPECT_EQ(result, "a\u03C3\u180Eb");
  115. // Sigma preceded by A and MONGOLIAN VOWEL SEPARATOR, followed by MONGOLIAN VOWEL SEPARATOR
  116. result = Unicode::to_unicode_lowercase_full("A\u180E\u03A3\u180E"sv);
  117. EXPECT_EQ(result, "a\u180E\u03C2\u180E");
  118. // Sigma preceded by A and MONGOLIAN VOWEL SEPARATOR, followed by MONGOLIAN VOWEL SEPARATOR and B
  119. result = Unicode::to_unicode_lowercase_full("A\u180E\u03A3\u180EB"sv);
  120. EXPECT_EQ(result, "a\u180E\u03C3\u180Eb");
  121. }
  122. TEST_CASE(to_unicode_lowercase_special_casing_i)
  123. {
  124. // LATIN CAPITAL LETTER I
  125. auto result = Unicode::to_unicode_lowercase_full("I"sv, "en"sv);
  126. EXPECT_EQ(result, "i"sv);
  127. result = Unicode::to_unicode_lowercase_full("I"sv, "az"sv);
  128. EXPECT_EQ(result, "\u0131"sv);
  129. result = Unicode::to_unicode_lowercase_full("I"sv, "tr"sv);
  130. EXPECT_EQ(result, "\u0131"sv);
  131. // LATIN CAPITAL LETTER I WITH DOT ABOVE
  132. result = Unicode::to_unicode_lowercase_full("\u0130"sv, "en"sv);
  133. EXPECT_EQ(result, "\u0069\u0307"sv);
  134. result = Unicode::to_unicode_lowercase_full("\u0130"sv, "az"sv);
  135. EXPECT_EQ(result, "i"sv);
  136. result = Unicode::to_unicode_lowercase_full("\u0130"sv, "tr"sv);
  137. EXPECT_EQ(result, "i"sv);
  138. // LATIN CAPITAL LETTER I followed by COMBINING DOT ABOVE
  139. result = Unicode::to_unicode_lowercase_full("I\u0307"sv, "en"sv);
  140. EXPECT_EQ(result, "i\u0307"sv);
  141. result = Unicode::to_unicode_lowercase_full("I\u0307"sv, "az"sv);
  142. EXPECT_EQ(result, "i"sv);
  143. result = Unicode::to_unicode_lowercase_full("I\u0307"sv, "tr"sv);
  144. EXPECT_EQ(result, "i"sv);
  145. // LATIN CAPITAL LETTER I followed by combining class 0 and COMBINING DOT ABOVE
  146. result = Unicode::to_unicode_lowercase_full("IA\u0307"sv, "en"sv);
  147. EXPECT_EQ(result, "ia\u0307"sv);
  148. result = Unicode::to_unicode_lowercase_full("IA\u0307"sv, "az"sv);
  149. EXPECT_EQ(result, "\u0131a\u0307"sv);
  150. result = Unicode::to_unicode_lowercase_full("IA\u0307"sv, "tr"sv);
  151. EXPECT_EQ(result, "\u0131a\u0307"sv);
  152. }
  153. TEST_CASE(to_unicode_lowercase_special_casing_more_above)
  154. {
  155. // LATIN CAPITAL LETTER I
  156. auto result = Unicode::to_unicode_lowercase_full("I"sv, "en"sv);
  157. EXPECT_EQ(result, "i"sv);
  158. result = Unicode::to_unicode_lowercase_full("I"sv, "lt"sv);
  159. EXPECT_EQ(result, "i"sv);
  160. // LATIN CAPITAL LETTER J
  161. result = Unicode::to_unicode_lowercase_full("J"sv, "en"sv);
  162. EXPECT_EQ(result, "j"sv);
  163. result = Unicode::to_unicode_lowercase_full("J"sv, "lt"sv);
  164. EXPECT_EQ(result, "j"sv);
  165. // LATIN CAPITAL LETTER I WITH OGONEK
  166. result = Unicode::to_unicode_lowercase_full("\u012e"sv, "en"sv);
  167. EXPECT_EQ(result, "\u012f"sv);
  168. result = Unicode::to_unicode_lowercase_full("\u012e"sv, "lt"sv);
  169. EXPECT_EQ(result, "\u012f"sv);
  170. // LATIN CAPITAL LETTER I followed by COMBINING GRAVE ACCENT
  171. result = Unicode::to_unicode_lowercase_full("I\u0300"sv, "en"sv);
  172. EXPECT_EQ(result, "i\u0300"sv);
  173. result = Unicode::to_unicode_lowercase_full("I\u0300"sv, "lt"sv);
  174. EXPECT_EQ(result, "i\u0307\u0300"sv);
  175. // LATIN CAPITAL LETTER J followed by COMBINING GRAVE ACCENT
  176. result = Unicode::to_unicode_lowercase_full("J\u0300"sv, "en"sv);
  177. EXPECT_EQ(result, "j\u0300"sv);
  178. result = Unicode::to_unicode_lowercase_full("J\u0300"sv, "lt"sv);
  179. EXPECT_EQ(result, "j\u0307\u0300"sv);
  180. // LATIN CAPITAL LETTER I WITH OGONEK followed by COMBINING GRAVE ACCENT
  181. result = Unicode::to_unicode_lowercase_full("\u012e\u0300"sv, "en"sv);
  182. EXPECT_EQ(result, "\u012f\u0300"sv);
  183. result = Unicode::to_unicode_lowercase_full("\u012e\u0300"sv, "lt"sv);
  184. EXPECT_EQ(result, "\u012f\u0307\u0300"sv);
  185. }
  186. TEST_CASE(to_unicode_lowercase_special_casing_not_before_dot)
  187. {
  188. // LATIN CAPITAL LETTER I
  189. auto result = Unicode::to_unicode_lowercase_full("I"sv, "en"sv);
  190. EXPECT_EQ(result, "i"sv);
  191. result = Unicode::to_unicode_lowercase_full("I"sv, "az"sv);
  192. EXPECT_EQ(result, "\u0131"sv);
  193. result = Unicode::to_unicode_lowercase_full("I"sv, "tr"sv);
  194. EXPECT_EQ(result, "\u0131"sv);
  195. // LATIN CAPITAL LETTER I followed by COMBINING DOT ABOVE
  196. result = Unicode::to_unicode_lowercase_full("I\u0307"sv, "en"sv);
  197. EXPECT_EQ(result, "i\u0307"sv);
  198. result = Unicode::to_unicode_lowercase_full("I\u0307"sv, "az"sv);
  199. EXPECT_EQ(result, "i"sv);
  200. result = Unicode::to_unicode_lowercase_full("I\u0307"sv, "tr"sv);
  201. EXPECT_EQ(result, "i"sv);
  202. }
  203. TEST_CASE(to_unicode_uppercase_unconditional_special_casing)
  204. {
  205. // LATIN SMALL LETTER SHARP S
  206. auto result = Unicode::to_unicode_uppercase_full("\u00DF"sv);
  207. EXPECT_EQ(result, "\u0053\u0053");
  208. // LATIN CAPITAL LETTER I WITH DOT ABOVE
  209. result = Unicode::to_unicode_uppercase_full("\u0130"sv);
  210. EXPECT_EQ(result, "\u0130");
  211. // LATIN SMALL LIGATURE FF
  212. result = Unicode::to_unicode_uppercase_full("\uFB00"sv);
  213. EXPECT_EQ(result, "\u0046\u0046");
  214. // LATIN SMALL LIGATURE FI
  215. result = Unicode::to_unicode_uppercase_full("\uFB01"sv);
  216. EXPECT_EQ(result, "\u0046\u0049");
  217. // LATIN SMALL LIGATURE FL
  218. result = Unicode::to_unicode_uppercase_full("\uFB02"sv);
  219. EXPECT_EQ(result, "\u0046\u004C");
  220. // LATIN SMALL LIGATURE FFI
  221. result = Unicode::to_unicode_uppercase_full("\uFB03"sv);
  222. EXPECT_EQ(result, "\u0046\u0046\u0049");
  223. // LATIN SMALL LIGATURE FFL
  224. result = Unicode::to_unicode_uppercase_full("\uFB04"sv);
  225. EXPECT_EQ(result, "\u0046\u0046\u004C");
  226. // LATIN SMALL LIGATURE LONG S T
  227. result = Unicode::to_unicode_uppercase_full("\uFB05"sv);
  228. EXPECT_EQ(result, "\u0053\u0054");
  229. // LATIN SMALL LIGATURE ST
  230. result = Unicode::to_unicode_uppercase_full("\uFB06"sv);
  231. EXPECT_EQ(result, "\u0053\u0054");
  232. // GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
  233. result = Unicode::to_unicode_uppercase_full("\u0390"sv);
  234. EXPECT_EQ(result, "\u0399\u0308\u0301");
  235. // GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
  236. result = Unicode::to_unicode_uppercase_full("\u03B0"sv);
  237. EXPECT_EQ(result, "\u03A5\u0308\u0301");
  238. // GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI
  239. result = Unicode::to_unicode_uppercase_full("\u1FB7"sv);
  240. EXPECT_EQ(result, "\u0391\u0342\u0399");
  241. // GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI
  242. result = Unicode::to_unicode_uppercase_full("\u1FC7"sv);
  243. EXPECT_EQ(result, "\u0397\u0342\u0399");
  244. // GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI
  245. result = Unicode::to_unicode_uppercase_full("\u1FF7"sv);
  246. EXPECT_EQ(result, "\u03A9\u0342\u0399");
  247. }
  248. TEST_CASE(to_unicode_uppercase_special_casing_soft_dotted)
  249. {
  250. // LATIN SMALL LETTER I
  251. auto result = Unicode::to_unicode_uppercase_full("i"sv, "en"sv);
  252. EXPECT_EQ(result, "I"sv);
  253. result = Unicode::to_unicode_uppercase_full("i"sv, "lt"sv);
  254. EXPECT_EQ(result, "I"sv);
  255. // LATIN SMALL LETTER J
  256. result = Unicode::to_unicode_uppercase_full("j"sv, "en"sv);
  257. EXPECT_EQ(result, "J"sv);
  258. result = Unicode::to_unicode_uppercase_full("j"sv, "lt"sv);
  259. EXPECT_EQ(result, "J"sv);
  260. // LATIN SMALL LETTER I followed by COMBINING DOT ABOVE
  261. result = Unicode::to_unicode_uppercase_full("i\u0307"sv, "en"sv);
  262. EXPECT_EQ(result, "I\u0307"sv);
  263. result = Unicode::to_unicode_uppercase_full("i\u0307"sv, "lt"sv);
  264. EXPECT_EQ(result, "I"sv);
  265. // LATIN SMALL LETTER J followed by COMBINING DOT ABOVE
  266. result = Unicode::to_unicode_uppercase_full("j\u0307"sv, "en"sv);
  267. EXPECT_EQ(result, "J\u0307"sv);
  268. result = Unicode::to_unicode_uppercase_full("j\u0307"sv, "lt"sv);
  269. EXPECT_EQ(result, "J"sv);
  270. }
  271. TEST_CASE(general_category)
  272. {
  273. auto general_category = [](StringView name) {
  274. auto general_category = Unicode::general_category_from_string(name);
  275. VERIFY(general_category.has_value());
  276. return *general_category;
  277. };
  278. auto general_category_c = general_category("C"sv);
  279. auto general_category_other = general_category("Other"sv);
  280. EXPECT_EQ(general_category_c, general_category_other);
  281. auto general_category_cc = general_category("Cc"sv);
  282. auto general_category_control = general_category("Control"sv);
  283. EXPECT_EQ(general_category_cc, general_category_control);
  284. auto general_category_co = general_category("Co"sv);
  285. auto general_category_private_use = general_category("Private_Use"sv);
  286. EXPECT_EQ(general_category_co, general_category_private_use);
  287. auto general_category_cn = general_category("Cn"sv);
  288. auto general_category_unassigned = general_category("Unassigned"sv);
  289. EXPECT_EQ(general_category_cn, general_category_unassigned);
  290. auto general_category_lc = general_category("LC"sv);
  291. auto general_category_cased_letter = general_category("Cased_Letter"sv);
  292. EXPECT_EQ(general_category_lc, general_category_cased_letter);
  293. auto general_category_ll = general_category("Ll"sv);
  294. auto general_category_lowercase_letter = general_category("Lowercase_Letter"sv);
  295. EXPECT_EQ(general_category_ll, general_category_lowercase_letter);
  296. auto general_category_lu = general_category("Lu"sv);
  297. auto general_category_uppercase_letter = general_category("Uppercase_Letter"sv);
  298. EXPECT_EQ(general_category_lu, general_category_uppercase_letter);
  299. for (u32 code_point = 0; code_point <= 0x1f; ++code_point) {
  300. EXPECT(Unicode::code_point_has_general_category(code_point, general_category_c));
  301. EXPECT(Unicode::code_point_has_general_category(code_point, general_category_cc));
  302. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_co));
  303. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_cn));
  304. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_lc));
  305. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_ll));
  306. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_lu));
  307. }
  308. for (u32 code_point = 0xe000; code_point <= 0xe100; ++code_point) {
  309. EXPECT(Unicode::code_point_has_general_category(code_point, general_category_c));
  310. EXPECT(Unicode::code_point_has_general_category(code_point, general_category_co));
  311. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_cc));
  312. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_cn));
  313. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_lc));
  314. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_ll));
  315. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_lu));
  316. }
  317. for (u32 code_point = 0x101fe; code_point <= 0x1027f; ++code_point) {
  318. EXPECT(Unicode::code_point_has_general_category(code_point, general_category_c));
  319. EXPECT(Unicode::code_point_has_general_category(code_point, general_category_cn));
  320. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_cc));
  321. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_co));
  322. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_lc));
  323. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_ll));
  324. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_lu));
  325. }
  326. for (u32 code_point = 0x61; code_point <= 0x7a; ++code_point) {
  327. EXPECT(Unicode::code_point_has_general_category(code_point, general_category_lc));
  328. EXPECT(Unicode::code_point_has_general_category(code_point, general_category_ll));
  329. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_c));
  330. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_cc));
  331. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_co));
  332. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_cn));
  333. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_lu));
  334. }
  335. for (u32 code_point = 0x41; code_point <= 0x5a; ++code_point) {
  336. EXPECT(Unicode::code_point_has_general_category(code_point, general_category_lc));
  337. EXPECT(Unicode::code_point_has_general_category(code_point, general_category_lu));
  338. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_c));
  339. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_cc));
  340. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_co));
  341. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_cn));
  342. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_ll));
  343. }
  344. }
  345. TEST_CASE(property)
  346. {
  347. auto property = [](StringView name) {
  348. auto property = Unicode::property_from_string(name);
  349. VERIFY(property.has_value());
  350. return *property;
  351. };
  352. auto property_any = property("Any"sv);
  353. auto property_assigned = property("Assigned"sv);
  354. auto property_ascii = property("ASCII"sv);
  355. auto property_white_space = property("White_Space"sv);
  356. auto property_wspace = property("WSpace"sv);
  357. auto property_space = property("space"sv);
  358. EXPECT_EQ(property_white_space, property_wspace);
  359. EXPECT_EQ(property_white_space, property_space);
  360. auto property_emoji_presentation = property("Emoji_Presentation"sv);
  361. auto property_epres = property("EPres"sv);
  362. EXPECT_EQ(property_emoji_presentation, property_epres);
  363. for (u32 code_point = 0; code_point <= 0x10ffff; code_point += 1000)
  364. EXPECT(Unicode::code_point_has_property(code_point, property_any));
  365. for (u32 code_point = 0x101d0; code_point <= 0x101fd; ++code_point) {
  366. EXPECT(Unicode::code_point_has_property(code_point, property_any));
  367. EXPECT(Unicode::code_point_has_property(code_point, property_assigned));
  368. EXPECT(!Unicode::code_point_has_property(code_point, property_ascii));
  369. EXPECT(!Unicode::code_point_has_property(code_point, property_white_space));
  370. EXPECT(!Unicode::code_point_has_property(code_point, property_emoji_presentation));
  371. }
  372. for (u32 code_point = 0x101fe; code_point <= 0x1027f; ++code_point) {
  373. EXPECT(Unicode::code_point_has_property(code_point, property_any));
  374. EXPECT(!Unicode::code_point_has_property(code_point, property_assigned));
  375. EXPECT(!Unicode::code_point_has_property(code_point, property_ascii));
  376. EXPECT(!Unicode::code_point_has_property(code_point, property_white_space));
  377. EXPECT(!Unicode::code_point_has_property(code_point, property_emoji_presentation));
  378. }
  379. for (u32 code_point = 0; code_point <= 0x7f; ++code_point) {
  380. EXPECT(Unicode::code_point_has_property(code_point, property_any));
  381. EXPECT(Unicode::code_point_has_property(code_point, property_assigned));
  382. EXPECT(Unicode::code_point_has_property(code_point, property_ascii));
  383. EXPECT(!Unicode::code_point_has_property(code_point, property_emoji_presentation));
  384. }
  385. for (u32 code_point = 0x9; code_point <= 0xd; ++code_point) {
  386. EXPECT(Unicode::code_point_has_property(code_point, property_any));
  387. EXPECT(Unicode::code_point_has_property(code_point, property_assigned));
  388. EXPECT(Unicode::code_point_has_property(code_point, property_ascii));
  389. EXPECT(Unicode::code_point_has_property(code_point, property_white_space));
  390. EXPECT(!Unicode::code_point_has_property(code_point, property_emoji_presentation));
  391. }
  392. for (u32 code_point = 0x1f3e5; code_point <= 0x1f3f0; ++code_point) {
  393. EXPECT(Unicode::code_point_has_property(code_point, property_any));
  394. EXPECT(Unicode::code_point_has_property(code_point, property_assigned));
  395. EXPECT(Unicode::code_point_has_property(code_point, property_emoji_presentation));
  396. EXPECT(!Unicode::code_point_has_property(code_point, property_ascii));
  397. EXPECT(!Unicode::code_point_has_property(code_point, property_white_space));
  398. }
  399. }
  400. TEST_CASE(script)
  401. {
  402. auto script = [](StringView name) {
  403. auto script = Unicode::script_from_string(name);
  404. VERIFY(script.has_value());
  405. return *script;
  406. };
  407. auto script_latin = script("Latin"sv);
  408. auto script_latn = script("Latn"sv);
  409. EXPECT_EQ(script_latin, script_latn);
  410. auto script_cyrillic = script("Cyrillic"sv);
  411. auto script_cyrl = script("Cyrl"sv);
  412. EXPECT_EQ(script_cyrillic, script_cyrl);
  413. auto script_greek = script("Greek"sv);
  414. auto script_grek = script("Grek"sv);
  415. EXPECT_EQ(script_greek, script_grek);
  416. for (u32 code_point = 0x41; code_point <= 0x5a; ++code_point) {
  417. EXPECT(Unicode::code_point_has_script(code_point, script_latin));
  418. EXPECT(Unicode::code_point_has_script_extension(code_point, script_latin));
  419. EXPECT(!Unicode::code_point_has_script(code_point, script_cyrillic));
  420. EXPECT(!Unicode::code_point_has_script(code_point, script_greek));
  421. }
  422. for (u32 code_point = 0x61; code_point <= 0x7a; ++code_point) {
  423. EXPECT(Unicode::code_point_has_script(code_point, script_latin));
  424. EXPECT(Unicode::code_point_has_script_extension(code_point, script_latin));
  425. EXPECT(!Unicode::code_point_has_script(code_point, script_cyrillic));
  426. EXPECT(!Unicode::code_point_has_script(code_point, script_greek));
  427. }
  428. for (u32 code_point = 0x400; code_point <= 0x481; ++code_point) {
  429. EXPECT(Unicode::code_point_has_script(code_point, script_cyrillic));
  430. EXPECT(Unicode::code_point_has_script_extension(code_point, script_cyrillic));
  431. EXPECT(!Unicode::code_point_has_script(code_point, script_latin));
  432. EXPECT(!Unicode::code_point_has_script(code_point, script_greek));
  433. }
  434. for (u32 code_point = 0x400; code_point <= 0x481; ++code_point) {
  435. EXPECT(Unicode::code_point_has_script(code_point, script_cyrillic));
  436. EXPECT(Unicode::code_point_has_script_extension(code_point, script_cyrillic));
  437. EXPECT(!Unicode::code_point_has_script(code_point, script_latin));
  438. EXPECT(!Unicode::code_point_has_script(code_point, script_greek));
  439. }
  440. for (u32 code_point = 0x1f80; code_point <= 0x1fb4; ++code_point) {
  441. EXPECT(Unicode::code_point_has_script(code_point, script_greek));
  442. EXPECT(Unicode::code_point_has_script_extension(code_point, script_greek));
  443. EXPECT(!Unicode::code_point_has_script(code_point, script_latin));
  444. EXPECT(!Unicode::code_point_has_script(code_point, script_cyrillic));
  445. }
  446. }
  447. TEST_CASE(block)
  448. {
  449. auto block = [](StringView name) {
  450. auto block = Unicode::block_from_string(name);
  451. VERIFY(block.has_value());
  452. return *block;
  453. };
  454. auto no_block = block("No_Block"sv);
  455. auto block_nb = block("NB"sv);
  456. EXPECT_EQ(no_block, block_nb);
  457. auto block_basic_latin = block("Basic_Latin"sv);
  458. auto block_ascii = block("ASCII"sv);
  459. EXPECT_EQ(block_basic_latin, block_ascii);
  460. auto block_greek_coptic = block("Greek_And_Coptic"sv);
  461. auto block_greek = block("Greek"sv);
  462. EXPECT_EQ(block_greek_coptic, block_greek);
  463. auto block_variation = block("Variation_Selectors_Supplement"sv);
  464. auto block_vs_sup = block("VS_Sup"sv);
  465. EXPECT_EQ(block_variation, block_vs_sup);
  466. for (u32 code_point = 0x0000; code_point <= 0x007F; ++code_point)
  467. EXPECT(Unicode::code_point_has_block(code_point, block_basic_latin));
  468. for (u32 code_point = 0xE0100; code_point <= 0xE01EF; ++code_point)
  469. EXPECT(Unicode::code_point_has_block(code_point, block_variation));
  470. for (u32 code_point = 0x0000; code_point <= 0x007F; ++code_point)
  471. EXPECT_EQ("Basic Latin"sv, Unicode::code_point_block_display_name(code_point).value());
  472. for (u32 code_point = 0x0370; code_point <= 0x03FF; ++code_point)
  473. EXPECT_EQ("Greek and Coptic"sv, Unicode::code_point_block_display_name(code_point).value());
  474. }
  475. TEST_CASE(script_extension)
  476. {
  477. auto script = [](StringView name) {
  478. auto script = Unicode::script_from_string(name);
  479. VERIFY(script.has_value());
  480. return *script;
  481. };
  482. auto script_latin = script("Latin"sv);
  483. auto script_greek = script("Greek"sv);
  484. for (u32 code_point = 0x363; code_point <= 0x36f; ++code_point) {
  485. EXPECT(!Unicode::code_point_has_script(code_point, script_latin));
  486. EXPECT(Unicode::code_point_has_script_extension(code_point, script_latin));
  487. }
  488. EXPECT(!Unicode::code_point_has_script(0x342, script_greek));
  489. EXPECT(Unicode::code_point_has_script_extension(0x342, script_greek));
  490. EXPECT(!Unicode::code_point_has_script(0x345, script_greek));
  491. EXPECT(Unicode::code_point_has_script_extension(0x345, script_greek));
  492. EXPECT(!Unicode::code_point_has_script(0x1dc0, script_greek));
  493. EXPECT(Unicode::code_point_has_script_extension(0x1dc0, script_greek));
  494. EXPECT(!Unicode::code_point_has_script(0x1dc1, script_greek));
  495. EXPECT(Unicode::code_point_has_script_extension(0x1dc1, script_greek));
  496. auto script_common = script("Common"sv);
  497. auto script_zyyy = script("Zyyy"sv);
  498. EXPECT_EQ(script_common, script_zyyy);
  499. EXPECT(Unicode::code_point_has_script(0x202f, script_common));
  500. EXPECT(!Unicode::code_point_has_script_extension(0x202f, script_common));
  501. EXPECT(Unicode::code_point_has_script(0x3000, script_common));
  502. EXPECT(Unicode::code_point_has_script_extension(0x3000, script_common));
  503. auto script_inherited = script("Inherited"sv);
  504. auto script_qaai = script("Qaai"sv);
  505. auto script_zinh = script("Zinh"sv);
  506. EXPECT_EQ(script_inherited, script_qaai);
  507. EXPECT_EQ(script_inherited, script_zinh);
  508. EXPECT(Unicode::code_point_has_script(0x1ced, script_inherited));
  509. EXPECT(!Unicode::code_point_has_script_extension(0x1ced, script_inherited));
  510. EXPECT(Unicode::code_point_has_script(0x101fd, script_inherited));
  511. EXPECT(Unicode::code_point_has_script_extension(0x101fd, script_inherited));
  512. }
  513. TEST_CASE(code_point_display_name)
  514. {
  515. auto code_point_display_name = [](u32 code_point) {
  516. auto name = Unicode::code_point_display_name(code_point);
  517. VERIFY(name.has_value());
  518. return name.release_value();
  519. };
  520. // Control code points.
  521. EXPECT_EQ(code_point_display_name(0), "NULL"sv);
  522. EXPECT_EQ(code_point_display_name(1), "START OF HEADING"sv);
  523. EXPECT_EQ(code_point_display_name(0xa), "LINE FEED"sv);
  524. // Ideographic code points (which already appeared in a range in UnicodeData.txt).
  525. EXPECT_EQ(code_point_display_name(0x3400), "CJK UNIFIED IDEOGRAPH-3400"sv);
  526. EXPECT_EQ(code_point_display_name(0x3401), "CJK UNIFIED IDEOGRAPH-3401"sv);
  527. EXPECT_EQ(code_point_display_name(0x3402), "CJK UNIFIED IDEOGRAPH-3402"sv);
  528. EXPECT_EQ(code_point_display_name(0x4dbf), "CJK UNIFIED IDEOGRAPH-4DBF"sv);
  529. EXPECT_EQ(code_point_display_name(0x20000), "CJK UNIFIED IDEOGRAPH-20000"sv);
  530. EXPECT_EQ(code_point_display_name(0x20001), "CJK UNIFIED IDEOGRAPH-20001"sv);
  531. EXPECT_EQ(code_point_display_name(0x20002), "CJK UNIFIED IDEOGRAPH-20002"sv);
  532. EXPECT(!Unicode::code_point_display_name(0x2a6df).has_value());
  533. // Ideographic code points (which appeared individually in UnicodeData.txt and were coalesced into a range).
  534. EXPECT_EQ(code_point_display_name(0x2f800), "CJK COMPATIBILITY IDEOGRAPH-2F800"sv);
  535. EXPECT_EQ(code_point_display_name(0x2f801), "CJK COMPATIBILITY IDEOGRAPH-2F801"sv);
  536. EXPECT_EQ(code_point_display_name(0x2f802), "CJK COMPATIBILITY IDEOGRAPH-2F802"sv);
  537. EXPECT_EQ(code_point_display_name(0x2fa1d), "CJK COMPATIBILITY IDEOGRAPH-2FA1D"sv);
  538. }