TestUnicodeCharacterTypes.cpp 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407
  /*
   * Copyright (c) 2021, Tim Flynn <trflynn89@serenityos.org>
   *
   * SPDX-License-Identifier: BSD-2-Clause
   */
  6. #include <LibTest/TestCase.h>
  7. #include <AK/StringView.h>
  8. #include <LibUnicode/CharacterTypes.h>
  9. #include <ctype.h>
  10. static void compare_to_ascii(auto& old_function, auto& new_function)
  11. {
  12. i64 result1 = 0;
  13. i64 result2 = 0;
  14. for (u32 i = 0; i < 0x80; ++i) {
  15. EXPECT_EQ(result1 = old_function(i), result2 = new_function(i));
  16. if (result1 != result2)
  17. dbgln("Function input value was {}.", i);
  18. }
  19. }
  20. TEST_CASE(to_unicode_lowercase)
  21. {
  22. compare_to_ascii(tolower, Unicode::to_unicode_lowercase);
  23. EXPECT_EQ(Unicode::to_unicode_lowercase(0x03c9u), 0x03c9u); // "ω" to "ω"
  24. EXPECT_EQ(Unicode::to_unicode_lowercase(0x03a9u), 0x03c9u); // "Ω" to "ω"
  25. // Code points encoded by ranges in UnicodeData.txt
  26. EXPECT_EQ(Unicode::to_unicode_lowercase(0x3400u), 0x3400u);
  27. EXPECT_EQ(Unicode::to_unicode_lowercase(0x3401u), 0x3401u);
  28. EXPECT_EQ(Unicode::to_unicode_lowercase(0x3402u), 0x3402u);
  29. EXPECT_EQ(Unicode::to_unicode_lowercase(0x4dbfu), 0x4dbfu);
  30. }
  31. TEST_CASE(to_unicode_uppercase)
  32. {
  33. compare_to_ascii(toupper, Unicode::to_unicode_uppercase);
  34. EXPECT_EQ(Unicode::to_unicode_uppercase(0x03c9u), 0x03a9u); // "ω" to "Ω"
  35. EXPECT_EQ(Unicode::to_unicode_uppercase(0x03a9u), 0x03a9u); // "Ω" to "Ω"
  36. // Code points encoded by ranges in UnicodeData.txt
  37. EXPECT_EQ(Unicode::to_unicode_uppercase(0x3400u), 0x3400u);
  38. EXPECT_EQ(Unicode::to_unicode_uppercase(0x3401u), 0x3401u);
  39. EXPECT_EQ(Unicode::to_unicode_uppercase(0x3402u), 0x3402u);
  40. EXPECT_EQ(Unicode::to_unicode_uppercase(0x4dbfu), 0x4dbfu);
  41. // Code points whose uppercase and titlecase mappings actually differ.
  42. EXPECT_EQ(Unicode::to_unicode_uppercase(0x01c6u), 0x01c4u); // "dž" to "DŽ"
  43. EXPECT_EQ(Unicode::to_unicode_uppercase(0x01c9u), 0x01c7u); // "lj" to "LJ"
  44. EXPECT_EQ(Unicode::to_unicode_uppercase(0x01ccu), 0x01cau); // "nj" to "NJ"
  45. EXPECT_EQ(Unicode::to_unicode_uppercase(0x01f3u), 0x01f1u); // "dz" to "DZ"
  46. }
  47. TEST_CASE(to_unicode_titlecase)
  48. {
  49. compare_to_ascii(toupper, Unicode::to_unicode_titlecase);
  50. EXPECT_EQ(Unicode::to_unicode_titlecase(0x03c9u), 0x03a9u); // "ω" to "Ω"
  51. EXPECT_EQ(Unicode::to_unicode_titlecase(0x03a9u), 0x03a9u); // "Ω" to "Ω"
  52. // Code points encoded by ranges in UnicodeData.txt
  53. EXPECT_EQ(Unicode::to_unicode_titlecase(0x3400u), 0x3400u);
  54. EXPECT_EQ(Unicode::to_unicode_titlecase(0x3401u), 0x3401u);
  55. EXPECT_EQ(Unicode::to_unicode_titlecase(0x3402u), 0x3402u);
  56. EXPECT_EQ(Unicode::to_unicode_titlecase(0x4dbfu), 0x4dbfu);
  57. // Code points whose uppercase and titlecase mappings actually differ.
  58. EXPECT_EQ(Unicode::to_unicode_titlecase(0x01c6u), 0x01c5u); // "dž" to "Dž"
  59. EXPECT_EQ(Unicode::to_unicode_titlecase(0x01c9u), 0x01c8u); // "lj" to "Lj"
  60. EXPECT_EQ(Unicode::to_unicode_titlecase(0x01ccu), 0x01cbu); // "nj" to "Nj"
  61. EXPECT_EQ(Unicode::to_unicode_titlecase(0x01f3u), 0x01f2u); // "dz" to "Dz"
  62. }
  63. BENCHMARK_CASE(casing)
  64. {
  65. for (size_t i = 0; i < 50'000; ++i) {
  66. __test_to_unicode_lowercase();
  67. __test_to_unicode_uppercase();
  68. __test_to_unicode_titlecase();
  69. }
  70. }
  71. TEST_CASE(general_category)
  72. {
  73. auto general_category = [](StringView name) {
  74. auto general_category = Unicode::general_category_from_string(name);
  75. VERIFY(general_category.has_value());
  76. return *general_category;
  77. };
  78. auto general_category_c = general_category("C"sv);
  79. auto general_category_other = general_category("Other"sv);
  80. EXPECT_EQ(general_category_c, general_category_other);
  81. auto general_category_cc = general_category("Cc"sv);
  82. auto general_category_control = general_category("Control"sv);
  83. EXPECT_EQ(general_category_cc, general_category_control);
  84. auto general_category_co = general_category("Co"sv);
  85. auto general_category_private_use = general_category("Private_Use"sv);
  86. EXPECT_EQ(general_category_co, general_category_private_use);
  87. auto general_category_cn = general_category("Cn"sv);
  88. auto general_category_unassigned = general_category("Unassigned"sv);
  89. EXPECT_EQ(general_category_cn, general_category_unassigned);
  90. auto general_category_lc = general_category("LC"sv);
  91. auto general_category_cased_letter = general_category("Cased_Letter"sv);
  92. EXPECT_EQ(general_category_lc, general_category_cased_letter);
  93. auto general_category_ll = general_category("Ll"sv);
  94. auto general_category_lowercase_letter = general_category("Lowercase_Letter"sv);
  95. EXPECT_EQ(general_category_ll, general_category_lowercase_letter);
  96. auto general_category_lu = general_category("Lu"sv);
  97. auto general_category_uppercase_letter = general_category("Uppercase_Letter"sv);
  98. EXPECT_EQ(general_category_lu, general_category_uppercase_letter);
  99. for (u32 code_point = 0; code_point <= 0x1f; ++code_point) {
  100. EXPECT(Unicode::code_point_has_general_category(code_point, general_category_c));
  101. EXPECT(Unicode::code_point_has_general_category(code_point, general_category_cc));
  102. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_co));
  103. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_cn));
  104. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_lc));
  105. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_ll));
  106. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_lu));
  107. }
  108. for (u32 code_point = 0xe000; code_point <= 0xe100; ++code_point) {
  109. EXPECT(Unicode::code_point_has_general_category(code_point, general_category_c));
  110. EXPECT(Unicode::code_point_has_general_category(code_point, general_category_co));
  111. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_cc));
  112. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_cn));
  113. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_lc));
  114. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_ll));
  115. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_lu));
  116. }
  117. for (u32 code_point = 0x101fe; code_point <= 0x1027f; ++code_point) {
  118. EXPECT(Unicode::code_point_has_general_category(code_point, general_category_c));
  119. EXPECT(Unicode::code_point_has_general_category(code_point, general_category_cn));
  120. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_cc));
  121. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_co));
  122. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_lc));
  123. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_ll));
  124. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_lu));
  125. }
  126. for (u32 code_point = 0x61; code_point <= 0x7a; ++code_point) {
  127. EXPECT(Unicode::code_point_has_general_category(code_point, general_category_lc));
  128. EXPECT(Unicode::code_point_has_general_category(code_point, general_category_ll));
  129. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_c));
  130. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_cc));
  131. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_co));
  132. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_cn));
  133. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_lu));
  134. }
  135. for (u32 code_point = 0x41; code_point <= 0x5a; ++code_point) {
  136. EXPECT(Unicode::code_point_has_general_category(code_point, general_category_lc));
  137. EXPECT(Unicode::code_point_has_general_category(code_point, general_category_lu));
  138. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_c));
  139. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_cc));
  140. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_co));
  141. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_cn));
  142. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_ll));
  143. }
  144. }
  145. BENCHMARK_CASE(general_category_performance)
  146. {
  147. auto general_category_cased_letter = Unicode::general_category_from_string("Cased_Letter"sv).value();
  148. for (size_t i = 0; i < 1'000'000; ++i) {
  149. for (u32 code_point = 0; code_point <= 0x1f; ++code_point)
  150. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_cased_letter));
  151. for (u32 code_point = 0x41; code_point <= 0x5a; ++code_point)
  152. EXPECT(Unicode::code_point_has_general_category(code_point, general_category_cased_letter));
  153. for (u32 code_point = 0x61; code_point <= 0x7a; ++code_point)
  154. EXPECT(Unicode::code_point_has_general_category(code_point, general_category_cased_letter));
  155. for (u32 code_point = 0xe000; code_point <= 0xe100; ++code_point)
  156. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_cased_letter));
  157. for (u32 code_point = 0x101fe; code_point <= 0x1027f; ++code_point)
  158. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_cased_letter));
  159. }
  160. }
  161. TEST_CASE(property)
  162. {
  163. auto property = [](StringView name) {
  164. auto property = Unicode::property_from_string(name);
  165. VERIFY(property.has_value());
  166. return *property;
  167. };
  168. auto property_any = property("Any"sv);
  169. auto property_assigned = property("Assigned"sv);
  170. auto property_ascii = property("ASCII"sv);
  171. auto property_white_space = property("White_Space"sv);
  172. auto property_wspace = property("WSpace"sv);
  173. auto property_space = property("space"sv);
  174. EXPECT_EQ(property_white_space, property_wspace);
  175. EXPECT_EQ(property_white_space, property_space);
  176. auto property_emoji_presentation = property("Emoji_Presentation"sv);
  177. auto property_epres = property("EPres"sv);
  178. EXPECT_EQ(property_emoji_presentation, property_epres);
  179. for (u32 code_point = 0; code_point <= 0x10ffff; code_point += 1000)
  180. EXPECT(Unicode::code_point_has_property(code_point, property_any));
  181. for (u32 code_point = 0x101d0; code_point <= 0x101fd; ++code_point) {
  182. EXPECT(Unicode::code_point_has_property(code_point, property_any));
  183. EXPECT(Unicode::code_point_has_property(code_point, property_assigned));
  184. EXPECT(!Unicode::code_point_has_property(code_point, property_ascii));
  185. EXPECT(!Unicode::code_point_has_property(code_point, property_white_space));
  186. EXPECT(!Unicode::code_point_has_property(code_point, property_emoji_presentation));
  187. }
  188. for (u32 code_point = 0x101fe; code_point <= 0x1027f; ++code_point) {
  189. EXPECT(Unicode::code_point_has_property(code_point, property_any));
  190. EXPECT(!Unicode::code_point_has_property(code_point, property_assigned));
  191. EXPECT(!Unicode::code_point_has_property(code_point, property_ascii));
  192. EXPECT(!Unicode::code_point_has_property(code_point, property_white_space));
  193. EXPECT(!Unicode::code_point_has_property(code_point, property_emoji_presentation));
  194. }
  195. for (u32 code_point = 0; code_point <= 0x7f; ++code_point) {
  196. EXPECT(Unicode::code_point_has_property(code_point, property_any));
  197. EXPECT(Unicode::code_point_has_property(code_point, property_assigned));
  198. EXPECT(Unicode::code_point_has_property(code_point, property_ascii));
  199. EXPECT(!Unicode::code_point_has_property(code_point, property_emoji_presentation));
  200. }
  201. for (u32 code_point = 0x9; code_point <= 0xd; ++code_point) {
  202. EXPECT(Unicode::code_point_has_property(code_point, property_any));
  203. EXPECT(Unicode::code_point_has_property(code_point, property_assigned));
  204. EXPECT(Unicode::code_point_has_property(code_point, property_ascii));
  205. EXPECT(Unicode::code_point_has_property(code_point, property_white_space));
  206. EXPECT(!Unicode::code_point_has_property(code_point, property_emoji_presentation));
  207. }
  208. for (u32 code_point = 0x1f3e5; code_point <= 0x1f3f0; ++code_point) {
  209. EXPECT(Unicode::code_point_has_property(code_point, property_any));
  210. EXPECT(Unicode::code_point_has_property(code_point, property_assigned));
  211. EXPECT(Unicode::code_point_has_property(code_point, property_emoji_presentation));
  212. EXPECT(!Unicode::code_point_has_property(code_point, property_ascii));
  213. EXPECT(!Unicode::code_point_has_property(code_point, property_white_space));
  214. }
  215. }
  216. TEST_CASE(script)
  217. {
  218. auto script = [](StringView name) {
  219. auto script = Unicode::script_from_string(name);
  220. VERIFY(script.has_value());
  221. return *script;
  222. };
  223. auto script_latin = script("Latin"sv);
  224. auto script_latn = script("Latn"sv);
  225. EXPECT_EQ(script_latin, script_latn);
  226. auto script_cyrillic = script("Cyrillic"sv);
  227. auto script_cyrl = script("Cyrl"sv);
  228. EXPECT_EQ(script_cyrillic, script_cyrl);
  229. auto script_greek = script("Greek"sv);
  230. auto script_grek = script("Grek"sv);
  231. EXPECT_EQ(script_greek, script_grek);
  232. for (u32 code_point = 0x41; code_point <= 0x5a; ++code_point) {
  233. EXPECT(Unicode::code_point_has_script(code_point, script_latin));
  234. EXPECT(Unicode::code_point_has_script_extension(code_point, script_latin));
  235. EXPECT(!Unicode::code_point_has_script(code_point, script_cyrillic));
  236. EXPECT(!Unicode::code_point_has_script(code_point, script_greek));
  237. }
  238. for (u32 code_point = 0x61; code_point <= 0x7a; ++code_point) {
  239. EXPECT(Unicode::code_point_has_script(code_point, script_latin));
  240. EXPECT(Unicode::code_point_has_script_extension(code_point, script_latin));
  241. EXPECT(!Unicode::code_point_has_script(code_point, script_cyrillic));
  242. EXPECT(!Unicode::code_point_has_script(code_point, script_greek));
  243. }
  244. for (u32 code_point = 0x400; code_point <= 0x481; ++code_point) {
  245. EXPECT(Unicode::code_point_has_script(code_point, script_cyrillic));
  246. EXPECT(Unicode::code_point_has_script_extension(code_point, script_cyrillic));
  247. EXPECT(!Unicode::code_point_has_script(code_point, script_latin));
  248. EXPECT(!Unicode::code_point_has_script(code_point, script_greek));
  249. }
  250. for (u32 code_point = 0x400; code_point <= 0x481; ++code_point) {
  251. EXPECT(Unicode::code_point_has_script(code_point, script_cyrillic));
  252. EXPECT(Unicode::code_point_has_script_extension(code_point, script_cyrillic));
  253. EXPECT(!Unicode::code_point_has_script(code_point, script_latin));
  254. EXPECT(!Unicode::code_point_has_script(code_point, script_greek));
  255. }
  256. for (u32 code_point = 0x1f80; code_point <= 0x1fb4; ++code_point) {
  257. EXPECT(Unicode::code_point_has_script(code_point, script_greek));
  258. EXPECT(Unicode::code_point_has_script_extension(code_point, script_greek));
  259. EXPECT(!Unicode::code_point_has_script(code_point, script_latin));
  260. EXPECT(!Unicode::code_point_has_script(code_point, script_cyrillic));
  261. }
  262. }
  263. TEST_CASE(script_extension)
  264. {
  265. auto script = [](StringView name) {
  266. auto script = Unicode::script_from_string(name);
  267. VERIFY(script.has_value());
  268. return *script;
  269. };
  270. auto script_latin = script("Latin"sv);
  271. auto script_greek = script("Greek"sv);
  272. for (u32 code_point = 0x363; code_point <= 0x36f; ++code_point) {
  273. EXPECT(!Unicode::code_point_has_script(code_point, script_latin));
  274. EXPECT(Unicode::code_point_has_script_extension(code_point, script_latin));
  275. }
  276. EXPECT(!Unicode::code_point_has_script(0x342, script_greek));
  277. EXPECT(Unicode::code_point_has_script_extension(0x342, script_greek));
  278. EXPECT(!Unicode::code_point_has_script(0x345, script_greek));
  279. EXPECT(Unicode::code_point_has_script_extension(0x345, script_greek));
  280. EXPECT(!Unicode::code_point_has_script(0x1dc0, script_greek));
  281. EXPECT(Unicode::code_point_has_script_extension(0x1dc0, script_greek));
  282. EXPECT(!Unicode::code_point_has_script(0x1dc1, script_greek));
  283. EXPECT(Unicode::code_point_has_script_extension(0x1dc1, script_greek));
  284. auto script_common = script("Common"sv);
  285. auto script_zyyy = script("Zyyy"sv);
  286. EXPECT_EQ(script_common, script_zyyy);
  287. EXPECT(Unicode::code_point_has_script(0x202f, script_common));
  288. EXPECT(!Unicode::code_point_has_script_extension(0x202f, script_common));
  289. EXPECT(Unicode::code_point_has_script(0x3000, script_common));
  290. EXPECT(Unicode::code_point_has_script_extension(0x3000, script_common));
  291. auto script_inherited = script("Inherited"sv);
  292. auto script_qaai = script("Qaai"sv);
  293. auto script_zinh = script("Zinh"sv);
  294. EXPECT_EQ(script_inherited, script_qaai);
  295. EXPECT_EQ(script_inherited, script_zinh);
  296. EXPECT(Unicode::code_point_has_script(0x1ced, script_inherited));
  297. EXPECT(!Unicode::code_point_has_script_extension(0x1ced, script_inherited));
  298. EXPECT(Unicode::code_point_has_script(0x101fd, script_inherited));
  299. EXPECT(Unicode::code_point_has_script_extension(0x101fd, script_inherited));
  300. }
  301. TEST_CASE(code_point_bidirectional_character_type)
  302. {
  303. auto code_point_bidi_class = [](u32 code_point) {
  304. auto bidi_class = Unicode::bidirectional_class(code_point);
  305. VERIFY(bidi_class.has_value());
  306. return bidi_class.release_value();
  307. };
  308. auto bidi_class_from_string = [](StringView name) {
  309. auto result = Unicode::bidirectional_class_from_string(name);
  310. VERIFY(result.has_value());
  311. return result.release_value();
  312. };
  313. // Left-to-right
  314. EXPECT_EQ(code_point_bidi_class('A'), bidi_class_from_string("L"sv));
  315. EXPECT_EQ(code_point_bidi_class('z'), bidi_class_from_string("L"sv));
  316. // European number
  317. EXPECT_EQ(code_point_bidi_class('7'), bidi_class_from_string("EN"sv));
  318. // Whitespace
  319. EXPECT_EQ(code_point_bidi_class(' '), bidi_class_from_string("WS"sv));
  320. // Arabic right-to-left (U+FEB4 ARABIC LETTER SEEN MEDIAL FORM)
  321. EXPECT_EQ(code_point_bidi_class(0xFEB4), bidi_class_from_string("AL"sv));
  322. }