TestUnicodeCharacterTypes.cpp 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319
  1. /*
  2. * Copyright (c) 2021-2024, Tim Flynn <trflynn89@serenityos.org>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <LibTest/TestCase.h>
  7. #include <AK/StringView.h>
  8. #include <LibUnicode/CharacterTypes.h>
  9. TEST_CASE(general_category)
  10. {
  11. auto general_category = [](StringView name) {
  12. auto general_category = Unicode::general_category_from_string(name);
  13. VERIFY(general_category.has_value());
  14. return *general_category;
  15. };
  16. auto general_category_c = general_category("C"sv);
  17. auto general_category_other = general_category("Other"sv);
  18. EXPECT_EQ(general_category_c, general_category_other);
  19. auto general_category_cc = general_category("Cc"sv);
  20. auto general_category_control = general_category("Control"sv);
  21. EXPECT_EQ(general_category_cc, general_category_control);
  22. auto general_category_co = general_category("Co"sv);
  23. auto general_category_private_use = general_category("Private_Use"sv);
  24. EXPECT_EQ(general_category_co, general_category_private_use);
  25. auto general_category_cn = general_category("Cn"sv);
  26. auto general_category_unassigned = general_category("Unassigned"sv);
  27. EXPECT_EQ(general_category_cn, general_category_unassigned);
  28. auto general_category_lc = general_category("LC"sv);
  29. auto general_category_cased_letter = general_category("Cased_Letter"sv);
  30. EXPECT_EQ(general_category_lc, general_category_cased_letter);
  31. auto general_category_ll = general_category("Ll"sv);
  32. auto general_category_lowercase_letter = general_category("Lowercase_Letter"sv);
  33. EXPECT_EQ(general_category_ll, general_category_lowercase_letter);
  34. auto general_category_lu = general_category("Lu"sv);
  35. auto general_category_uppercase_letter = general_category("Uppercase_Letter"sv);
  36. EXPECT_EQ(general_category_lu, general_category_uppercase_letter);
  37. for (u32 code_point = 0; code_point <= 0x1f; ++code_point) {
  38. EXPECT(Unicode::code_point_has_general_category(code_point, general_category_c));
  39. EXPECT(Unicode::code_point_has_general_category(code_point, general_category_cc));
  40. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_co));
  41. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_cn));
  42. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_lc));
  43. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_ll));
  44. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_lu));
  45. }
  46. for (u32 code_point = 0xe000; code_point <= 0xe100; ++code_point) {
  47. EXPECT(Unicode::code_point_has_general_category(code_point, general_category_c));
  48. EXPECT(Unicode::code_point_has_general_category(code_point, general_category_co));
  49. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_cc));
  50. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_cn));
  51. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_lc));
  52. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_ll));
  53. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_lu));
  54. }
  55. for (u32 code_point = 0x101fe; code_point <= 0x1027f; ++code_point) {
  56. EXPECT(Unicode::code_point_has_general_category(code_point, general_category_c));
  57. EXPECT(Unicode::code_point_has_general_category(code_point, general_category_cn));
  58. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_cc));
  59. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_co));
  60. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_lc));
  61. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_ll));
  62. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_lu));
  63. }
  64. for (u32 code_point = 0x61; code_point <= 0x7a; ++code_point) {
  65. EXPECT(Unicode::code_point_has_general_category(code_point, general_category_lc));
  66. EXPECT(Unicode::code_point_has_general_category(code_point, general_category_ll));
  67. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_c));
  68. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_cc));
  69. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_co));
  70. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_cn));
  71. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_lu));
  72. }
  73. for (u32 code_point = 0x41; code_point <= 0x5a; ++code_point) {
  74. EXPECT(Unicode::code_point_has_general_category(code_point, general_category_lc));
  75. EXPECT(Unicode::code_point_has_general_category(code_point, general_category_lu));
  76. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_c));
  77. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_cc));
  78. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_co));
  79. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_cn));
  80. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_ll));
  81. }
  82. }
  83. BENCHMARK_CASE(general_category_performance)
  84. {
  85. auto general_category_cased_letter = Unicode::general_category_from_string("Cased_Letter"sv).value();
  86. for (size_t i = 0; i < 1'000'000; ++i) {
  87. for (u32 code_point = 0; code_point <= 0x1f; ++code_point)
  88. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_cased_letter));
  89. for (u32 code_point = 0x41; code_point <= 0x5a; ++code_point)
  90. EXPECT(Unicode::code_point_has_general_category(code_point, general_category_cased_letter));
  91. for (u32 code_point = 0x61; code_point <= 0x7a; ++code_point)
  92. EXPECT(Unicode::code_point_has_general_category(code_point, general_category_cased_letter));
  93. for (u32 code_point = 0xe000; code_point <= 0xe100; ++code_point)
  94. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_cased_letter));
  95. for (u32 code_point = 0x101fe; code_point <= 0x1027f; ++code_point)
  96. EXPECT(!Unicode::code_point_has_general_category(code_point, general_category_cased_letter));
  97. }
  98. }
  99. TEST_CASE(property)
  100. {
  101. auto property = [](StringView name) {
  102. auto property = Unicode::property_from_string(name);
  103. VERIFY(property.has_value());
  104. return *property;
  105. };
  106. auto property_any = property("Any"sv);
  107. auto property_assigned = property("Assigned"sv);
  108. auto property_ascii = property("ASCII"sv);
  109. auto property_white_space = property("White_Space"sv);
  110. auto property_wspace = property("WSpace"sv);
  111. auto property_space = property("space"sv);
  112. EXPECT_EQ(property_white_space, property_wspace);
  113. EXPECT_EQ(property_white_space, property_space);
  114. auto property_emoji_presentation = property("Emoji_Presentation"sv);
  115. auto property_epres = property("EPres"sv);
  116. EXPECT_EQ(property_emoji_presentation, property_epres);
  117. for (u32 code_point = 0; code_point <= 0x10ffff; code_point += 1000)
  118. EXPECT(Unicode::code_point_has_property(code_point, property_any));
  119. for (u32 code_point = 0x101d0; code_point <= 0x101fd; ++code_point) {
  120. EXPECT(Unicode::code_point_has_property(code_point, property_any));
  121. EXPECT(Unicode::code_point_has_property(code_point, property_assigned));
  122. EXPECT(!Unicode::code_point_has_property(code_point, property_ascii));
  123. EXPECT(!Unicode::code_point_has_property(code_point, property_white_space));
  124. EXPECT(!Unicode::code_point_has_property(code_point, property_emoji_presentation));
  125. }
  126. for (u32 code_point = 0x101fe; code_point <= 0x1027f; ++code_point) {
  127. EXPECT(Unicode::code_point_has_property(code_point, property_any));
  128. EXPECT(!Unicode::code_point_has_property(code_point, property_assigned));
  129. EXPECT(!Unicode::code_point_has_property(code_point, property_ascii));
  130. EXPECT(!Unicode::code_point_has_property(code_point, property_white_space));
  131. EXPECT(!Unicode::code_point_has_property(code_point, property_emoji_presentation));
  132. }
  133. for (u32 code_point = 0; code_point <= 0x7f; ++code_point) {
  134. EXPECT(Unicode::code_point_has_property(code_point, property_any));
  135. EXPECT(Unicode::code_point_has_property(code_point, property_assigned));
  136. EXPECT(Unicode::code_point_has_property(code_point, property_ascii));
  137. EXPECT(!Unicode::code_point_has_property(code_point, property_emoji_presentation));
  138. }
  139. for (u32 code_point = 0x9; code_point <= 0xd; ++code_point) {
  140. EXPECT(Unicode::code_point_has_property(code_point, property_any));
  141. EXPECT(Unicode::code_point_has_property(code_point, property_assigned));
  142. EXPECT(Unicode::code_point_has_property(code_point, property_ascii));
  143. EXPECT(Unicode::code_point_has_property(code_point, property_white_space));
  144. EXPECT(!Unicode::code_point_has_property(code_point, property_emoji_presentation));
  145. }
  146. for (u32 code_point = 0x1f3e5; code_point <= 0x1f3f0; ++code_point) {
  147. EXPECT(Unicode::code_point_has_property(code_point, property_any));
  148. EXPECT(Unicode::code_point_has_property(code_point, property_assigned));
  149. EXPECT(Unicode::code_point_has_property(code_point, property_emoji_presentation));
  150. EXPECT(!Unicode::code_point_has_property(code_point, property_ascii));
  151. EXPECT(!Unicode::code_point_has_property(code_point, property_white_space));
  152. }
  153. }
  154. TEST_CASE(script)
  155. {
  156. auto script = [](StringView name) {
  157. auto script = Unicode::script_from_string(name);
  158. VERIFY(script.has_value());
  159. return *script;
  160. };
  161. auto script_latin = script("Latin"sv);
  162. auto script_latn = script("Latn"sv);
  163. EXPECT_EQ(script_latin, script_latn);
  164. auto script_cyrillic = script("Cyrillic"sv);
  165. auto script_cyrl = script("Cyrl"sv);
  166. EXPECT_EQ(script_cyrillic, script_cyrl);
  167. auto script_greek = script("Greek"sv);
  168. auto script_grek = script("Grek"sv);
  169. EXPECT_EQ(script_greek, script_grek);
  170. for (u32 code_point = 0x41; code_point <= 0x5a; ++code_point) {
  171. EXPECT(Unicode::code_point_has_script(code_point, script_latin));
  172. EXPECT(Unicode::code_point_has_script_extension(code_point, script_latin));
  173. EXPECT(!Unicode::code_point_has_script(code_point, script_cyrillic));
  174. EXPECT(!Unicode::code_point_has_script(code_point, script_greek));
  175. }
  176. for (u32 code_point = 0x61; code_point <= 0x7a; ++code_point) {
  177. EXPECT(Unicode::code_point_has_script(code_point, script_latin));
  178. EXPECT(Unicode::code_point_has_script_extension(code_point, script_latin));
  179. EXPECT(!Unicode::code_point_has_script(code_point, script_cyrillic));
  180. EXPECT(!Unicode::code_point_has_script(code_point, script_greek));
  181. }
  182. for (u32 code_point = 0x400; code_point <= 0x481; ++code_point) {
  183. EXPECT(Unicode::code_point_has_script(code_point, script_cyrillic));
  184. EXPECT(Unicode::code_point_has_script_extension(code_point, script_cyrillic));
  185. EXPECT(!Unicode::code_point_has_script(code_point, script_latin));
  186. EXPECT(!Unicode::code_point_has_script(code_point, script_greek));
  187. }
  188. for (u32 code_point = 0x400; code_point <= 0x481; ++code_point) {
  189. EXPECT(Unicode::code_point_has_script(code_point, script_cyrillic));
  190. EXPECT(Unicode::code_point_has_script_extension(code_point, script_cyrillic));
  191. EXPECT(!Unicode::code_point_has_script(code_point, script_latin));
  192. EXPECT(!Unicode::code_point_has_script(code_point, script_greek));
  193. }
  194. for (u32 code_point = 0x1f80; code_point <= 0x1fb4; ++code_point) {
  195. EXPECT(Unicode::code_point_has_script(code_point, script_greek));
  196. EXPECT(Unicode::code_point_has_script_extension(code_point, script_greek));
  197. EXPECT(!Unicode::code_point_has_script(code_point, script_latin));
  198. EXPECT(!Unicode::code_point_has_script(code_point, script_cyrillic));
  199. }
  200. }
  201. TEST_CASE(script_extension)
  202. {
  203. auto script = [](StringView name) {
  204. auto script = Unicode::script_from_string(name);
  205. VERIFY(script.has_value());
  206. return *script;
  207. };
  208. auto script_latin = script("Latin"sv);
  209. auto script_greek = script("Greek"sv);
  210. for (u32 code_point = 0x363; code_point <= 0x36f; ++code_point) {
  211. EXPECT(!Unicode::code_point_has_script(code_point, script_latin));
  212. EXPECT(Unicode::code_point_has_script_extension(code_point, script_latin));
  213. }
  214. EXPECT(!Unicode::code_point_has_script(0x342, script_greek));
  215. EXPECT(Unicode::code_point_has_script_extension(0x342, script_greek));
  216. EXPECT(!Unicode::code_point_has_script(0x345, script_greek));
  217. EXPECT(Unicode::code_point_has_script_extension(0x345, script_greek));
  218. EXPECT(!Unicode::code_point_has_script(0x1dc0, script_greek));
  219. EXPECT(Unicode::code_point_has_script_extension(0x1dc0, script_greek));
  220. EXPECT(!Unicode::code_point_has_script(0x1dc1, script_greek));
  221. EXPECT(Unicode::code_point_has_script_extension(0x1dc1, script_greek));
  222. auto script_common = script("Common"sv);
  223. auto script_zyyy = script("Zyyy"sv);
  224. EXPECT_EQ(script_common, script_zyyy);
  225. EXPECT(Unicode::code_point_has_script(0x202f, script_common));
  226. EXPECT(!Unicode::code_point_has_script_extension(0x202f, script_common));
  227. EXPECT(Unicode::code_point_has_script(0x3000, script_common));
  228. EXPECT(Unicode::code_point_has_script_extension(0x3000, script_common));
  229. auto script_inherited = script("Inherited"sv);
  230. auto script_qaai = script("Qaai"sv);
  231. auto script_zinh = script("Zinh"sv);
  232. EXPECT_EQ(script_inherited, script_qaai);
  233. EXPECT_EQ(script_inherited, script_zinh);
  234. EXPECT(Unicode::code_point_has_script(0x1ced, script_inherited));
  235. EXPECT(!Unicode::code_point_has_script_extension(0x1ced, script_inherited));
  236. EXPECT(Unicode::code_point_has_script(0x101fd, script_inherited));
  237. EXPECT(Unicode::code_point_has_script_extension(0x101fd, script_inherited));
  238. }
  239. TEST_CASE(code_point_bidirectional_character_type)
  240. {
  241. // Left-to-right
  242. EXPECT_EQ(Unicode::bidirectional_class('A'), Unicode::BidiClass::LeftToRight);
  243. EXPECT_EQ(Unicode::bidirectional_class('z'), Unicode::BidiClass::LeftToRight);
  244. // European number
  245. EXPECT_EQ(Unicode::bidirectional_class('7'), Unicode::BidiClass::EuropeanNumber);
  246. // Whitespace
  247. EXPECT_EQ(Unicode::bidirectional_class(' '), Unicode::BidiClass::WhiteSpaceNeutral);
  248. // Arabic right-to-left (U+FEB4 ARABIC LETTER SEEN MEDIAL FORM)
  249. EXPECT_EQ(Unicode::bidirectional_class(0xFEB4), Unicode::BidiClass::RightToLeftArabic);
  250. }