GenerateUnicodeLocale.cpp 63 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605
  1. /*
  2. * Copyright (c) 2021, Tim Flynn <trflynn89@pm.me>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include "GeneratorUtil.h"
  7. #include <AK/AllOf.h>
  8. #include <AK/CharacterTypes.h>
  9. #include <AK/Format.h>
  10. #include <AK/HashMap.h>
  11. #include <AK/JsonObject.h>
  12. #include <AK/JsonParser.h>
  13. #include <AK/JsonValue.h>
  14. #include <AK/LexicalPath.h>
  15. #include <AK/QuickSort.h>
  16. #include <AK/SourceGenerator.h>
  17. #include <AK/String.h>
  18. #include <AK/StringBuilder.h>
  19. #include <LibCore/ArgsParser.h>
  20. #include <LibCore/DirIterator.h>
  21. #include <LibCore/File.h>
  22. #include <LibUnicode/Locale.h>
  23. #include <math.h>
  24. using StringIndexType = u16;
  25. constexpr auto s_string_index_type = "u16"sv;
  26. struct NumberFormat : public Unicode::NumberFormat {
  27. using Base = Unicode::NumberFormat;
  28. static Base::Plurality plurality_from_string(StringView plurality)
  29. {
  30. if (plurality == "other"sv)
  31. return Base::Plurality::Other;
  32. if (plurality == "1"sv)
  33. return Base::Plurality::Single;
  34. if (plurality == "zero"sv)
  35. return Base::Plurality::Zero;
  36. if (plurality == "one"sv)
  37. return Base::Plurality::One;
  38. if (plurality == "two"sv)
  39. return Base::Plurality::Two;
  40. if (plurality == "few"sv)
  41. return Base::Plurality::Few;
  42. if (plurality == "many"sv)
  43. return Base::Plurality::Many;
  44. VERIFY_NOT_REACHED();
  45. }
  46. StringIndexType format_index { 0 };
  47. };
  48. struct NumberSystem {
  49. StringIndexType system { 0 };
  50. HashMap<String, StringIndexType> symbols {};
  51. Vector<NumberFormat> decimal_long_formats {};
  52. Vector<NumberFormat> decimal_short_formats {};
  53. Vector<NumberFormat> currency_short_formats {};
  54. NumberFormat percent_format {};
  55. };
  56. struct ListPatterns {
  57. String type;
  58. String style;
  59. StringIndexType start { 0 };
  60. StringIndexType middle { 0 };
  61. StringIndexType end { 0 };
  62. StringIndexType pair { 0 };
  63. };
  64. struct Locale {
  65. String language;
  66. Optional<String> territory;
  67. Optional<String> variant;
  68. HashMap<String, StringIndexType> languages;
  69. HashMap<String, StringIndexType> territories;
  70. HashMap<String, StringIndexType> scripts;
  71. HashMap<String, StringIndexType> currencies;
  72. HashMap<String, StringIndexType> keywords;
  73. HashMap<String, NumberSystem> number_systems;
  74. Vector<ListPatterns> list_patterns;
  75. };
  76. struct CanonicalLanguageID {
  77. StringIndexType language { 0 };
  78. StringIndexType script { 0 };
  79. StringIndexType region { 0 };
  80. Vector<StringIndexType> variants {};
  81. };
  82. struct LanguageMapping {
  83. CanonicalLanguageID key {};
  84. CanonicalLanguageID alias {};
  85. };
  86. struct UnicodeLocaleData {
  87. Vector<String> unique_strings;
  88. HashMap<StringView, StringIndexType> unique_string_indices;
  89. HashMap<String, Locale> locales;
  90. Vector<String> languages;
  91. Vector<String> territories;
  92. Vector<String> scripts;
  93. Vector<String> variants;
  94. Vector<String> currencies;
  95. Vector<String> keywords;
  96. Vector<String> numeric_symbols;
  97. Vector<String> list_pattern_types;
  98. Vector<String> list_pattern_styles;
  99. HashMap<String, StringIndexType> language_aliases;
  100. HashMap<String, StringIndexType> territory_aliases;
  101. HashMap<String, StringIndexType> script_aliases;
  102. HashMap<String, StringIndexType> variant_aliases;
  103. HashMap<String, StringIndexType> subdivision_aliases;
  104. Vector<LanguageMapping> complex_mappings;
  105. Vector<LanguageMapping> likely_subtags;
  106. size_t max_variant_size { 0 };
  107. };
  108. static StringIndexType ensure_unique_string(UnicodeLocaleData& locale_data, String string)
  109. {
  110. // We maintain a set of unique strings in two structures: a vector which owns the unique string,
  111. // and a hash map which maps that string to its index in the vector. The vector is to ensure the
  112. // strings are generated in an easily known order, and the map is to allow quickly deciding if a
  113. // string is actually unique (otherwise, we'd have to linear-search the vector for each string).
  114. //
  115. // Also note that index 0 will be reserved for the empty string, so the index returned from this
  116. // method is actually the real index in the vector + 1.
  117. if (auto index = locale_data.unique_string_indices.get(string); index.has_value())
  118. return *index;
  119. locale_data.unique_strings.append(move(string));
  120. size_t index = locale_data.unique_strings.size();
  121. // There are currently on the order of 46K unique strings in UnicodeLocale.cpp.
  122. // If that number reaches 2^16, bump the StringIndexType alias to a u32.
  123. VERIFY(index < NumericLimits<StringIndexType>::max());
  124. auto string_index = static_cast<StringIndexType>(index);
  125. locale_data.unique_string_indices.set(locale_data.unique_strings.last(), string_index);
  126. return string_index;
  127. }
  128. static StringView get_unique_string(UnicodeLocaleData& locale_data, StringIndexType index)
  129. {
  130. if (index == 0)
  131. return {};
  132. VERIFY(index <= locale_data.unique_strings.size());
  133. return locale_data.unique_strings.at(index - 1);
  134. }
  135. static Optional<CanonicalLanguageID> parse_language(UnicodeLocaleData& locale_data, StringView language)
  136. {
  137. CanonicalLanguageID language_id {};
  138. auto segments = language.split_view('-');
  139. VERIFY(!segments.is_empty());
  140. size_t index = 0;
  141. if (Unicode::is_unicode_language_subtag(segments[index])) {
  142. language_id.language = ensure_unique_string(locale_data, segments[index]);
  143. if (segments.size() == ++index)
  144. return language_id;
  145. } else {
  146. return {};
  147. }
  148. if (Unicode::is_unicode_script_subtag(segments[index])) {
  149. language_id.script = ensure_unique_string(locale_data, segments[index]);
  150. if (segments.size() == ++index)
  151. return language_id;
  152. }
  153. if (Unicode::is_unicode_region_subtag(segments[index])) {
  154. language_id.region = ensure_unique_string(locale_data, segments[index]);
  155. if (segments.size() == ++index)
  156. return language_id;
  157. }
  158. while (index < segments.size()) {
  159. if (!Unicode::is_unicode_variant_subtag(segments[index]))
  160. return {};
  161. language_id.variants.append(ensure_unique_string(locale_data, segments[index++]));
  162. }
  163. return language_id;
  164. }
  165. static Optional<LanguageMapping> parse_language_mapping(UnicodeLocaleData& locale_data, StringView key, StringView alias)
  166. {
  167. auto parsed_key = parse_language(locale_data, key);
  168. if (!parsed_key.has_value())
  169. return {};
  170. auto parsed_alias = parse_language(locale_data, alias);
  171. if (!parsed_alias.has_value())
  172. return {};
  173. return LanguageMapping { parsed_key.release_value(), parsed_alias.release_value() };
  174. }
  175. static void parse_core_aliases(String core_supplemental_path, UnicodeLocaleData& locale_data)
  176. {
  177. LexicalPath core_aliases_path(move(core_supplemental_path));
  178. core_aliases_path = core_aliases_path.append("aliases.json"sv);
  179. VERIFY(Core::File::exists(core_aliases_path.string()));
  180. auto core_aliases_file_or_error = Core::File::open(core_aliases_path.string(), Core::OpenMode::ReadOnly);
  181. VERIFY(!core_aliases_file_or_error.is_error());
  182. auto core_aliases = JsonParser(core_aliases_file_or_error.value()->read_all()).parse();
  183. VERIFY(core_aliases.has_value());
  184. auto const& supplemental_object = core_aliases->as_object().get("supplemental"sv);
  185. auto const& metadata_object = supplemental_object.as_object().get("metadata"sv);
  186. auto const& alias_object = metadata_object.as_object().get("alias"sv);
  187. auto append_aliases = [&](auto& alias_object, auto& alias_map) {
  188. alias_object.as_object().for_each_member([&](auto const& key, JsonValue const& value) {
  189. auto alias = value.as_object().get("_replacement"sv).as_string();
  190. if (key.contains('-')) {
  191. auto mapping = parse_language_mapping(locale_data, key, alias);
  192. if (!mapping.has_value())
  193. return;
  194. locale_data.max_variant_size = max(mapping->key.variants.size(), locale_data.max_variant_size);
  195. locale_data.max_variant_size = max(mapping->alias.variants.size(), locale_data.max_variant_size);
  196. locale_data.complex_mappings.append(mapping.release_value());
  197. } else {
  198. alias_map.set(key, ensure_unique_string(locale_data, alias));
  199. }
  200. });
  201. };
  202. append_aliases(alias_object.as_object().get("languageAlias"sv), locale_data.language_aliases);
  203. append_aliases(alias_object.as_object().get("territoryAlias"sv), locale_data.territory_aliases);
  204. append_aliases(alias_object.as_object().get("scriptAlias"sv), locale_data.script_aliases);
  205. append_aliases(alias_object.as_object().get("variantAlias"sv), locale_data.variant_aliases);
  206. append_aliases(alias_object.as_object().get("subdivisionAlias"sv), locale_data.subdivision_aliases);
  207. }
  208. static void parse_likely_subtags(String core_supplemental_path, UnicodeLocaleData& locale_data)
  209. {
  210. LexicalPath likely_subtags_path(move(core_supplemental_path));
  211. likely_subtags_path = likely_subtags_path.append("likelySubtags.json"sv);
  212. VERIFY(Core::File::exists(likely_subtags_path.string()));
  213. auto likely_subtags_file_or_error = Core::File::open(likely_subtags_path.string(), Core::OpenMode::ReadOnly);
  214. VERIFY(!likely_subtags_file_or_error.is_error());
  215. auto likely_subtags = JsonParser(likely_subtags_file_or_error.value()->read_all()).parse();
  216. VERIFY(likely_subtags.has_value());
  217. auto const& supplemental_object = likely_subtags->as_object().get("supplemental"sv);
  218. auto const& likely_subtags_object = supplemental_object.as_object().get("likelySubtags"sv);
  219. likely_subtags_object.as_object().for_each_member([&](auto const& key, JsonValue const& value) {
  220. auto mapping = parse_language_mapping(locale_data, key, value.as_string());
  221. if (!mapping.has_value())
  222. return;
  223. locale_data.max_variant_size = max(mapping->key.variants.size(), locale_data.max_variant_size);
  224. locale_data.max_variant_size = max(mapping->alias.variants.size(), locale_data.max_variant_size);
  225. locale_data.likely_subtags.append(mapping.release_value());
  226. });
  227. }
  228. static void parse_identity(String locale_path, UnicodeLocaleData& locale_data, Locale& locale)
  229. {
  230. LexicalPath languages_path(move(locale_path)); // Note: Every JSON file defines identity data, so we can use any of them.
  231. languages_path = languages_path.append("languages.json"sv);
  232. VERIFY(Core::File::exists(languages_path.string()));
  233. auto languages_file_or_error = Core::File::open(languages_path.string(), Core::OpenMode::ReadOnly);
  234. VERIFY(!languages_file_or_error.is_error());
  235. auto languages = JsonParser(languages_file_or_error.value()->read_all()).parse();
  236. VERIFY(languages.has_value());
  237. auto const& main_object = languages->as_object().get("main"sv);
  238. auto const& locale_object = main_object.as_object().get(languages_path.parent().basename());
  239. auto const& identity_object = locale_object.as_object().get("identity"sv);
  240. auto const& language_string = identity_object.as_object().get("language"sv);
  241. auto const& territory_string = identity_object.as_object().get("territory"sv);
  242. auto const& variant_string = identity_object.as_object().get("variant"sv);
  243. locale.language = language_string.as_string();
  244. if (!locale_data.languages.contains_slow(locale.language))
  245. locale_data.languages.append(locale.language);
  246. if (territory_string.is_string()) {
  247. locale.territory = territory_string.as_string();
  248. if (!locale_data.territories.contains_slow(*locale.territory))
  249. locale_data.territories.append(*locale.territory);
  250. }
  251. if (variant_string.is_string()) {
  252. locale.variant = variant_string.as_string();
  253. if (!locale_data.variants.contains_slow(*locale.variant))
  254. locale_data.variants.append(*locale.variant);
  255. }
  256. }
  257. static void parse_locale_languages(String locale_path, UnicodeLocaleData& locale_data, Locale& locale)
  258. {
  259. LexicalPath languages_path(move(locale_path));
  260. languages_path = languages_path.append("languages.json"sv);
  261. VERIFY(Core::File::exists(languages_path.string()));
  262. auto languages_file_or_error = Core::File::open(languages_path.string(), Core::OpenMode::ReadOnly);
  263. VERIFY(!languages_file_or_error.is_error());
  264. auto languages = JsonParser(languages_file_or_error.value()->read_all()).parse();
  265. VERIFY(languages.has_value());
  266. auto const& main_object = languages->as_object().get("main"sv);
  267. auto const& locale_object = main_object.as_object().get(languages_path.parent().basename());
  268. auto const& locale_display_names_object = locale_object.as_object().get("localeDisplayNames"sv);
  269. auto const& languages_object = locale_display_names_object.as_object().get("languages"sv);
  270. languages_object.as_object().for_each_member([&](auto const& key, JsonValue const& value) {
  271. if (!locale_data.languages.contains_slow(key))
  272. return;
  273. auto index = ensure_unique_string(locale_data, value.as_string());
  274. locale.languages.set(key, index);
  275. });
  276. }
  277. static void parse_locale_territories(String locale_path, UnicodeLocaleData& locale_data, Locale& locale)
  278. {
  279. LexicalPath territories_path(move(locale_path));
  280. territories_path = territories_path.append("territories.json"sv);
  281. VERIFY(Core::File::exists(territories_path.string()));
  282. auto territories_file_or_error = Core::File::open(territories_path.string(), Core::OpenMode::ReadOnly);
  283. VERIFY(!territories_file_or_error.is_error());
  284. auto territories = JsonParser(territories_file_or_error.value()->read_all()).parse();
  285. VERIFY(territories.has_value());
  286. auto const& main_object = territories->as_object().get("main"sv);
  287. auto const& locale_object = main_object.as_object().get(territories_path.parent().basename());
  288. auto const& locale_display_names_object = locale_object.as_object().get("localeDisplayNames"sv);
  289. auto const& territories_object = locale_display_names_object.as_object().get("territories"sv);
  290. territories_object.as_object().for_each_member([&](auto const& key, JsonValue const& value) {
  291. if (!locale_data.territories.contains_slow(key))
  292. return;
  293. auto index = ensure_unique_string(locale_data, value.as_string());
  294. locale.territories.set(key, index);
  295. });
  296. }
  297. static void parse_locale_scripts(String locale_path, UnicodeLocaleData& locale_data, Locale& locale)
  298. {
  299. LexicalPath scripts_path(move(locale_path));
  300. scripts_path = scripts_path.append("scripts.json"sv);
  301. VERIFY(Core::File::exists(scripts_path.string()));
  302. auto scripts_file_or_error = Core::File::open(scripts_path.string(), Core::OpenMode::ReadOnly);
  303. VERIFY(!scripts_file_or_error.is_error());
  304. auto scripts = JsonParser(scripts_file_or_error.value()->read_all()).parse();
  305. VERIFY(scripts.has_value());
  306. auto const& main_object = scripts->as_object().get("main"sv);
  307. auto const& locale_object = main_object.as_object().get(scripts_path.parent().basename());
  308. auto const& locale_display_names_object = locale_object.as_object().get("localeDisplayNames"sv);
  309. auto const& scripts_object = locale_display_names_object.as_object().get("scripts"sv);
  310. scripts_object.as_object().for_each_member([&](auto const& key, JsonValue const& value) {
  311. auto index = ensure_unique_string(locale_data, value.as_string());
  312. locale.scripts.set(key, index);
  313. if (!locale_data.scripts.contains_slow(key))
  314. locale_data.scripts.append(key);
  315. });
  316. }
  317. static void parse_locale_list_patterns(String misc_path, UnicodeLocaleData& locale_data, Locale& locale)
  318. {
  319. LexicalPath list_patterns_path(move(misc_path));
  320. list_patterns_path = list_patterns_path.append("listPatterns.json"sv);
  321. VERIFY(Core::File::exists(list_patterns_path.string()));
  322. auto list_patterns_file_or_error = Core::File::open(list_patterns_path.string(), Core::OpenMode::ReadOnly);
  323. VERIFY(!list_patterns_file_or_error.is_error());
  324. auto list_patterns = JsonParser(list_patterns_file_or_error.value()->read_all()).parse();
  325. VERIFY(list_patterns.has_value());
  326. auto const& main_object = list_patterns->as_object().get("main"sv);
  327. auto const& locale_object = main_object.as_object().get(list_patterns_path.parent().basename());
  328. auto const& list_patterns_object = locale_object.as_object().get("listPatterns"sv);
  329. auto list_pattern_type = [](StringView key) {
  330. if (key.contains("type-standard"sv))
  331. return "conjunction"sv;
  332. if (key.contains("type-or"sv))
  333. return "disjunction"sv;
  334. if (key.contains("type-unit"sv))
  335. return "unit"sv;
  336. VERIFY_NOT_REACHED();
  337. };
  338. auto list_pattern_style = [](StringView key) {
  339. if (key.contains("short"sv))
  340. return "short"sv;
  341. if (key.contains("narrow"sv))
  342. return "narrow"sv;
  343. return "long"sv;
  344. };
  345. list_patterns_object.as_object().for_each_member([&](auto const& key, JsonValue const& value) {
  346. auto type = list_pattern_type(key);
  347. auto style = list_pattern_style(key);
  348. auto start = ensure_unique_string(locale_data, value.as_object().get("start"sv).as_string());
  349. auto middle = ensure_unique_string(locale_data, value.as_object().get("middle"sv).as_string());
  350. auto end = ensure_unique_string(locale_data, value.as_object().get("end"sv).as_string());
  351. auto pair = ensure_unique_string(locale_data, value.as_object().get("2"sv).as_string());
  352. if (!locale_data.list_pattern_types.contains_slow(type))
  353. locale_data.list_pattern_types.append(type);
  354. if (!locale_data.list_pattern_styles.contains_slow(style))
  355. locale_data.list_pattern_styles.append(style);
  356. locale.list_patterns.append({ move(type), move(style), move(start), move(middle), move(end), move(pair) });
  357. });
  358. }
  359. static void parse_locale_currencies(String numbers_path, UnicodeLocaleData& locale_data, Locale& locale)
  360. {
  361. LexicalPath currencies_path(move(numbers_path));
  362. currencies_path = currencies_path.append("currencies.json"sv);
  363. VERIFY(Core::File::exists(currencies_path.string()));
  364. auto currencies_file_or_error = Core::File::open(currencies_path.string(), Core::OpenMode::ReadOnly);
  365. VERIFY(!currencies_file_or_error.is_error());
  366. auto currencies = JsonParser(currencies_file_or_error.value()->read_all()).parse();
  367. VERIFY(currencies.has_value());
  368. auto const& main_object = currencies->as_object().get("main"sv);
  369. auto const& locale_object = main_object.as_object().get(currencies_path.parent().basename());
  370. auto const& locale_numbers_object = locale_object.as_object().get("numbers"sv);
  371. auto const& currencies_object = locale_numbers_object.as_object().get("currencies"sv);
  372. currencies_object.as_object().for_each_member([&](auto const& key, JsonValue const& value) {
  373. auto const& display_name = value.as_object().get("displayName"sv);
  374. auto index = ensure_unique_string(locale_data, display_name.as_string());
  375. locale.currencies.set(key, index);
  376. if (!locale_data.currencies.contains_slow(key))
  377. locale_data.currencies.append(key);
  378. });
  379. }
  380. static void parse_numeric_keywords(String locale_numbers_path, UnicodeLocaleData& locale_data, Locale& locale)
  381. {
  382. static constexpr StringView key = "nu"sv;
  383. LexicalPath numbers_path(move(locale_numbers_path));
  384. numbers_path = numbers_path.append("numbers.json"sv);
  385. VERIFY(Core::File::exists(numbers_path.string()));
  386. auto numbers_file_or_error = Core::File::open(numbers_path.string(), Core::OpenMode::ReadOnly);
  387. VERIFY(!numbers_file_or_error.is_error());
  388. auto numbers = JsonParser(numbers_file_or_error.value()->read_all()).parse();
  389. VERIFY(numbers.has_value());
  390. auto const& main_object = numbers->as_object().get("main"sv);
  391. auto const& locale_object = main_object.as_object().get(numbers_path.parent().basename());
  392. auto const& locale_numbers_object = locale_object.as_object().get("numbers"sv);
  393. auto const& default_numbering_system_object = locale_numbers_object.as_object().get("defaultNumberingSystem"sv);
  394. auto const& other_numbering_systems_object = locale_numbers_object.as_object().get("otherNumberingSystems"sv);
  395. Vector<String> keyword_values {};
  396. keyword_values.append(default_numbering_system_object.as_string());
  397. other_numbering_systems_object.as_object().for_each_member([&](auto const&, JsonValue const& value) {
  398. auto keyword_value = value.as_string();
  399. if (!keyword_values.contains_slow(keyword_value))
  400. keyword_values.append(move(keyword_value));
  401. });
  402. locale_numbers_object.as_object().for_each_member([&](auto const& key, JsonValue const& value) {
  403. if (!key.starts_with("defaultNumberingSystem-alt-"sv))
  404. return;
  405. auto keyword_value = value.as_string();
  406. if (!keyword_values.contains_slow(keyword_value))
  407. keyword_values.append(move(keyword_value));
  408. });
  409. StringBuilder builder;
  410. builder.join(',', keyword_values);
  411. auto index = ensure_unique_string(locale_data, builder.build());
  412. locale.keywords.set(key, index);
  413. if (!locale_data.keywords.contains_slow(key))
  414. locale_data.keywords.append(key);
  415. }
  416. static void parse_number_systems(String locale_numbers_path, UnicodeLocaleData& locale_data, Locale& locale)
  417. {
  418. LexicalPath numbers_path(move(locale_numbers_path));
  419. numbers_path = numbers_path.append("numbers.json"sv);
  420. VERIFY(Core::File::exists(numbers_path.string()));
  421. auto numbers_file_or_error = Core::File::open(numbers_path.string(), Core::OpenMode::ReadOnly);
  422. VERIFY(!numbers_file_or_error.is_error());
  423. auto numbers = JsonParser(numbers_file_or_error.value()->read_all()).parse();
  424. VERIFY(numbers.has_value());
  425. auto const& main_object = numbers->as_object().get("main"sv);
  426. auto const& locale_object = main_object.as_object().get(numbers_path.parent().basename());
  427. auto const& locale_numbers_object = locale_object.as_object().get("numbers"sv);
  428. auto ensure_number_system = [&](auto const& system) -> NumberSystem& {
  429. return locale.number_systems.ensure(system, [&]() {
  430. auto system_index = ensure_unique_string(locale_data, system);
  431. return NumberSystem { .system = system_index };
  432. });
  433. };
  434. auto parse_number_format = [&](auto const& format_object) {
  435. Vector<NumberFormat> result;
  436. result.ensure_capacity(format_object.size());
  437. format_object.for_each_member([&](auto const& key, JsonValue const& value) {
  438. auto split_key = key.split_view('-');
  439. VERIFY(split_key.size() == 3);
  440. auto type = split_key[0].template to_uint<u64>().value();
  441. VERIFY(type % 10 == 0);
  442. NumberFormat format {};
  443. format.magnitude = static_cast<u8>(log10(type));
  444. format.plurality = NumberFormat::plurality_from_string(split_key[2]);
  445. format.format_index = ensure_unique_string(locale_data, value.as_string());
  446. result.append(move(format));
  447. });
  448. return result;
  449. };
  450. locale_numbers_object.as_object().for_each_member([&](auto const& key, JsonValue const& value) {
  451. constexpr auto symbols_prefix = "symbols-numberSystem-"sv;
  452. constexpr auto decimal_formats_prefix = "decimalFormats-numberSystem-"sv;
  453. constexpr auto currency_formats_prefix = "currencyFormats-numberSystem-"sv;
  454. constexpr auto percent_formats_prefix = "percentFormats-numberSystem-"sv;
  455. if (key.starts_with(symbols_prefix)) {
  456. auto system = key.substring(symbols_prefix.length());
  457. auto& number_system = ensure_number_system(system);
  458. value.as_object().for_each_member([&](auto const& symbol, JsonValue const& localization) {
  459. auto symbol_index = ensure_unique_string(locale_data, localization.as_string());
  460. number_system.symbols.set(symbol, symbol_index);
  461. if (!locale_data.numeric_symbols.contains_slow(symbol))
  462. locale_data.numeric_symbols.append(symbol);
  463. });
  464. } else if (key.starts_with(decimal_formats_prefix)) {
  465. auto system = key.substring(decimal_formats_prefix.length());
  466. auto& number_system = ensure_number_system(system);
  467. auto const& long_format = value.as_object().get("long"sv).as_object().get("decimalFormat"sv);
  468. number_system.decimal_long_formats = parse_number_format(long_format.as_object());
  469. auto const& short_format = value.as_object().get("short"sv).as_object().get("decimalFormat"sv);
  470. number_system.decimal_short_formats = parse_number_format(short_format.as_object());
  471. } else if (key.starts_with(currency_formats_prefix)) {
  472. if (value.as_object().has("short"sv)) {
  473. auto system = key.substring(currency_formats_prefix.length());
  474. auto& number_system = ensure_number_system(system);
  475. auto const& short_format = value.as_object().get("short"sv).as_object().get("standard"sv);
  476. number_system.currency_short_formats = parse_number_format(short_format.as_object());
  477. }
  478. } else if (key.starts_with(percent_formats_prefix)) {
  479. auto system = key.substring(decimal_formats_prefix.length());
  480. auto& number_system = ensure_number_system(system);
  481. auto format_object = value.as_object().get("standard"sv);
  482. number_system.percent_format.format_index = ensure_unique_string(locale_data, format_object.as_string());
  483. }
  484. });
  485. }
  486. static void parse_default_content_locales(String core_path, UnicodeLocaleData& locale_data)
  487. {
  488. LexicalPath default_content_path(move(core_path));
  489. default_content_path = default_content_path.append("defaultContent.json"sv);
  490. VERIFY(Core::File::exists(default_content_path.string()));
  491. auto default_content_file_or_error = Core::File::open(default_content_path.string(), Core::OpenMode::ReadOnly);
  492. VERIFY(!default_content_file_or_error.is_error());
  493. auto default_content = JsonParser(default_content_file_or_error.value()->read_all()).parse();
  494. VERIFY(default_content.has_value());
  495. auto const& default_content_array = default_content->as_object().get("defaultContent"sv);
  496. default_content_array.as_array().for_each([&](JsonValue const& value) {
  497. auto locale = value.as_string();
  498. StringView default_locale = locale;
  499. while (true) {
  500. if (locale_data.locales.contains(default_locale))
  501. break;
  502. auto pos = default_locale.find_last('-');
  503. if (!pos.has_value())
  504. return;
  505. default_locale = default_locale.substring_view(0, *pos);
  506. }
  507. locale_data.locales.set(locale, locale_data.locales.get(default_locale).value());
  508. });
  509. }
  510. static Core::DirIterator path_to_dir_iterator(String path)
  511. {
  512. LexicalPath lexical_path(move(path));
  513. lexical_path = lexical_path.append("main"sv);
  514. VERIFY(Core::File::is_directory(lexical_path.string()));
  515. Core::DirIterator iterator(lexical_path.string(), Core::DirIterator::SkipParentAndBaseDir);
  516. if (iterator.has_error()) {
  517. warnln("{}: {}", lexical_path.string(), iterator.error_string());
  518. VERIFY_NOT_REACHED();
  519. }
  520. return iterator;
  521. }
  522. static void parse_all_locales(String core_path, String locale_names_path, String misc_path, String numbers_path, UnicodeLocaleData& locale_data)
  523. {
  524. auto identity_iterator = path_to_dir_iterator(locale_names_path);
  525. auto locale_names_iterator = path_to_dir_iterator(move(locale_names_path));
  526. auto misc_iterator = path_to_dir_iterator(move(misc_path));
  527. auto numbers_iterator = path_to_dir_iterator(move(numbers_path));
  528. LexicalPath core_supplemental_path(core_path);
  529. core_supplemental_path = core_supplemental_path.append("supplemental"sv);
  530. VERIFY(Core::File::is_directory(core_supplemental_path.string()));
  531. parse_core_aliases(core_supplemental_path.string(), locale_data);
  532. parse_likely_subtags(core_supplemental_path.string(), locale_data);
  533. auto remove_variants_from_path = [&](String path) -> Optional<String> {
  534. auto parsed_locale = parse_language(locale_data, LexicalPath::basename(path));
  535. if (!parsed_locale.has_value())
  536. return {};
  537. StringBuilder builder;
  538. builder.append(get_unique_string(locale_data, parsed_locale->language));
  539. if (auto script = get_unique_string(locale_data, parsed_locale->script); !script.is_empty())
  540. builder.appendff("-{}", script);
  541. if (auto region = get_unique_string(locale_data, parsed_locale->region); !region.is_empty())
  542. builder.appendff("-{}", region);
  543. return builder.build();
  544. };
  545. while (identity_iterator.has_next()) {
  546. auto locale_path = identity_iterator.next_full_path();
  547. VERIFY(Core::File::is_directory(locale_path));
  548. auto language = remove_variants_from_path(locale_path);
  549. if (!language.has_value())
  550. continue;
  551. auto& locale = locale_data.locales.ensure(*language);
  552. parse_identity(locale_path, locale_data, locale);
  553. }
  554. while (locale_names_iterator.has_next()) {
  555. auto locale_path = locale_names_iterator.next_full_path();
  556. VERIFY(Core::File::is_directory(locale_path));
  557. auto language = remove_variants_from_path(locale_path);
  558. if (!language.has_value())
  559. continue;
  560. auto& locale = locale_data.locales.ensure(*language);
  561. parse_locale_languages(locale_path, locale_data, locale);
  562. parse_locale_territories(locale_path, locale_data, locale);
  563. parse_locale_scripts(locale_path, locale_data, locale);
  564. }
  565. while (misc_iterator.has_next()) {
  566. auto misc_path = misc_iterator.next_full_path();
  567. VERIFY(Core::File::is_directory(misc_path));
  568. auto language = remove_variants_from_path(misc_path);
  569. if (!language.has_value())
  570. continue;
  571. auto& locale = locale_data.locales.ensure(*language);
  572. parse_locale_list_patterns(misc_path, locale_data, locale);
  573. }
  574. while (numbers_iterator.has_next()) {
  575. auto numbers_path = numbers_iterator.next_full_path();
  576. VERIFY(Core::File::is_directory(numbers_path));
  577. auto language = remove_variants_from_path(numbers_path);
  578. if (!language.has_value())
  579. continue;
  580. auto& locale = locale_data.locales.ensure(*language);
  581. parse_locale_currencies(numbers_path, locale_data, locale);
  582. parse_numeric_keywords(numbers_path, locale_data, locale);
  583. parse_number_systems(numbers_path, locale_data, locale);
  584. }
  585. parse_default_content_locales(move(core_path), locale_data);
  586. }
  587. static String format_identifier(StringView owner, String identifier)
  588. {
  589. identifier = identifier.replace("-"sv, "_"sv, true);
  590. if (all_of(identifier, is_ascii_digit))
  591. return String::formatted("{}_{}", owner[0], identifier);
  592. if (is_ascii_lower_alpha(identifier[0]))
  593. return String::formatted("{:c}{}", to_ascii_uppercase(identifier[0]), identifier.substring_view(1));
  594. return identifier;
  595. }
  596. static void generate_unicode_locale_header(Core::File& file, UnicodeLocaleData& locale_data)
  597. {
  598. StringBuilder builder;
  599. SourceGenerator generator { builder };
  600. auto generate_enum = [&](StringView name, StringView default_, Vector<String>& values) {
  601. quick_sort(values);
  602. generator.set("name", name);
  603. generator.set("underlying", ((values.size() + !default_.is_empty()) < 256) ? "u8"sv : "u16"sv);
  604. generator.append(R"~~~(
  605. enum class @name@ : @underlying@ {)~~~");
  606. if (!default_.is_empty()) {
  607. generator.set("default", default_);
  608. generator.append(R"~~~(
  609. @default@,)~~~");
  610. }
  611. for (auto const& value : values) {
  612. generator.set("value", format_identifier(name, value));
  613. generator.append(R"~~~(
  614. @value@,)~~~");
  615. }
  616. generator.append(R"~~~(
  617. };
  618. )~~~");
  619. };
  620. generator.append(R"~~~(
  621. #pragma once
  622. #include <AK/Optional.h>
  623. #include <AK/StringView.h>
  624. #include <AK/Types.h>
  625. #include <AK/Vector.h>
  626. #include <LibUnicode/Forward.h>
  627. namespace Unicode {
  628. )~~~");
  629. auto locales = locale_data.locales.keys();
  630. generate_enum("Locale"sv, "None"sv, locales);
  631. generate_enum("Language"sv, {}, locale_data.languages);
  632. generate_enum("Territory"sv, {}, locale_data.territories);
  633. generate_enum("ScriptTag"sv, {}, locale_data.scripts);
  634. generate_enum("Currency"sv, {}, locale_data.currencies);
  635. generate_enum("Key"sv, {}, locale_data.keywords);
  636. generate_enum("NumericSymbol"sv, {}, locale_data.numeric_symbols);
  637. generate_enum("Variant"sv, {}, locale_data.variants);
  638. generate_enum("ListPatternType"sv, {}, locale_data.list_pattern_types);
  639. generate_enum("ListPatternStyle"sv, {}, locale_data.list_pattern_styles);
  640. generator.append(R"~~~(
  641. namespace Detail {
  642. Optional<Locale> locale_from_string(StringView locale);
  643. Optional<StringView> get_locale_language_mapping(StringView locale, StringView language);
  644. Optional<Language> language_from_string(StringView language);
  645. Optional<StringView> resolve_language_alias(StringView language);
  646. Optional<StringView> get_locale_territory_mapping(StringView locale, StringView territory);
  647. Optional<Territory> territory_from_string(StringView territory);
  648. Optional<StringView> resolve_territory_alias(StringView territory);
  649. Optional<StringView> get_locale_script_tag_mapping(StringView locale, StringView script_tag);
  650. Optional<ScriptTag> script_tag_from_string(StringView script_tag);
  651. Optional<StringView> resolve_script_tag_alias(StringView script_tag);
  652. Optional<StringView> get_locale_currency_mapping(StringView locale, StringView currency);
  653. Optional<Currency> currency_from_string(StringView currency);
  654. Optional<StringView> get_locale_key_mapping(StringView locale, StringView key);
  655. Optional<Key> key_from_string(StringView key);
  656. Optional<StringView> get_number_system_symbol(StringView locale, StringView system, StringView numeric_symbol);
  657. Optional<NumberFormat> get_standard_number_system_format(StringView locale, StringView system, StandardNumberFormatType type);
  658. Vector<NumberFormat> get_compact_number_system_formats(StringView locale, StringView system, CompactNumberFormatType type);
  659. Optional<NumericSymbol> numeric_symbol_from_string(StringView numeric_symbol);
  660. Optional<ListPatterns> get_locale_list_pattern_mapping(StringView locale, StringView list_pattern_type, StringView list_pattern_style);
  661. Optional<ListPatternType> list_pattern_type_from_string(StringView list_pattern_type);
  662. Optional<ListPatternStyle> list_pattern_style_from_string(StringView list_pattern_style);
  663. Optional<StringView> resolve_variant_alias(StringView variant);
  664. Optional<StringView> resolve_subdivision_alias(StringView subdivision);
  665. void resolve_complex_language_aliases(Unicode::LanguageID& language_id);
  666. Optional<Unicode::LanguageID> add_likely_subtags(Unicode::LanguageID const& language_id);
  667. Optional<String> resolve_most_likely_territory(Unicode::LanguageID const& language_id);
  668. }
  669. }
  670. )~~~");
  671. file.write(generator.as_string_view());
  672. }
  673. static void generate_unicode_locale_implementation(Core::File& file, UnicodeLocaleData& locale_data)
  674. {
  675. StringBuilder builder;
  676. SourceGenerator generator { builder };
  677. generator.set("string_index_type"sv, s_string_index_type);
  678. generator.set("strings_size"sv, String::number(locale_data.unique_strings.size()));
  679. generator.set("locales_size"sv, String::number(locale_data.locales.size()));
  680. generator.set("territories_size", String::number(locale_data.territories.size()));
  681. generator.set("variants_size", String::number(locale_data.max_variant_size));
  682. generator.set("numeric_symbols_size", String::number(locale_data.numeric_symbols.size()));
  683. generator.append(R"~~~(
  684. #include <AK/Array.h>
  685. #include <AK/BinarySearch.h>
  686. #include <AK/Span.h>
  687. #include <LibUnicode/Locale.h>
  688. #include <LibUnicode/UnicodeLocale.h>
  689. namespace Unicode::Detail {
  690. struct Patterns {
  691. ListPatternType type;
  692. ListPatternStyle style;
  693. @string_index_type@ start { 0 };
  694. @string_index_type@ middle { 0 };
  695. @string_index_type@ end { 0 };
  696. @string_index_type@ pair { 0 };
  697. };
  698. )~~~");
  699. generator.append(R"~~~(
  700. static constexpr Array<StringView, @strings_size@ + 1> s_string_list { {
  701. {})~~~");
  702. constexpr size_t max_strings_per_row = 30;
  703. size_t strings_in_current_row = 1;
  704. for (auto const& string : locale_data.unique_strings) {
  705. if (strings_in_current_row++ > 0)
  706. generator.append(", ");
  707. generator.append(String::formatted("\"{}\"sv", string));
  708. if (strings_in_current_row == max_strings_per_row) {
  709. strings_in_current_row = 0;
  710. generator.append(",\n ");
  711. }
  712. }
  713. generator.append(R"~~~(
  714. } };
  715. struct NumberFormat {
  716. Unicode::NumberFormat to_unicode_number_format() const {
  717. Unicode::NumberFormat number_format {};
  718. number_format.magnitude = magnitude;
  719. number_format.plurality = static_cast<Unicode::NumberFormat::Plurality>(plurality);
  720. number_format.format = s_string_list[format];
  721. return number_format;
  722. }
  723. u8 magnitude { 0 };
  724. u8 plurality { 0 };
  725. @string_index_type@ format { 0 };
  726. };
  727. struct NumberSystem {
  728. @string_index_type@ system { 0 };
  729. Array<@string_index_type@, @numeric_symbols_size@> symbols {};
  730. Span<NumberFormat const> decimal_long_formats {};
  731. Span<NumberFormat const> decimal_short_formats {};
  732. Span<NumberFormat const> currency_short_formats {};
  733. NumberFormat percent_format {};
  734. };
  735. )~~~");
  736. auto format_mapping_name = [](StringView format, StringView name) {
  737. auto mapping_name = name.to_lowercase_string().replace("-"sv, "_"sv, true);
  738. return String::formatted(format, mapping_name);
  739. };
  740. auto append_index = [&](auto index) {
  741. generator.append(String::formatted(", {}", index));
  742. };
  743. auto append_list_and_size = [&](auto const& list) {
  744. if (list.is_empty()) {
  745. generator.append(", {}, 0");
  746. return;
  747. }
  748. bool first = true;
  749. generator.append(", {");
  750. for (auto const& item : list) {
  751. generator.append(first ? " " : ", ");
  752. generator.append(String::number(item));
  753. first = false;
  754. }
  755. generator.append(String::formatted(" }}, {}", list.size()));
  756. };
  757. auto append_string_index_list = [&](String name, auto const& keys, auto const& mappings) {
  758. generator.set("name", name);
  759. generator.set("size", String::number(keys.size()));
  760. generator.append(R"~~~(
  761. static constexpr Array<@string_index_type@, @size@> @name@ { {
  762. )~~~");
  763. constexpr size_t max_values_per_row = 30;
  764. size_t values_in_current_row = 0;
  765. for (auto const& key : keys) {
  766. if (values_in_current_row++ > 0)
  767. generator.append(" ");
  768. if (auto it = mappings.find(key); it != mappings.end())
  769. generator.set("mapping"sv, String::number(it->value));
  770. else
  771. generator.set("mapping"sv, "0"sv);
  772. generator.append("@mapping@,");
  773. if (values_in_current_row == max_values_per_row) {
  774. values_in_current_row = 0;
  775. generator.append("\n ");
  776. }
  777. }
  778. generator.append(R"~~~(
  779. } };
  780. )~~~");
  781. };
  782. auto append_number_format = [&](auto const& number_format) {
  783. generator.set("magnitude"sv, String::number(number_format.magnitude));
  784. generator.set("plurality"sv, String::number(static_cast<u8>(number_format.plurality)));
  785. generator.set("format"sv, String::number(number_format.format_index));
  786. generator.append("{ @magnitude@, @plurality@, @format@ },");
  787. };
  788. auto append_number_formats = [&](String name, auto const& number_formats) {
  789. generator.set("name"sv, move(name));
  790. generator.set("size"sv, String::number(number_formats.size()));
  791. generator.append(R"~~~(
  792. static constexpr Array<NumberFormat, @size@> @name@ { {
  793. )~~~");
  794. constexpr size_t max_values_per_row = 10;
  795. size_t values_in_current_row = 0;
  796. for (auto const& number_format : number_formats) {
  797. if (values_in_current_row++ > 0)
  798. generator.append(" ");
  799. append_number_format(number_format);
  800. if (values_in_current_row == max_values_per_row) {
  801. values_in_current_row = 0;
  802. generator.append("\n ");
  803. }
  804. }
  805. generator.append(R"~~~(
  806. } };
  807. )~~~");
  808. };
  809. auto append_number_systems = [&](String name, auto const& number_systems) {
  810. auto format_name = [&](StringView system, StringView format) {
  811. return String::formatted("{}_{}_{}", name, system, format);
  812. };
  813. for (auto const& number_system : number_systems) {
  814. append_number_formats(format_name(number_system.key, "dl"sv), number_system.value.decimal_long_formats);
  815. append_number_formats(format_name(number_system.key, "ds"sv), number_system.value.decimal_short_formats);
  816. append_number_formats(format_name(number_system.key, "cs"sv), number_system.value.currency_short_formats);
  817. }
  818. generator.set("name", name);
  819. generator.set("size", String::number(number_systems.size()));
  820. generator.append(R"~~~(
  821. static constexpr Array<NumberSystem, @size@> @name@ { {)~~~");
  822. for (auto const& number_system : number_systems) {
  823. generator.set("system"sv, String::number(number_system.value.system));
  824. generator.set("decimal_long_formats"sv, format_name(number_system.key, "dl"sv));
  825. generator.set("decimal_short_formats"sv, format_name(number_system.key, "ds"sv));
  826. generator.set("currency_short_formats"sv, format_name(number_system.key, "cs"sv));
  827. generator.append(R"~~~(
  828. { @system@, {)~~~");
  829. for (auto const& symbol : locale_data.numeric_symbols) {
  830. auto index = number_system.value.symbols.get(symbol).value_or(0);
  831. generator.set("index", String::number(index));
  832. generator.append(" @index@,");
  833. }
  834. generator.append(" }, @decimal_long_formats@.span(), @decimal_short_formats@.span(), @currency_short_formats@.span(), ");
  835. append_number_format(number_system.value.percent_format);
  836. generator.append(" },");
  837. }
  838. generator.append(R"~~~(
  839. } };
  840. )~~~");
  841. };
  842. auto append_list_patterns = [&](StringView name, Vector<ListPatterns> const& list_patterns) {
  843. generator.set("name", name);
  844. generator.set("size", String::number(list_patterns.size()));
  845. generator.append(R"~~~(
  846. static constexpr Array<Patterns, @size@> @name@ { {)~~~");
  847. for (auto const& list_pattern : list_patterns) {
  848. generator.set("type"sv, String::formatted("ListPatternType::{}", format_identifier({}, list_pattern.type)));
  849. generator.set("style"sv, String::formatted("ListPatternStyle::{}", format_identifier({}, list_pattern.style)));
  850. generator.set("start"sv, String::number(list_pattern.start));
  851. generator.set("middle"sv, String::number(list_pattern.middle));
  852. generator.set("end"sv, String::number(list_pattern.end));
  853. generator.set("pair"sv, String::number(list_pattern.pair));
  854. generator.append(R"~~~(
  855. { @type@, @style@, @start@, @middle@, @end@, @pair@ },)~~~");
  856. }
  857. generator.append(R"~~~(
  858. } };
  859. )~~~");
  860. };
  861. auto append_mapping = [&](StringView type, StringView name, StringView format, auto format_list_callback) {
  862. Vector<String> mapping_names;
  863. for (auto const& locale : locale_data.locales) {
  864. auto mapping_name = format_mapping_name(format, locale.key);
  865. format_list_callback(mapping_name, locale.value);
  866. mapping_names.append(move(mapping_name));
  867. }
  868. quick_sort(mapping_names);
  869. generator.set("type", type);
  870. generator.set("name", name);
  871. generator.set("size", String::number(locale_data.locales.size()));
  872. generator.append(R"~~~(
  873. static constexpr Array<Span<@type@ const>, @size@> @name@ { {
  874. )~~~");
  875. constexpr size_t max_values_per_row = 10;
  876. size_t values_in_current_row = 0;
  877. for (auto& mapping_name : mapping_names) {
  878. if (values_in_current_row++ > 0)
  879. generator.append(" ");
  880. generator.set("name", move(mapping_name));
  881. generator.append("@name@.span(),");
  882. if (values_in_current_row == max_values_per_row) {
  883. values_in_current_row = 0;
  884. generator.append("\n ");
  885. }
  886. }
  887. generator.append(R"~~~(
  888. } };
  889. )~~~");
  890. };
  891. append_mapping(s_string_index_type, "s_languages"sv, "s_languages_{}", [&](auto const& name, auto const& value) { append_string_index_list(name, locale_data.languages, value.languages); });
  892. append_mapping(s_string_index_type, "s_territories"sv, "s_territories_{}", [&](auto const& name, auto const& value) { append_string_index_list(name, locale_data.territories, value.territories); });
  893. append_mapping(s_string_index_type, "s_scripts"sv, "s_scripts_{}", [&](auto const& name, auto const& value) { append_string_index_list(name, locale_data.scripts, value.scripts); });
  894. append_mapping(s_string_index_type, "s_currencies"sv, "s_currencies_{}", [&](auto const& name, auto const& value) { append_string_index_list(name, locale_data.currencies, value.currencies); });
  895. append_mapping(s_string_index_type, "s_keywords"sv, "s_keywords_{}", [&](auto const& name, auto const& value) { append_string_index_list(name, locale_data.keywords, value.keywords); });
  896. append_mapping("NumberSystem"sv, "s_number_systems"sv, "s_number_systems_{}", [&](auto const& name, auto const& value) { append_number_systems(name, value.number_systems); });
  897. append_mapping("Patterns"sv, "s_list_patterns"sv, "s_list_patterns_{}", [&](auto const& name, auto const& value) { append_list_patterns(name, value.list_patterns); });
  898. generator.append(R"~~~(
  899. struct CanonicalLanguageID {
  900. Unicode::LanguageID to_unicode_language_id() const
  901. {
  902. Unicode::LanguageID language_id {};
  903. language_id.variants.ensure_capacity(variants_size);
  904. language_id.language = s_string_list[language];
  905. if (script != 0)
  906. language_id.script = s_string_list[script];
  907. if (region != 0)
  908. language_id.region = s_string_list[region];
  909. for (size_t i = 0; i < variants_size; ++i)
  910. language_id.variants.append(s_string_list[variants[i]]);
  911. return language_id;
  912. }
  913. bool matches_variants(Vector<String> const& other_variants) const {
  914. if (variants_size == 0)
  915. return true;
  916. if (other_variants.size() != variants_size)
  917. return false;
  918. for (size_t i = 0; i < variants_size; ++i) {
  919. if (s_string_list[variants[i]] != other_variants[i])
  920. return false;
  921. }
  922. return true;
  923. };
  924. @string_index_type@ language { 0 };
  925. @string_index_type@ script { 0 };
  926. @string_index_type@ region { 0 };
  927. Array<@string_index_type@, @variants_size@> variants {};
  928. size_t variants_size { 0 };
  929. };
  930. struct LanguageMapping {
  931. CanonicalLanguageID key;
  932. CanonicalLanguageID alias;
  933. };
  934. )~~~");
  935. auto append_complex_mapping = [&](StringView name, auto& mappings) {
  936. generator.set("size", String::number(mappings.size()));
  937. generator.set("name"sv, name);
  938. generator.append(R"~~~(
  939. static constexpr Array<LanguageMapping, @size@> s_@name@ { {
  940. )~~~");
  941. quick_sort(mappings, [&](auto const& lhs, auto const& rhs) {
  942. auto const& lhs_language = get_unique_string(locale_data, lhs.key.language);
  943. auto const& rhs_language = get_unique_string(locale_data, rhs.key.language);
  944. // Sort the keys such that "und" language tags are at the end, as those are less specific.
  945. if (lhs_language.starts_with("und"sv) && !rhs_language.starts_with("und"sv))
  946. return false;
  947. if (!lhs_language.starts_with("und"sv) && rhs_language.starts_with("und"sv))
  948. return true;
  949. return lhs_language < rhs_language;
  950. });
  951. for (auto const& mapping : mappings) {
  952. generator.set("language"sv, String::number(mapping.key.language));
  953. generator.append(" { { @language@");
  954. append_index(mapping.key.script);
  955. append_index(mapping.key.region);
  956. append_list_and_size(mapping.key.variants);
  957. generator.set("language"sv, String::number(mapping.alias.language));
  958. generator.append(" }, { @language@");
  959. append_index(mapping.alias.script);
  960. append_index(mapping.alias.region);
  961. append_list_and_size(mapping.alias.variants);
  962. generator.append(" } },\n");
  963. }
  964. generator.append("} };\n");
  965. };
  966. append_complex_mapping("complex_alias"sv, locale_data.complex_mappings);
  967. append_complex_mapping("likely_subtags"sv, locale_data.likely_subtags);
  968. generator.append(R"~~~(
  969. static LanguageMapping const* resolve_likely_subtag(Unicode::LanguageID const& language_id)
  970. {
  971. // https://unicode.org/reports/tr35/#Likely_Subtags
  972. enum class State {
  973. LanguageScriptRegion,
  974. LanguageRegion,
  975. LanguageScript,
  976. Language,
  977. UndScript,
  978. Done,
  979. };
  980. auto state = State::LanguageScriptRegion;
  981. while (state != State::Done) {
  982. Unicode::LanguageID search_key;
  983. switch (state) {
  984. case State::LanguageScriptRegion:
  985. state = State::LanguageRegion;
  986. if (!language_id.script.has_value() || !language_id.region.has_value())
  987. continue;
  988. search_key.language = *language_id.language;
  989. search_key.script = *language_id.script;
  990. search_key.region = *language_id.region;
  991. break;
  992. case State::LanguageRegion:
  993. state = State::LanguageScript;
  994. if (!language_id.region.has_value())
  995. continue;
  996. search_key.language = *language_id.language;
  997. search_key.region = *language_id.region;
  998. break;
  999. case State::LanguageScript:
  1000. state = State::Language;
  1001. if (!language_id.script.has_value())
  1002. continue;
  1003. search_key.language = *language_id.language;
  1004. search_key.script = *language_id.script;
  1005. break;
  1006. case State::Language:
  1007. state = State::UndScript;
  1008. search_key.language = *language_id.language;
  1009. break;
  1010. case State::UndScript:
  1011. state = State::Done;
  1012. if (!language_id.script.has_value())
  1013. continue;
  1014. search_key.language = "und"sv;
  1015. search_key.script = *language_id.script;
  1016. break;
  1017. default:
  1018. VERIFY_NOT_REACHED();
  1019. }
  1020. for (auto const& map : s_likely_subtags) {
  1021. auto const& key_language = s_string_list[map.key.language];
  1022. auto const& key_script = s_string_list[map.key.script];
  1023. auto const& key_region = s_string_list[map.key.region];
  1024. if (key_language != search_key.language)
  1025. continue;
  1026. if (!key_script.is_empty() || search_key.script.has_value()) {
  1027. if (key_script != search_key.script)
  1028. continue;
  1029. }
  1030. if (!key_region.is_empty() || search_key.region.has_value()) {
  1031. if (key_region != search_key.region)
  1032. continue;
  1033. }
  1034. return &map;
  1035. }
  1036. }
  1037. return nullptr;
  1038. }
  1039. )~~~");
  1040. auto append_mapping_search = [&](StringView enum_title, StringView enum_snake, StringView collection_name) {
  1041. generator.set("enum_title", enum_title);
  1042. generator.set("enum_snake", enum_snake);
  1043. generator.set("collection_name", collection_name);
  1044. generator.append(R"~~~(
  1045. Optional<StringView> get_locale_@enum_snake@_mapping(StringView locale, StringView @enum_snake@)
  1046. {
  1047. auto locale_value = locale_from_string(locale);
  1048. if (!locale_value.has_value())
  1049. return {};
  1050. auto @enum_snake@_value = @enum_snake@_from_string(@enum_snake@);
  1051. if (!@enum_snake@_value.has_value())
  1052. return {};
  1053. auto locale_index = to_underlying(*locale_value) - 1; // Subtract 1 because 0 == Locale::None.
  1054. auto @enum_snake@_index = to_underlying(*@enum_snake@_value);
  1055. auto const& mappings = @collection_name@.at(locale_index);
  1056. auto @enum_snake@_string_index = mappings.at(@enum_snake@_index);
  1057. auto @enum_snake@_mapping = s_string_list.at(@enum_snake@_string_index);
  1058. if (@enum_snake@_mapping.is_empty())
  1059. return {};
  1060. return @enum_snake@_mapping;
  1061. }
  1062. )~~~");
  1063. };
  1064. auto append_from_string = [&](StringView enum_title, StringView enum_snake, auto const& values) {
  1065. HashValueMap<String> hashes;
  1066. hashes.ensure_capacity(values.size());
  1067. for (auto const& value : values)
  1068. hashes.set(value.hash(), format_identifier(enum_title, value));
  1069. generate_value_from_string(generator, "{}_from_string"sv, enum_title, enum_snake, move(hashes));
  1070. };
  1071. auto append_alias_search = [&](StringView enum_snake, auto const& aliases) {
  1072. HashValueMap<StringIndexType> hashes;
  1073. hashes.ensure_capacity(aliases.size());
  1074. for (auto const& alias : aliases)
  1075. hashes.set(alias.key.hash(), alias.value);
  1076. generate_value_from_string(generator, "resolve_{}_alias"sv, s_string_index_type, enum_snake, move(hashes), "StringView"sv, "s_string_list[{}]"sv);
  1077. };
  1078. append_from_string("Locale"sv, "locale"sv, locale_data.locales.keys());
  1079. append_mapping_search("Language"sv, "language"sv, "s_languages"sv);
  1080. append_from_string("Language"sv, "language"sv, locale_data.languages);
  1081. append_alias_search("language"sv, locale_data.language_aliases);
  1082. append_mapping_search("Territory"sv, "territory"sv, "s_territories"sv);
  1083. append_from_string("Territory"sv, "territory"sv, locale_data.territories);
  1084. append_alias_search("territory"sv, locale_data.territory_aliases);
  1085. append_mapping_search("ScriptTag"sv, "script_tag"sv, "s_scripts"sv);
  1086. append_from_string("ScriptTag"sv, "script_tag"sv, locale_data.scripts);
  1087. append_alias_search("script_tag"sv, locale_data.script_aliases);
  1088. append_mapping_search("Currency"sv, "currency"sv, "s_currencies"sv);
  1089. append_from_string("Currency"sv, "currency"sv, locale_data.currencies);
  1090. append_mapping_search("Key"sv, "key"sv, "s_keywords"sv);
  1091. append_from_string("Key"sv, "key"sv, locale_data.keywords);
  1092. append_alias_search("variant"sv, locale_data.variant_aliases);
  1093. append_alias_search("subdivision"sv, locale_data.subdivision_aliases);
  1094. append_from_string("ListPatternType"sv, "list_pattern_type"sv, locale_data.list_pattern_types);
  1095. append_from_string("ListPatternStyle"sv, "list_pattern_style"sv, locale_data.list_pattern_styles);
  1096. append_from_string("NumericSymbol"sv, "numeric_symbol"sv, locale_data.numeric_symbols);
  1097. generator.append(R"~~~(
  1098. static NumberSystem const* find_number_system(StringView locale, StringView system)
  1099. {
  1100. auto locale_value = locale_from_string(locale);
  1101. if (!locale_value.has_value())
  1102. return nullptr;
  1103. auto locale_index = to_underlying(*locale_value) - 1; // Subtract 1 because 0 == Locale::None.
  1104. auto const& number_systems = s_number_systems.at(locale_index);
  1105. for (auto const& number_system : number_systems) {
  1106. if (system == s_string_list[number_system.system])
  1107. return &number_system;
  1108. };
  1109. return nullptr;
  1110. }
  1111. Optional<StringView> get_number_system_symbol(StringView locale, StringView system, StringView symbol)
  1112. {
  1113. auto symbol_value = numeric_symbol_from_string(symbol);
  1114. if (!symbol_value.has_value())
  1115. return {};
  1116. if (auto const* number_system = find_number_system(locale, system); number_system != nullptr) {
  1117. auto symbol_index = to_underlying(*symbol_value);
  1118. return s_string_list[number_system->symbols[symbol_index]];
  1119. }
  1120. return {};
  1121. }
  1122. Optional<Unicode::NumberFormat> get_standard_number_system_format(StringView locale, StringView system, StandardNumberFormatType type)
  1123. {
  1124. if (auto const* number_system = find_number_system(locale, system); number_system != nullptr) {
  1125. switch (type) {
  1126. case StandardNumberFormatType::Percent:
  1127. return number_system->percent_format.to_unicode_number_format();
  1128. }
  1129. }
  1130. return {};
  1131. }
  1132. Vector<Unicode::NumberFormat> get_compact_number_system_formats(StringView locale, StringView system, CompactNumberFormatType type)
  1133. {
  1134. Vector<Unicode::NumberFormat> formats;
  1135. if (auto const* number_system = find_number_system(locale, system); number_system != nullptr) {
  1136. Span<NumberFormat const> number_formats;
  1137. switch (type) {
  1138. case CompactNumberFormatType::DecimalLong:
  1139. number_formats = number_system->decimal_long_formats;
  1140. break;
  1141. case CompactNumberFormatType::DecimalShort:
  1142. number_formats = number_system->decimal_short_formats;
  1143. break;
  1144. case CompactNumberFormatType::CurrencyShort:
  1145. number_formats = number_system->currency_short_formats;
  1146. break;
  1147. }
  1148. formats.ensure_capacity(number_formats.size());
  1149. for (auto const& number_format : number_formats)
  1150. formats.append(number_format.to_unicode_number_format());
  1151. }
  1152. return formats;
  1153. }
  1154. Optional<ListPatterns> get_locale_list_pattern_mapping(StringView locale, StringView list_pattern_type, StringView list_pattern_style)
  1155. {
  1156. auto locale_value = locale_from_string(locale);
  1157. if (!locale_value.has_value())
  1158. return {};
  1159. auto type_value = list_pattern_type_from_string(list_pattern_type);
  1160. if (!type_value.has_value())
  1161. return {};
  1162. auto style_value = list_pattern_style_from_string(list_pattern_style);
  1163. if (!style_value.has_value())
  1164. return {};
  1165. auto locale_index = to_underlying(*locale_value) - 1; // Subtract 1 because 0 == Locale::None.
  1166. auto const& locale_list_patterns = s_list_patterns.at(locale_index);
  1167. for (auto const& list_patterns : locale_list_patterns) {
  1168. if ((list_patterns.type == type_value) && (list_patterns.style == style_value)) {
  1169. auto const& start = s_string_list[list_patterns.start];
  1170. auto const& middle = s_string_list[list_patterns.middle];
  1171. auto const& end = s_string_list[list_patterns.end];
  1172. auto const& pair = s_string_list[list_patterns.pair];
  1173. return ListPatterns { start, middle, end, pair };
  1174. }
  1175. }
  1176. return {};
  1177. }
  1178. void resolve_complex_language_aliases(Unicode::LanguageID& language_id)
  1179. {
  1180. for (auto const& map : s_complex_alias) {
  1181. auto const& key_language = s_string_list[map.key.language];
  1182. auto const& key_script = s_string_list[map.key.script];
  1183. auto const& key_region = s_string_list[map.key.region];
  1184. if ((key_language != language_id.language) && (key_language != "und"sv))
  1185. continue;
  1186. if (!key_script.is_empty() && (key_script != language_id.script))
  1187. continue;
  1188. if (!key_region.is_empty() && (key_region != language_id.region))
  1189. continue;
  1190. if (!map.key.matches_variants(language_id.variants))
  1191. continue;
  1192. auto alias = map.alias.to_unicode_language_id();
  1193. if (alias.language == "und"sv)
  1194. alias.language = move(language_id.language);
  1195. if (key_script.is_empty() && !alias.script.has_value())
  1196. alias.script = move(language_id.script);
  1197. if (key_region.is_empty() && !alias.region.has_value())
  1198. alias.region = move(language_id.region);
  1199. if (map.key.variants_size == 0 && alias.variants.is_empty())
  1200. alias.variants = move(language_id.variants);
  1201. language_id = move(alias);
  1202. break;
  1203. }
  1204. }
  1205. Optional<Unicode::LanguageID> add_likely_subtags(Unicode::LanguageID const& language_id)
  1206. {
  1207. // https://www.unicode.org/reports/tr35/#Likely_Subtags
  1208. auto const* likely_subtag = resolve_likely_subtag(language_id);
  1209. if (likely_subtag == nullptr)
  1210. return {};
  1211. auto maximized = language_id;
  1212. auto const& key_script = s_string_list[likely_subtag->key.script];
  1213. auto const& key_region = s_string_list[likely_subtag->key.region];
  1214. auto const& alias_language = s_string_list[likely_subtag->alias.language];
  1215. auto const& alias_script = s_string_list[likely_subtag->alias.script];
  1216. auto const& alias_region = s_string_list[likely_subtag->alias.region];
  1217. if (maximized.language == "und"sv)
  1218. maximized.language = alias_language;
  1219. if (!maximized.script.has_value() || (!key_script.is_empty() && !alias_script.is_empty()))
  1220. maximized.script = alias_script;
  1221. if (!maximized.region.has_value() || (!key_region.is_empty() && !alias_region.is_empty()))
  1222. maximized.region = alias_region;
  1223. return maximized;
  1224. }
  1225. Optional<String> resolve_most_likely_territory(Unicode::LanguageID const& language_id)
  1226. {
  1227. if (auto const* likely_subtag = resolve_likely_subtag(language_id); likely_subtag != nullptr)
  1228. return s_string_list[likely_subtag->alias.region];
  1229. return {};
  1230. }
  1231. }
  1232. )~~~");
  1233. file.write(generator.as_string_view());
  1234. }
  1235. int main(int argc, char** argv)
  1236. {
  1237. char const* generated_header_path = nullptr;
  1238. char const* generated_implementation_path = nullptr;
  1239. char const* core_path = nullptr;
  1240. char const* locale_names_path = nullptr;
  1241. char const* misc_path = nullptr;
  1242. char const* numbers_path = nullptr;
  1243. Core::ArgsParser args_parser;
  1244. args_parser.add_option(generated_header_path, "Path to the Unicode locale header file to generate", "generated-header-path", 'h', "generated-header-path");
  1245. args_parser.add_option(generated_implementation_path, "Path to the Unicode locale implementation file to generate", "generated-implementation-path", 'c', "generated-implementation-path");
  1246. args_parser.add_option(core_path, "Path to cldr-core directory", "core-path", 'r', "core-path");
  1247. args_parser.add_option(locale_names_path, "Path to cldr-localenames directory", "locale-names-path", 'l', "locale-names-path");
  1248. args_parser.add_option(misc_path, "Path to cldr-misc directory", "misc-path", 'm', "misc-path");
  1249. args_parser.add_option(numbers_path, "Path to cldr-numbers directory", "numbers-path", 'n', "numbers-path");
  1250. args_parser.parse(argc, argv);
  1251. auto open_file = [&](StringView path, StringView flags, Core::OpenMode mode = Core::OpenMode::ReadOnly) {
  1252. if (path.is_empty()) {
  1253. warnln("{} is required", flags);
  1254. args_parser.print_usage(stderr, argv[0]);
  1255. exit(1);
  1256. }
  1257. auto file_or_error = Core::File::open(path, mode);
  1258. if (file_or_error.is_error()) {
  1259. warnln("Failed to open {}: {}", path, file_or_error.release_error());
  1260. exit(1);
  1261. }
  1262. return file_or_error.release_value();
  1263. };
  1264. auto generated_header_file = open_file(generated_header_path, "-h/--generated-header-path", Core::OpenMode::ReadWrite);
  1265. auto generated_implementation_file = open_file(generated_implementation_path, "-c/--generated-implementation-path", Core::OpenMode::ReadWrite);
  1266. UnicodeLocaleData locale_data;
  1267. parse_all_locales(core_path, locale_names_path, misc_path, numbers_path, locale_data);
  1268. generate_unicode_locale_header(generated_header_file, locale_data);
  1269. generate_unicode_locale_implementation(generated_implementation_file, locale_data);
  1270. return 0;
  1271. }