GenerateUnicodeNumberFormat.cpp 39 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955
  1. /*
  2. * Copyright (c) 2021, Tim Flynn <trflynn89@pm.me>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include "GeneratorUtil.h"
  7. #include <AK/AllOf.h>
  8. #include <AK/Array.h>
  9. #include <AK/CharacterTypes.h>
  10. #include <AK/Find.h>
  11. #include <AK/Format.h>
  12. #include <AK/HashMap.h>
  13. #include <AK/JsonObject.h>
  14. #include <AK/JsonParser.h>
  15. #include <AK/JsonValue.h>
  16. #include <AK/LexicalPath.h>
  17. #include <AK/QuickSort.h>
  18. #include <AK/SourceGenerator.h>
  19. #include <AK/String.h>
  20. #include <AK/StringBuilder.h>
  21. #include <AK/Utf8View.h>
  22. #include <LibCore/ArgsParser.h>
  23. #include <LibCore/DirIterator.h>
  24. #include <LibCore/File.h>
  25. #include <LibUnicode/Locale.h>
  26. #include <math.h>
  27. using StringIndexType = u16;
  28. constexpr auto s_string_index_type = "u16"sv;
  29. enum class NumberFormatType {
  30. Standard,
  31. Compact,
  32. };
  33. struct NumberFormat : public Unicode::NumberFormat {
  34. using Base = Unicode::NumberFormat;
  35. static Base::Plurality plurality_from_string(StringView plurality)
  36. {
  37. if (plurality == "other"sv)
  38. return Base::Plurality::Other;
  39. if (plurality == "1"sv)
  40. return Base::Plurality::Single;
  41. if (plurality == "zero"sv)
  42. return Base::Plurality::Zero;
  43. if (plurality == "one"sv)
  44. return Base::Plurality::One;
  45. if (plurality == "two"sv)
  46. return Base::Plurality::Two;
  47. if (plurality == "few"sv)
  48. return Base::Plurality::Few;
  49. if (plurality == "many"sv)
  50. return Base::Plurality::Many;
  51. VERIFY_NOT_REACHED();
  52. }
  53. StringIndexType zero_format_index { 0 };
  54. StringIndexType positive_format_index { 0 };
  55. StringIndexType negative_format_index { 0 };
  56. Vector<StringIndexType> identifier_indices {};
  57. };
  58. struct NumberSystem {
  59. StringIndexType system { 0 };
  60. HashMap<String, StringIndexType> symbols {};
  61. u8 primary_grouping_size { 0 };
  62. u8 secondary_grouping_size { 0 };
  63. NumberFormat decimal_format {};
  64. Vector<NumberFormat> decimal_long_formats {};
  65. Vector<NumberFormat> decimal_short_formats {};
  66. NumberFormat currency_format {};
  67. NumberFormat accounting_format {};
  68. Vector<NumberFormat> currency_unit_formats {};
  69. Vector<NumberFormat> currency_short_formats {};
  70. NumberFormat percent_format {};
  71. NumberFormat scientific_format {};
  72. };
  73. struct Unit {
  74. StringIndexType unit { 0 };
  75. Vector<NumberFormat> long_formats {};
  76. Vector<NumberFormat> short_formats {};
  77. Vector<NumberFormat> narrow_formats {};
  78. };
  79. struct Locale {
  80. HashMap<String, NumberSystem> number_systems;
  81. HashMap<String, Unit> units {};
  82. };
  83. struct UnicodeLocaleData {
  84. UniqueStringStorage<StringIndexType> unique_strings;
  85. HashMap<String, Locale> locales;
  86. Vector<String> numeric_symbols;
  87. size_t max_identifier_count { 0 };
  88. };
  89. static String parse_identifiers(String pattern, StringView replacement, UnicodeLocaleData& locale_data, NumberFormat& format)
  90. {
  91. static Utf8View whitespace { "\u0020\u00a0\u200f"sv };
  92. while (true) {
  93. Utf8View utf8_pattern { pattern };
  94. Optional<size_t> start_index;
  95. Optional<size_t> end_index;
  96. bool inside_replacement = false;
  97. for (auto it = utf8_pattern.begin(); it != utf8_pattern.end(); ++it) {
  98. if (*it == '{') {
  99. if (start_index.has_value()) {
  100. end_index = utf8_pattern.byte_offset_of(it);
  101. break;
  102. }
  103. inside_replacement = true;
  104. } else if (*it == '}') {
  105. inside_replacement = false;
  106. } else if (!inside_replacement && !start_index.has_value() && !whitespace.contains(*it)) {
  107. start_index = utf8_pattern.byte_offset_of(it);
  108. }
  109. }
  110. if (!start_index.has_value())
  111. return pattern;
  112. end_index = end_index.value_or(pattern.length());
  113. utf8_pattern = utf8_pattern.substring_view(*start_index, *end_index - *start_index);
  114. utf8_pattern = utf8_pattern.trim(whitespace);
  115. auto identifier = utf8_pattern.as_string().replace("'.'"sv, "."sv);
  116. auto identifier_index = locale_data.unique_strings.ensure(move(identifier));
  117. size_t replacement_index = 0;
  118. if (auto index = format.identifier_indices.find_first_index(identifier_index); index.has_value()) {
  119. replacement_index = *index;
  120. } else {
  121. replacement_index = format.identifier_indices.size();
  122. format.identifier_indices.append(identifier_index);
  123. locale_data.max_identifier_count = max(locale_data.max_identifier_count, format.identifier_indices.size());
  124. }
  125. pattern = String::formatted("{}{{{}:{}}}{}",
  126. *start_index > 0 ? pattern.substring_view(0, *start_index) : ""sv,
  127. replacement,
  128. replacement_index,
  129. pattern.substring_view(*start_index + utf8_pattern.byte_length()));
  130. }
  131. }
  132. static void parse_number_pattern(Vector<String> patterns, UnicodeLocaleData& locale_data, NumberFormatType type, NumberFormat& format, NumberSystem* number_system_for_groupings = nullptr)
  133. {
  134. // https://unicode.org/reports/tr35/tr35-numbers.html#Number_Format_Patterns
  135. // https://cldr.unicode.org/translation/number-currency-formats/number-and-currency-patterns
  136. VERIFY((patterns.size() == 1) || (patterns.size() == 2));
  137. auto replace_patterns = [&](String pattern) {
  138. static HashMap<StringView, StringView> replacements = {
  139. { "{0}"sv, "{number}"sv },
  140. { "{1}"sv, "{currency}"sv },
  141. { "%"sv, "{percentSign}"sv },
  142. { "+"sv, "{plusSign}"sv },
  143. { "-"sv, "{minusSign}"sv },
  144. { "¤"sv, "{currency}"sv }, // U+00A4 Currency Sign
  145. { "E"sv, "{scientificSeparator}"sv },
  146. };
  147. for (auto const& replacement : replacements)
  148. pattern = pattern.replace(replacement.key, replacement.value, true);
  149. if (auto start_number_index = pattern.find_any_of("#0"sv, String::SearchDirection::Forward); start_number_index.has_value()) {
  150. auto end_number_index = *start_number_index + 1;
  151. for (; end_number_index < pattern.length(); ++end_number_index) {
  152. auto ch = pattern[end_number_index];
  153. if ((ch != '#') && (ch != '0') && (ch != ',') && (ch != '.'))
  154. break;
  155. }
  156. if (number_system_for_groupings) {
  157. auto number_pattern = pattern.substring_view(*start_number_index, end_number_index - *start_number_index);
  158. auto group_separators = number_pattern.find_all(","sv);
  159. VERIFY((group_separators.size() == 1) || (group_separators.size() == 2));
  160. auto decimal = number_pattern.find('.');
  161. VERIFY(decimal.has_value());
  162. if (group_separators.size() == 1) {
  163. number_system_for_groupings->primary_grouping_size = *decimal - group_separators[0] - 1;
  164. number_system_for_groupings->secondary_grouping_size = number_system_for_groupings->primary_grouping_size;
  165. } else {
  166. number_system_for_groupings->primary_grouping_size = *decimal - group_separators[1] - 1;
  167. number_system_for_groupings->secondary_grouping_size = group_separators[1] - group_separators[0] - 1;
  168. }
  169. }
  170. pattern = String::formatted("{}{{number}}{}",
  171. *start_number_index > 0 ? pattern.substring_view(0, *start_number_index) : ""sv,
  172. pattern.substring_view(end_number_index));
  173. // This is specifically handled here rather than in the replacements HashMap above so
  174. // that we do not errantly replace zeroes in number patterns.
  175. if (pattern.contains(*replacements.get("E"sv)))
  176. pattern = pattern.replace("0"sv, "{scientificExponent}"sv);
  177. }
  178. if (type == NumberFormatType::Compact)
  179. return parse_identifiers(move(pattern), "compactIdentifier"sv, locale_data, format);
  180. return pattern;
  181. };
  182. auto zero_format = replace_patterns(move(patterns[0]));
  183. format.positive_format_index = locale_data.unique_strings.ensure(String::formatted("{{plusSign}}{}", zero_format));
  184. if (patterns.size() == 2) {
  185. auto negative_format = replace_patterns(move(patterns[1]));
  186. format.negative_format_index = locale_data.unique_strings.ensure(move(negative_format));
  187. } else {
  188. format.negative_format_index = locale_data.unique_strings.ensure(String::formatted("{{minusSign}}{}", zero_format));
  189. }
  190. format.zero_format_index = locale_data.unique_strings.ensure(move(zero_format));
  191. }
  192. static void parse_number_systems(String locale_numbers_path, UnicodeLocaleData& locale_data, Locale& locale)
  193. {
  194. LexicalPath numbers_path(move(locale_numbers_path));
  195. numbers_path = numbers_path.append("numbers.json"sv);
  196. VERIFY(Core::File::exists(numbers_path.string()));
  197. auto numbers_file_or_error = Core::File::open(numbers_path.string(), Core::OpenMode::ReadOnly);
  198. VERIFY(!numbers_file_or_error.is_error());
  199. auto numbers = JsonParser(numbers_file_or_error.value()->read_all()).parse();
  200. VERIFY(numbers.has_value());
  201. auto const& main_object = numbers->as_object().get("main"sv);
  202. auto const& locale_object = main_object.as_object().get(numbers_path.parent().basename());
  203. auto const& locale_numbers_object = locale_object.as_object().get("numbers"sv);
  204. auto ensure_number_system = [&](auto const& system) -> NumberSystem& {
  205. return locale.number_systems.ensure(system, [&]() {
  206. auto system_index = locale_data.unique_strings.ensure(system);
  207. return NumberSystem { .system = system_index };
  208. });
  209. };
  210. auto parse_number_format = [&](auto const& format_object) {
  211. Vector<NumberFormat> result;
  212. result.ensure_capacity(format_object.size());
  213. format_object.for_each_member([&](auto const& key, JsonValue const& value) {
  214. auto split_key = key.split_view('-');
  215. if (split_key.size() != 3)
  216. return;
  217. auto patterns = value.as_string().split(';');
  218. NumberFormat format {};
  219. if (auto type = split_key[0].template to_uint<u64>(); type.has_value()) {
  220. VERIFY(*type % 10 == 0);
  221. format.magnitude = static_cast<u8>(log10(*type));
  222. if (patterns[0] != "0"sv) {
  223. auto number_of_zeroes_in_pattern = patterns[0].count("0"sv);
  224. VERIFY(format.magnitude >= number_of_zeroes_in_pattern);
  225. format.exponent = format.magnitude + 1 - number_of_zeroes_in_pattern;
  226. }
  227. } else {
  228. VERIFY(split_key[0] == "unitPattern"sv);
  229. }
  230. format.plurality = NumberFormat::plurality_from_string(split_key[2]);
  231. parse_number_pattern(move(patterns), locale_data, NumberFormatType::Compact, format);
  232. result.append(move(format));
  233. });
  234. return result;
  235. };
  236. locale_numbers_object.as_object().for_each_member([&](auto const& key, JsonValue const& value) {
  237. constexpr auto symbols_prefix = "symbols-numberSystem-"sv;
  238. constexpr auto decimal_formats_prefix = "decimalFormats-numberSystem-"sv;
  239. constexpr auto currency_formats_prefix = "currencyFormats-numberSystem-"sv;
  240. constexpr auto percent_formats_prefix = "percentFormats-numberSystem-"sv;
  241. constexpr auto scientific_formats_prefix = "scientificFormats-numberSystem-"sv;
  242. if (key.starts_with(symbols_prefix)) {
  243. auto system = key.substring(symbols_prefix.length());
  244. auto& number_system = ensure_number_system(system);
  245. value.as_object().for_each_member([&](auto const& symbol, JsonValue const& localization) {
  246. auto symbol_index = locale_data.unique_strings.ensure(localization.as_string());
  247. number_system.symbols.set(symbol, symbol_index);
  248. if (!locale_data.numeric_symbols.contains_slow(symbol))
  249. locale_data.numeric_symbols.append(symbol);
  250. });
  251. } else if (key.starts_with(decimal_formats_prefix)) {
  252. auto system = key.substring(decimal_formats_prefix.length());
  253. auto& number_system = ensure_number_system(system);
  254. auto format_object = value.as_object().get("standard"sv);
  255. parse_number_pattern(format_object.as_string().split(';'), locale_data, NumberFormatType::Standard, number_system.decimal_format, &number_system);
  256. auto const& long_format = value.as_object().get("long"sv).as_object().get("decimalFormat"sv);
  257. number_system.decimal_long_formats = parse_number_format(long_format.as_object());
  258. auto const& short_format = value.as_object().get("short"sv).as_object().get("decimalFormat"sv);
  259. number_system.decimal_short_formats = parse_number_format(short_format.as_object());
  260. } else if (key.starts_with(currency_formats_prefix)) {
  261. auto system = key.substring(currency_formats_prefix.length());
  262. auto& number_system = ensure_number_system(system);
  263. auto format_object = value.as_object().get("standard"sv);
  264. parse_number_pattern(format_object.as_string().split(';'), locale_data, NumberFormatType::Standard, number_system.currency_format);
  265. format_object = value.as_object().get("accounting"sv);
  266. parse_number_pattern(format_object.as_string().split(';'), locale_data, NumberFormatType::Standard, number_system.accounting_format);
  267. number_system.currency_unit_formats = parse_number_format(value.as_object());
  268. if (value.as_object().has("short"sv)) {
  269. auto const& short_format = value.as_object().get("short"sv).as_object().get("standard"sv);
  270. number_system.currency_short_formats = parse_number_format(short_format.as_object());
  271. }
  272. } else if (key.starts_with(percent_formats_prefix)) {
  273. auto system = key.substring(percent_formats_prefix.length());
  274. auto& number_system = ensure_number_system(system);
  275. auto format_object = value.as_object().get("standard"sv);
  276. parse_number_pattern(format_object.as_string().split(';'), locale_data, NumberFormatType::Standard, number_system.percent_format);
  277. } else if (key.starts_with(scientific_formats_prefix)) {
  278. auto system = key.substring(scientific_formats_prefix.length());
  279. auto& number_system = ensure_number_system(system);
  280. auto format_object = value.as_object().get("standard"sv);
  281. parse_number_pattern(format_object.as_string().split(';'), locale_data, NumberFormatType::Standard, number_system.scientific_format);
  282. }
  283. });
  284. }
  285. static void parse_units(String locale_units_path, UnicodeLocaleData& locale_data, Locale& locale)
  286. {
  287. LexicalPath units_path(move(locale_units_path));
  288. units_path = units_path.append("units.json"sv);
  289. VERIFY(Core::File::exists(units_path.string()));
  290. auto units_file_or_error = Core::File::open(units_path.string(), Core::OpenMode::ReadOnly);
  291. VERIFY(!units_file_or_error.is_error());
  292. auto units = JsonParser(units_file_or_error.value()->read_all()).parse();
  293. VERIFY(units.has_value());
  294. auto const& main_object = units->as_object().get("main"sv);
  295. auto const& locale_object = main_object.as_object().get(units_path.parent().basename());
  296. auto const& locale_units_object = locale_object.as_object().get("units"sv);
  297. auto const& long_object = locale_units_object.as_object().get("long"sv);
  298. auto const& short_object = locale_units_object.as_object().get("short"sv);
  299. auto const& narrow_object = locale_units_object.as_object().get("narrow"sv);
  300. auto ensure_unit = [&](auto const& unit) -> Unit& {
  301. return locale.units.ensure(unit, [&]() {
  302. auto unit_index = locale_data.unique_strings.ensure(unit);
  303. return Unit { .unit = unit_index };
  304. });
  305. };
  306. auto is_sanctioned_unit = [](StringView unit_name) {
  307. // This is a copy of the units sanctioned for use within ECMA-402. LibUnicode generally tries to
  308. // avoid being directly dependent on ECMA-402, but this rather significantly reduces the amount
  309. // of data generated here, and ECMA-402 is currently the only consumer of this data.
  310. // https://tc39.es/ecma402/#table-sanctioned-simple-unit-identifiers
  311. constexpr auto sanctioned_units = AK::Array { "acre"sv, "bit"sv, "byte"sv, "celsius"sv, "centimeter"sv, "day"sv, "degree"sv, "fahrenheit"sv, "fluid-ounce"sv, "foot"sv, "gallon"sv, "gigabit"sv, "gigabyte"sv, "gram"sv, "hectare"sv, "hour"sv, "inch"sv, "kilobit"sv, "kilobyte"sv, "kilogram"sv, "kilometer"sv, "liter"sv, "megabit"sv, "megabyte"sv, "meter"sv, "mile"sv, "mile-scandinavian"sv, "milliliter"sv, "millimeter"sv, "millisecond"sv, "minute"sv, "month"sv, "ounce"sv, "percent"sv, "petabyte"sv, "pound"sv, "second"sv, "stone"sv, "terabit"sv, "terabyte"sv, "week"sv, "yard"sv, "year"sv };
  312. return find(sanctioned_units.begin(), sanctioned_units.end(), unit_name) != sanctioned_units.end();
  313. };
  314. auto parse_units_object = [&](auto const& units_object, Unicode::Style style) {
  315. constexpr auto unit_pattern_prefix = "unitPattern-count-"sv;
  316. constexpr auto combined_unit_separator = "-per-"sv;
  317. units_object.for_each_member([&](auto const& key, JsonValue const& value) {
  318. auto end_of_category = key.find('-');
  319. if (!end_of_category.has_value())
  320. return;
  321. auto unit_name = key.substring(*end_of_category + 1);
  322. if (!is_sanctioned_unit(unit_name)) {
  323. auto indices = unit_name.find_all(combined_unit_separator);
  324. if (indices.size() != 1)
  325. return;
  326. auto numerator = unit_name.substring_view(0, indices[0]);
  327. auto denominator = unit_name.substring_view(indices[0] + combined_unit_separator.length());
  328. if (!is_sanctioned_unit(numerator) || !is_sanctioned_unit(denominator))
  329. return;
  330. }
  331. value.as_object().for_each_member([&](auto const& unit_key, JsonValue const& pattern_value) {
  332. if (!unit_key.starts_with(unit_pattern_prefix))
  333. return;
  334. auto& unit = ensure_unit(unit_name);
  335. NumberFormat format {};
  336. auto plurality = unit_key.substring_view(unit_pattern_prefix.length());
  337. format.plurality = NumberFormat::plurality_from_string(plurality);
  338. auto zero_format = pattern_value.as_string().replace("{0}"sv, "{number}"sv);
  339. zero_format = parse_identifiers(zero_format, "unitIdentifier"sv, locale_data, format);
  340. format.positive_format_index = locale_data.unique_strings.ensure(zero_format.replace("{number}"sv, "{plusSign}{number}"sv));
  341. format.negative_format_index = locale_data.unique_strings.ensure(zero_format.replace("{number}"sv, "{minusSign}{number}"sv));
  342. format.zero_format_index = locale_data.unique_strings.ensure(move(zero_format));
  343. switch (style) {
  344. case Unicode::Style::Long:
  345. unit.long_formats.append(move(format));
  346. break;
  347. case Unicode::Style::Short:
  348. unit.short_formats.append(move(format));
  349. break;
  350. case Unicode::Style::Narrow:
  351. unit.narrow_formats.append(move(format));
  352. break;
  353. default:
  354. VERIFY_NOT_REACHED();
  355. }
  356. });
  357. });
  358. };
  359. parse_units_object(long_object.as_object(), Unicode::Style::Long);
  360. parse_units_object(short_object.as_object(), Unicode::Style::Short);
  361. parse_units_object(narrow_object.as_object(), Unicode::Style::Narrow);
  362. }
  363. static void parse_all_locales(String core_path, String numbers_path, String units_path, UnicodeLocaleData& locale_data)
  364. {
  365. auto numbers_iterator = path_to_dir_iterator(move(numbers_path));
  366. auto units_iterator = path_to_dir_iterator(move(units_path));
  367. auto remove_variants_from_path = [&](String path) -> Optional<String> {
  368. auto parsed_locale = CanonicalLanguageID<StringIndexType>::parse(locale_data.unique_strings, LexicalPath::basename(path));
  369. if (!parsed_locale.has_value())
  370. return {};
  371. StringBuilder builder;
  372. builder.append(locale_data.unique_strings.get(parsed_locale->language));
  373. if (auto script = locale_data.unique_strings.get(parsed_locale->script); !script.is_empty())
  374. builder.appendff("-{}", script);
  375. if (auto region = locale_data.unique_strings.get(parsed_locale->region); !region.is_empty())
  376. builder.appendff("-{}", region);
  377. return builder.build();
  378. };
  379. while (numbers_iterator.has_next()) {
  380. auto numbers_path = numbers_iterator.next_full_path();
  381. VERIFY(Core::File::is_directory(numbers_path));
  382. auto language = remove_variants_from_path(numbers_path);
  383. if (!language.has_value())
  384. continue;
  385. auto& locale = locale_data.locales.ensure(*language);
  386. parse_number_systems(numbers_path, locale_data, locale);
  387. }
  388. while (units_iterator.has_next()) {
  389. auto units_path = units_iterator.next_full_path();
  390. VERIFY(Core::File::is_directory(units_path));
  391. auto language = remove_variants_from_path(units_path);
  392. if (!language.has_value())
  393. continue;
  394. auto& locale = locale_data.locales.ensure(*language);
  395. parse_units(units_path, locale_data, locale);
  396. }
  397. parse_default_content_locales(move(core_path), locale_data);
  398. }
  399. static String format_identifier(StringView owner, String identifier)
  400. {
  401. identifier = identifier.replace("-"sv, "_"sv, true);
  402. if (all_of(identifier, is_ascii_digit))
  403. return String::formatted("{}_{}", owner[0], identifier);
  404. if (is_ascii_lower_alpha(identifier[0]))
  405. return String::formatted("{:c}{}", to_ascii_uppercase(identifier[0]), identifier.substring_view(1));
  406. return identifier;
  407. }
  408. static void generate_unicode_locale_header(Core::File& file, UnicodeLocaleData& locale_data)
  409. {
  410. StringBuilder builder;
  411. SourceGenerator generator { builder };
  412. generator.append(R"~~~(
  413. #pragma once
  414. #include <AK/Optional.h>
  415. #include <AK/StringView.h>
  416. #include <AK/Types.h>
  417. #include <AK/Vector.h>
  418. #include <LibUnicode/Forward.h>
  419. namespace Unicode {
  420. )~~~");
  421. generate_enum(generator, format_identifier, "NumericSymbol"sv, {}, locale_data.numeric_symbols);
  422. generator.append(R"~~~(
  423. namespace Detail {
  424. Optional<StringView> get_number_system_symbol(StringView locale, StringView system, StringView numeric_symbol);
  425. Optional<NumberGroupings> get_number_system_groupings(StringView locale, StringView system);
  426. Optional<NumberFormat> get_standard_number_system_format(StringView locale, StringView system, StandardNumberFormatType type);
  427. Vector<NumberFormat> get_compact_number_system_formats(StringView locale, StringView system, CompactNumberFormatType type);
  428. Vector<Unicode::NumberFormat> get_unit_formats(StringView locale, StringView unit, Style style);
  429. Optional<NumericSymbol> numeric_symbol_from_string(StringView numeric_symbol);
  430. }
  431. }
  432. )~~~");
  433. file.write(generator.as_string_view());
  434. }
  435. static void generate_unicode_locale_implementation(Core::File& file, UnicodeLocaleData& locale_data)
  436. {
  437. StringBuilder builder;
  438. SourceGenerator generator { builder };
  439. generator.set("string_index_type"sv, s_string_index_type);
  440. generator.set("numeric_symbols_size", String::number(locale_data.numeric_symbols.size()));
  441. generator.set("identifier_count", String::number(locale_data.max_identifier_count));
  442. generator.append(R"~~~(
  443. #include <AK/Array.h>
  444. #include <AK/BinarySearch.h>
  445. #include <AK/Span.h>
  446. #include <LibUnicode/Locale.h>
  447. #include <LibUnicode/UnicodeNumberFormat.h>
  448. namespace Unicode::Detail {
  449. )~~~");
  450. locale_data.unique_strings.generate(generator);
  451. generator.append(R"~~~(
  452. struct NumberFormat {
  453. Unicode::NumberFormat to_unicode_number_format() const {
  454. Unicode::NumberFormat number_format {};
  455. number_format.magnitude = magnitude;
  456. number_format.exponent = exponent;
  457. number_format.plurality = static_cast<Unicode::NumberFormat::Plurality>(plurality);
  458. number_format.zero_format = s_string_list[zero_format];
  459. number_format.positive_format = s_string_list[positive_format];
  460. number_format.negative_format = s_string_list[negative_format];
  461. number_format.identifiers.ensure_capacity(identifiers.size());
  462. for (@string_index_type@ identifier : identifiers)
  463. number_format.identifiers.append(s_string_list[identifier]);
  464. return number_format;
  465. }
  466. u8 magnitude { 0 };
  467. u8 exponent { 0 };
  468. u8 plurality { 0 };
  469. @string_index_type@ zero_format { 0 };
  470. @string_index_type@ positive_format { 0 };
  471. @string_index_type@ negative_format { 0 };
  472. Array<@string_index_type@, @identifier_count@> identifiers {};
  473. };
  474. struct NumberSystem {
  475. @string_index_type@ system { 0 };
  476. Array<@string_index_type@, @numeric_symbols_size@> symbols {};
  477. u8 primary_grouping_size { 0 };
  478. u8 secondary_grouping_size { 0 };
  479. NumberFormat decimal_format {};
  480. Span<NumberFormat const> decimal_long_formats {};
  481. Span<NumberFormat const> decimal_short_formats {};
  482. NumberFormat currency_format {};
  483. NumberFormat accounting_format {};
  484. Span<NumberFormat const> currency_unit_formats {};
  485. Span<NumberFormat const> currency_short_formats {};
  486. NumberFormat percent_format {};
  487. NumberFormat scientific_format {};
  488. };
  489. struct Unit {
  490. @string_index_type@ unit { 0 };
  491. Span<NumberFormat const> long_formats {};
  492. Span<NumberFormat const> short_formats {};
  493. Span<NumberFormat const> narrow_formats {};
  494. };
  495. )~~~");
  496. auto append_number_format = [&](auto const& number_format) {
  497. generator.set("magnitude"sv, String::number(number_format.magnitude));
  498. generator.set("exponent"sv, String::number(number_format.exponent));
  499. generator.set("plurality"sv, String::number(static_cast<u8>(number_format.plurality)));
  500. generator.set("zero_format"sv, String::number(number_format.zero_format_index));
  501. generator.set("positive_format"sv, String::number(number_format.positive_format_index));
  502. generator.set("negative_format"sv, String::number(number_format.negative_format_index));
  503. generator.append("{ @magnitude@, @exponent@, @plurality@, @zero_format@, @positive_format@, @negative_format@, { ");
  504. bool first = true;
  505. for (auto identifier_index : number_format.identifier_indices) {
  506. if (!first)
  507. generator.append(", ");
  508. generator.append(String::number(identifier_index));
  509. first = false;
  510. }
  511. generator.append(" } },");
  512. };
  513. auto append_number_formats = [&](String name, auto const& number_formats) {
  514. generator.set("name"sv, move(name));
  515. generator.set("size"sv, String::number(number_formats.size()));
  516. generator.append(R"~~~(
  517. static constexpr Array<NumberFormat, @size@> @name@ { {
  518. )~~~");
  519. constexpr size_t max_values_per_row = 10;
  520. size_t values_in_current_row = 0;
  521. for (auto const& number_format : number_formats) {
  522. if (values_in_current_row++ > 0)
  523. generator.append(" ");
  524. append_number_format(number_format);
  525. if (values_in_current_row == max_values_per_row) {
  526. values_in_current_row = 0;
  527. generator.append("\n ");
  528. }
  529. }
  530. generator.append(R"~~~(
  531. } };
  532. )~~~");
  533. };
  534. auto append_number_systems = [&](String name, auto const& number_systems) {
  535. auto format_name = [&](StringView system, StringView format) {
  536. return String::formatted("{}_{}_{}", name, system, format);
  537. };
  538. for (auto const& number_system : number_systems) {
  539. append_number_formats(format_name(number_system.key, "dl"sv), number_system.value.decimal_long_formats);
  540. append_number_formats(format_name(number_system.key, "ds"sv), number_system.value.decimal_short_formats);
  541. append_number_formats(format_name(number_system.key, "cu"sv), number_system.value.currency_unit_formats);
  542. append_number_formats(format_name(number_system.key, "cs"sv), number_system.value.currency_short_formats);
  543. }
  544. generator.set("name", name);
  545. generator.set("size", String::number(number_systems.size()));
  546. generator.append(R"~~~(
  547. static constexpr Array<NumberSystem, @size@> @name@ { {)~~~");
  548. for (auto const& number_system : number_systems) {
  549. generator.set("system"sv, String::number(number_system.value.system));
  550. generator.set("primary_grouping_size"sv, String::number(number_system.value.primary_grouping_size));
  551. generator.set("secondary_grouping_size"sv, String::number(number_system.value.secondary_grouping_size));
  552. generator.set("decimal_long_formats"sv, format_name(number_system.key, "dl"sv));
  553. generator.set("decimal_short_formats"sv, format_name(number_system.key, "ds"sv));
  554. generator.set("currency_unit_formats"sv, format_name(number_system.key, "cu"sv));
  555. generator.set("currency_short_formats"sv, format_name(number_system.key, "cs"sv));
  556. generator.append(R"~~~(
  557. { @system@, {)~~~");
  558. for (auto const& symbol : locale_data.numeric_symbols) {
  559. auto index = number_system.value.symbols.get(symbol).value_or(0);
  560. generator.set("index", String::number(index));
  561. generator.append(" @index@,");
  562. }
  563. generator.append(" }, @primary_grouping_size@, @secondary_grouping_size@, ");
  564. append_number_format(number_system.value.decimal_format);
  565. generator.append(" @decimal_long_formats@.span(), @decimal_short_formats@.span(), ");
  566. append_number_format(number_system.value.currency_format);
  567. generator.append(" ");
  568. append_number_format(number_system.value.accounting_format);
  569. generator.append(" @currency_unit_formats@.span(), @currency_short_formats@.span(), ");
  570. append_number_format(number_system.value.percent_format);
  571. generator.append(" ");
  572. append_number_format(number_system.value.scientific_format);
  573. generator.append(" },");
  574. }
  575. generator.append(R"~~~(
  576. } };
  577. )~~~");
  578. };
  579. auto append_units = [&](String name, auto const& units) {
  580. auto format_name = [&](String unit, StringView format) {
  581. unit = unit.replace("-"sv, "_"sv, true);
  582. return String::formatted("{}_{}_{}", name, unit, format);
  583. };
  584. for (auto const& unit : units) {
  585. append_number_formats(format_name(unit.key, "l"sv), unit.value.long_formats);
  586. append_number_formats(format_name(unit.key, "s"sv), unit.value.short_formats);
  587. append_number_formats(format_name(unit.key, "n"sv), unit.value.narrow_formats);
  588. }
  589. generator.set("name", name);
  590. generator.set("size", String::number(units.size()));
  591. generator.append(R"~~~(
  592. static constexpr Array<Unit, @size@> @name@ { {)~~~");
  593. for (auto const& unit : units) {
  594. generator.set("unit"sv, String::number(unit.value.unit));
  595. generator.set("long_formats"sv, format_name(unit.key, "l"sv));
  596. generator.set("short_formats"sv, format_name(unit.key, "s"sv));
  597. generator.set("narrow_formats"sv, format_name(unit.key, "n"sv));
  598. generator.append(R"~~~(
  599. { @unit@, @long_formats@.span(), @short_formats@.span(), @narrow_formats@.span() },)~~~");
  600. }
  601. generator.append(R"~~~(
  602. } };
  603. )~~~");
  604. };
  605. generate_mapping(generator, locale_data.locales, "NumberSystem"sv, "s_number_systems"sv, "s_number_systems_{}", [&](auto const& name, auto const& value) { append_number_systems(name, value.number_systems); });
  606. generate_mapping(generator, locale_data.locales, "Unit"sv, "s_units"sv, "s_units_{}", [&](auto const& name, auto const& value) { append_units(name, value.units); });
  607. auto append_from_string = [&](StringView enum_title, StringView enum_snake, auto const& values) {
  608. HashValueMap<String> hashes;
  609. hashes.ensure_capacity(values.size());
  610. for (auto const& value : values)
  611. hashes.set(value.hash(), format_identifier(enum_title, value));
  612. generate_value_from_string(generator, "{}_from_string"sv, enum_title, enum_snake, move(hashes));
  613. };
  614. append_from_string("NumericSymbol"sv, "numeric_symbol"sv, locale_data.numeric_symbols);
  615. generator.append(R"~~~(
  616. static NumberSystem const* find_number_system(StringView locale, StringView system)
  617. {
  618. auto locale_value = locale_from_string(locale);
  619. if (!locale_value.has_value())
  620. return nullptr;
  621. auto locale_index = to_underlying(*locale_value) - 1; // Subtract 1 because 0 == Locale::None.
  622. auto const& number_systems = s_number_systems.at(locale_index);
  623. for (auto const& number_system : number_systems) {
  624. if (system == s_string_list[number_system.system])
  625. return &number_system;
  626. };
  627. return nullptr;
  628. }
  629. Optional<StringView> get_number_system_symbol(StringView locale, StringView system, StringView symbol)
  630. {
  631. auto symbol_value = numeric_symbol_from_string(symbol);
  632. if (!symbol_value.has_value())
  633. return {};
  634. if (auto const* number_system = find_number_system(locale, system); number_system != nullptr) {
  635. auto symbol_index = to_underlying(*symbol_value);
  636. return s_string_list[number_system->symbols[symbol_index]];
  637. }
  638. return {};
  639. }
  640. Optional<NumberGroupings> get_number_system_groupings(StringView locale, StringView system)
  641. {
  642. if (auto const* number_system = find_number_system(locale, system); number_system != nullptr)
  643. return NumberGroupings { number_system->primary_grouping_size, number_system->secondary_grouping_size };
  644. return {};
  645. }
  646. Optional<Unicode::NumberFormat> get_standard_number_system_format(StringView locale, StringView system, StandardNumberFormatType type)
  647. {
  648. if (auto const* number_system = find_number_system(locale, system); number_system != nullptr) {
  649. switch (type) {
  650. case StandardNumberFormatType::Decimal:
  651. return number_system->decimal_format.to_unicode_number_format();
  652. case StandardNumberFormatType::Currency:
  653. return number_system->currency_format.to_unicode_number_format();
  654. case StandardNumberFormatType::Accounting:
  655. return number_system->accounting_format.to_unicode_number_format();
  656. case StandardNumberFormatType::Percent:
  657. return number_system->percent_format.to_unicode_number_format();
  658. case StandardNumberFormatType::Scientific:
  659. return number_system->scientific_format.to_unicode_number_format();
  660. }
  661. }
  662. return {};
  663. }
  664. Vector<Unicode::NumberFormat> get_compact_number_system_formats(StringView locale, StringView system, CompactNumberFormatType type)
  665. {
  666. Vector<Unicode::NumberFormat> formats;
  667. if (auto const* number_system = find_number_system(locale, system); number_system != nullptr) {
  668. Span<NumberFormat const> number_formats;
  669. switch (type) {
  670. case CompactNumberFormatType::DecimalLong:
  671. number_formats = number_system->decimal_long_formats;
  672. break;
  673. case CompactNumberFormatType::DecimalShort:
  674. number_formats = number_system->decimal_short_formats;
  675. break;
  676. case CompactNumberFormatType::CurrencyUnit:
  677. number_formats = number_system->currency_unit_formats;
  678. break;
  679. case CompactNumberFormatType::CurrencyShort:
  680. number_formats = number_system->currency_short_formats;
  681. break;
  682. }
  683. formats.ensure_capacity(number_formats.size());
  684. for (auto const& number_format : number_formats)
  685. formats.append(number_format.to_unicode_number_format());
  686. }
  687. return formats;
  688. }
  689. static Unit const* find_units(StringView locale, StringView unit)
  690. {
  691. auto locale_value = locale_from_string(locale);
  692. if (!locale_value.has_value())
  693. return nullptr;
  694. auto locale_index = to_underlying(*locale_value) - 1; // Subtract 1 because 0 == Locale::None.
  695. auto const& locale_units = s_units.at(locale_index);
  696. for (auto const& units : locale_units) {
  697. if (unit == s_string_list[units.unit])
  698. return &units;
  699. };
  700. return nullptr;
  701. }
  702. Vector<Unicode::NumberFormat> get_unit_formats(StringView locale, StringView unit, Style style)
  703. {
  704. Vector<Unicode::NumberFormat> formats;
  705. if (auto const* units = find_units(locale, unit); units != nullptr) {
  706. Span<NumberFormat const> number_formats;
  707. switch (style) {
  708. case Style::Long:
  709. number_formats = units->long_formats;
  710. break;
  711. case Style::Short:
  712. number_formats = units->short_formats;
  713. break;
  714. case Style::Narrow:
  715. number_formats = units->narrow_formats;
  716. break;
  717. default:
  718. VERIFY_NOT_REACHED();
  719. }
  720. formats.ensure_capacity(number_formats.size());
  721. for (auto const& number_format : number_formats)
  722. formats.append(number_format.to_unicode_number_format());
  723. }
  724. return formats;
  725. }
  726. }
  727. )~~~");
  728. file.write(generator.as_string_view());
  729. }
  730. int main(int argc, char** argv)
  731. {
  732. char const* generated_header_path = nullptr;
  733. char const* generated_implementation_path = nullptr;
  734. char const* core_path = nullptr;
  735. char const* numbers_path = nullptr;
  736. char const* units_path = nullptr;
  737. Core::ArgsParser args_parser;
  738. args_parser.add_option(generated_header_path, "Path to the Unicode locale header file to generate", "generated-header-path", 'h', "generated-header-path");
  739. args_parser.add_option(generated_implementation_path, "Path to the Unicode locale implementation file to generate", "generated-implementation-path", 'c', "generated-implementation-path");
  740. args_parser.add_option(core_path, "Path to cldr-core directory", "core-path", 'r', "core-path");
  741. args_parser.add_option(numbers_path, "Path to cldr-numbers directory", "numbers-path", 'n', "numbers-path");
  742. args_parser.add_option(units_path, "Path to cldr-units directory", "units-path", 'u', "units-path");
  743. args_parser.parse(argc, argv);
  744. auto open_file = [&](StringView path, StringView flags, Core::OpenMode mode = Core::OpenMode::ReadOnly) {
  745. if (path.is_empty()) {
  746. warnln("{} is required", flags);
  747. args_parser.print_usage(stderr, argv[0]);
  748. exit(1);
  749. }
  750. auto file_or_error = Core::File::open(path, mode);
  751. if (file_or_error.is_error()) {
  752. warnln("Failed to open {}: {}", path, file_or_error.release_error());
  753. exit(1);
  754. }
  755. return file_or_error.release_value();
  756. };
  757. auto generated_header_file = open_file(generated_header_path, "-h/--generated-header-path", Core::OpenMode::ReadWrite);
  758. auto generated_implementation_file = open_file(generated_implementation_path, "-c/--generated-implementation-path", Core::OpenMode::ReadWrite);
  759. UnicodeLocaleData locale_data;
  760. parse_all_locales(core_path, numbers_path, units_path, locale_data);
  761. generate_unicode_locale_header(generated_header_file, locale_data);
  762. generate_unicode_locale_implementation(generated_implementation_file, locale_data);
  763. return 0;
  764. }