GenerateUnicodeNumberFormat.cpp 41 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991
  1. /*
  2. * Copyright (c) 2021, Tim Flynn <trflynn89@pm.me>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include "GeneratorUtil.h"
  7. #include <AK/AllOf.h>
  8. #include <AK/Array.h>
  9. #include <AK/CharacterTypes.h>
  10. #include <AK/Find.h>
  11. #include <AK/Format.h>
  12. #include <AK/HashFunctions.h>
  13. #include <AK/HashMap.h>
  14. #include <AK/JsonObject.h>
  15. #include <AK/JsonParser.h>
  16. #include <AK/JsonValue.h>
  17. #include <AK/LexicalPath.h>
  18. #include <AK/QuickSort.h>
  19. #include <AK/SourceGenerator.h>
  20. #include <AK/String.h>
  21. #include <AK/StringBuilder.h>
  22. #include <AK/Traits.h>
  23. #include <AK/Utf8View.h>
  24. #include <LibCore/ArgsParser.h>
  25. #include <LibCore/DirIterator.h>
  26. #include <LibCore/File.h>
  27. #include <LibUnicode/Locale.h>
  28. #include <LibUnicode/NumberFormat.h>
  29. #include <math.h>
  30. using StringIndexType = u16;
  31. constexpr auto s_string_index_type = "u16"sv;
  32. using NumberFormatIndexType = u16;
  33. constexpr auto s_number_format_index_type = "u16"sv;
  34. using NumberFormatListIndexType = u16;
  35. constexpr auto s_number_format_list_index_type = "u16"sv;
  // Tells parse_number_pattern() how to post-process a pattern: Compact patterns are additionally
  // run through parse_identifiers(), Standard patterns are returned as-is.
  enum class NumberFormatType { Standard, Compact };
  40. struct NumberFormat : public Unicode::NumberFormat {
  41. using Base = Unicode::NumberFormat;
  42. static Base::Plurality plurality_from_string(StringView plurality)
  43. {
  44. if (plurality == "other"sv)
  45. return Base::Plurality::Other;
  46. if (plurality == "1"sv)
  47. return Base::Plurality::Single;
  48. if (plurality == "zero"sv)
  49. return Base::Plurality::Zero;
  50. if (plurality == "one"sv)
  51. return Base::Plurality::One;
  52. if (plurality == "two"sv)
  53. return Base::Plurality::Two;
  54. if (plurality == "few"sv)
  55. return Base::Plurality::Few;
  56. if (plurality == "many"sv)
  57. return Base::Plurality::Many;
  58. VERIFY_NOT_REACHED();
  59. }
  60. unsigned hash() const
  61. {
  62. auto hash = pair_int_hash(magnitude, exponent);
  63. hash = pair_int_hash(hash, static_cast<u8>(plurality));
  64. hash = pair_int_hash(hash, zero_format_index);
  65. hash = pair_int_hash(hash, positive_format_index);
  66. hash = pair_int_hash(hash, negative_format_index);
  67. for (auto index : identifier_indices)
  68. hash = pair_int_hash(hash, index);
  69. return hash;
  70. }
  71. bool operator==(NumberFormat const& other) const
  72. {
  73. return (magnitude == other.magnitude)
  74. && (exponent == other.exponent)
  75. && (plurality == other.plurality)
  76. && (zero_format_index == other.zero_format_index)
  77. && (positive_format_index == other.positive_format_index)
  78. && (negative_format_index == other.negative_format_index)
  79. && (identifier_indices == other.identifier_indices);
  80. }
  81. StringIndexType zero_format_index { 0 };
  82. StringIndexType positive_format_index { 0 };
  83. StringIndexType negative_format_index { 0 };
  84. Vector<StringIndexType> identifier_indices {};
  85. };
  86. template<>
  87. struct AK::Formatter<NumberFormat> : Formatter<FormatString> {
  88. ErrorOr<void> format(FormatBuilder& builder, NumberFormat const& format)
  89. {
  90. StringBuilder identifier_indices;
  91. identifier_indices.join(", "sv, format.identifier_indices);
  92. return Formatter<FormatString>::format(builder,
  93. "{{ {}, {}, {}, {}, {}, {}, {{ {} }} }}",
  94. format.magnitude,
  95. format.exponent,
  96. static_cast<u8>(format.plurality),
  97. format.zero_format_index,
  98. format.positive_format_index,
  99. format.negative_format_index,
  100. identifier_indices.build());
  101. }
  102. };
  103. template<>
  104. struct AK::Traits<NumberFormat> : public GenericTraits<NumberFormat> {
  105. static unsigned hash(NumberFormat const& f) { return f.hash(); }
  106. };
  107. using NumberFormatList = Vector<NumberFormatIndexType>;
  108. template<>
  109. struct AK::Traits<NumberFormatList> : public GenericTraits<NumberFormatList> {
  110. static unsigned hash(NumberFormatList const& formats)
  111. {
  112. auto hash = int_hash(static_cast<u32>(formats.size()));
  113. for (auto format : formats)
  114. hash = pair_int_hash(hash, format);
  115. return hash;
  116. }
  117. };
  118. struct NumberSystem {
  119. StringIndexType system { 0 };
  120. HashMap<String, StringIndexType> symbols {};
  121. u8 primary_grouping_size { 0 };
  122. u8 secondary_grouping_size { 0 };
  123. NumberFormatIndexType decimal_format { 0 };
  124. NumberFormatListIndexType decimal_long_formats { 0 };
  125. NumberFormatListIndexType decimal_short_formats { 0 };
  126. NumberFormatIndexType currency_format { 0 };
  127. NumberFormatIndexType accounting_format { 0 };
  128. NumberFormatListIndexType currency_unit_formats { 0 };
  129. NumberFormatListIndexType currency_short_formats { 0 };
  130. NumberFormatIndexType percent_format { 0 };
  131. NumberFormatIndexType scientific_format { 0 };
  132. };
  133. struct Unit {
  134. StringIndexType unit { 0 };
  135. Vector<NumberFormatIndexType> long_formats {};
  136. Vector<NumberFormatIndexType> short_formats {};
  137. Vector<NumberFormatIndexType> narrow_formats {};
  138. };
  139. struct Locale {
  140. HashMap<String, NumberSystem> number_systems;
  141. HashMap<String, Unit> units {};
  142. };
  143. struct UnicodeLocaleData {
  144. UniqueStringStorage<StringIndexType> unique_strings;
  145. UniqueStorage<NumberFormat, NumberFormatIndexType> unique_formats;
  146. UniqueStorage<NumberFormatList, NumberFormatListIndexType> unique_format_lists;
  147. HashMap<String, Locale> locales;
  148. Vector<String> numeric_symbols;
  149. size_t max_identifier_count { 0 };
  150. };
  151. static String parse_identifiers(String pattern, StringView replacement, UnicodeLocaleData& locale_data, NumberFormat& format)
  152. {
  153. static Utf8View whitespace { "\u0020\u00a0\u200f"sv };
  154. while (true) {
  155. Utf8View utf8_pattern { pattern };
  156. Optional<size_t> start_index;
  157. Optional<size_t> end_index;
  158. bool inside_replacement = false;
  159. for (auto it = utf8_pattern.begin(); it != utf8_pattern.end(); ++it) {
  160. if (*it == '{') {
  161. if (start_index.has_value()) {
  162. end_index = utf8_pattern.byte_offset_of(it);
  163. break;
  164. }
  165. inside_replacement = true;
  166. } else if (*it == '}') {
  167. inside_replacement = false;
  168. } else if (!inside_replacement && !start_index.has_value() && !whitespace.contains(*it)) {
  169. start_index = utf8_pattern.byte_offset_of(it);
  170. }
  171. }
  172. if (!start_index.has_value())
  173. return pattern;
  174. end_index = end_index.value_or(pattern.length());
  175. utf8_pattern = utf8_pattern.substring_view(*start_index, *end_index - *start_index);
  176. utf8_pattern = utf8_pattern.trim(whitespace);
  177. auto identifier = utf8_pattern.as_string().replace("'.'"sv, "."sv);
  178. auto identifier_index = locale_data.unique_strings.ensure(move(identifier));
  179. size_t replacement_index = 0;
  180. if (auto index = format.identifier_indices.find_first_index(identifier_index); index.has_value()) {
  181. replacement_index = *index;
  182. } else {
  183. replacement_index = format.identifier_indices.size();
  184. format.identifier_indices.append(identifier_index);
  185. locale_data.max_identifier_count = max(locale_data.max_identifier_count, format.identifier_indices.size());
  186. }
  187. pattern = String::formatted("{}{{{}:{}}}{}",
  188. *start_index > 0 ? pattern.substring_view(0, *start_index) : ""sv,
  189. replacement,
  190. replacement_index,
  191. pattern.substring_view(*start_index + utf8_pattern.byte_length()));
  192. }
  193. }
  194. static void parse_number_pattern(Vector<String> patterns, UnicodeLocaleData& locale_data, NumberFormatType type, NumberFormat& format, NumberSystem* number_system_for_groupings = nullptr)
  195. {
  196. // https://unicode.org/reports/tr35/tr35-numbers.html#Number_Format_Patterns
  197. // https://cldr.unicode.org/translation/number-currency-formats/number-and-currency-patterns
  198. VERIFY((patterns.size() == 1) || (patterns.size() == 2));
  199. auto replace_patterns = [&](String pattern) {
  200. static HashMap<StringView, StringView> replacements = {
  201. { "{0}"sv, "{number}"sv },
  202. { "{1}"sv, "{currency}"sv },
  203. { "%"sv, "{percentSign}"sv },
  204. { "+"sv, "{plusSign}"sv },
  205. { "-"sv, "{minusSign}"sv },
  206. { "¤"sv, "{currency}"sv }, // U+00A4 Currency Sign
  207. { "E"sv, "{scientificSeparator}"sv },
  208. };
  209. for (auto const& replacement : replacements)
  210. pattern = pattern.replace(replacement.key, replacement.value, true);
  211. if (auto start_number_index = pattern.find_any_of("#0"sv, String::SearchDirection::Forward); start_number_index.has_value()) {
  212. auto end_number_index = *start_number_index + 1;
  213. for (; end_number_index < pattern.length(); ++end_number_index) {
  214. auto ch = pattern[end_number_index];
  215. if ((ch != '#') && (ch != '0') && (ch != ',') && (ch != '.'))
  216. break;
  217. }
  218. if (number_system_for_groupings) {
  219. auto number_pattern = pattern.substring_view(*start_number_index, end_number_index - *start_number_index);
  220. auto group_separators = number_pattern.find_all(","sv);
  221. VERIFY((group_separators.size() == 1) || (group_separators.size() == 2));
  222. auto decimal = number_pattern.find('.');
  223. VERIFY(decimal.has_value());
  224. if (group_separators.size() == 1) {
  225. number_system_for_groupings->primary_grouping_size = *decimal - group_separators[0] - 1;
  226. number_system_for_groupings->secondary_grouping_size = number_system_for_groupings->primary_grouping_size;
  227. } else {
  228. number_system_for_groupings->primary_grouping_size = *decimal - group_separators[1] - 1;
  229. number_system_for_groupings->secondary_grouping_size = group_separators[1] - group_separators[0] - 1;
  230. }
  231. }
  232. pattern = String::formatted("{}{{number}}{}",
  233. *start_number_index > 0 ? pattern.substring_view(0, *start_number_index) : ""sv,
  234. pattern.substring_view(end_number_index));
  235. // This is specifically handled here rather than in the replacements HashMap above so
  236. // that we do not errantly replace zeroes in number patterns.
  237. if (pattern.contains(*replacements.get("E"sv)))
  238. pattern = pattern.replace("0"sv, "{scientificExponent}"sv);
  239. }
  240. if (type == NumberFormatType::Compact)
  241. return parse_identifiers(move(pattern), "compactIdentifier"sv, locale_data, format);
  242. return pattern;
  243. };
  244. auto zero_format = replace_patterns(move(patterns[0]));
  245. format.positive_format_index = locale_data.unique_strings.ensure(String::formatted("{{plusSign}}{}", zero_format));
  246. if (patterns.size() == 2) {
  247. auto negative_format = replace_patterns(move(patterns[1]));
  248. format.negative_format_index = locale_data.unique_strings.ensure(move(negative_format));
  249. } else {
  250. format.negative_format_index = locale_data.unique_strings.ensure(String::formatted("{{minusSign}}{}", zero_format));
  251. }
  252. format.zero_format_index = locale_data.unique_strings.ensure(move(zero_format));
  253. }
  254. static void parse_number_pattern(Vector<String> patterns, UnicodeLocaleData& locale_data, NumberFormatType type, NumberFormatIndexType& format_index, NumberSystem* number_system_for_groupings = nullptr)
  255. {
  256. NumberFormat format {};
  257. parse_number_pattern(move(patterns), locale_data, type, format, number_system_for_groupings);
  258. format_index = locale_data.unique_formats.ensure(move(format));
  259. }
  260. static ErrorOr<void> parse_number_systems(String locale_numbers_path, UnicodeLocaleData& locale_data, Locale& locale)
  261. {
  262. LexicalPath numbers_path(move(locale_numbers_path));
  263. numbers_path = numbers_path.append("numbers.json"sv);
  264. auto numbers_file = TRY(Core::File::open(numbers_path.string(), Core::OpenMode::ReadOnly));
  265. auto numbers = TRY(JsonValue::from_string(numbers_file->read_all()));
  266. auto const& main_object = numbers.as_object().get("main"sv);
  267. auto const& locale_object = main_object.as_object().get(numbers_path.parent().basename());
  268. auto const& locale_numbers_object = locale_object.as_object().get("numbers"sv);
  269. auto ensure_number_system = [&](auto const& system) -> NumberSystem& {
  270. return locale.number_systems.ensure(system, [&]() {
  271. auto system_index = locale_data.unique_strings.ensure(system);
  272. return NumberSystem { .system = system_index };
  273. });
  274. };
  275. auto parse_number_format = [&](auto const& format_object) {
  276. Vector<NumberFormatIndexType> result;
  277. result.ensure_capacity(format_object.size());
  278. format_object.for_each_member([&](auto const& key, JsonValue const& value) {
  279. auto split_key = key.split_view('-');
  280. if (split_key.size() != 3)
  281. return;
  282. auto patterns = value.as_string().split(';');
  283. NumberFormat format {};
  284. if (auto type = split_key[0].template to_uint<u64>(); type.has_value()) {
  285. VERIFY(*type % 10 == 0);
  286. format.magnitude = static_cast<u8>(log10(*type));
  287. if (patterns[0] != "0"sv) {
  288. auto number_of_zeroes_in_pattern = patterns[0].count("0"sv);
  289. VERIFY(format.magnitude >= number_of_zeroes_in_pattern);
  290. format.exponent = format.magnitude + 1 - number_of_zeroes_in_pattern;
  291. }
  292. } else {
  293. VERIFY(split_key[0] == "unitPattern"sv);
  294. }
  295. format.plurality = NumberFormat::plurality_from_string(split_key[2]);
  296. parse_number_pattern(move(patterns), locale_data, NumberFormatType::Compact, format);
  297. auto format_index = locale_data.unique_formats.ensure(move(format));
  298. result.append(format_index);
  299. });
  300. return locale_data.unique_format_lists.ensure(move(result));
  301. };
  302. locale_numbers_object.as_object().for_each_member([&](auto const& key, JsonValue const& value) {
  303. constexpr auto symbols_prefix = "symbols-numberSystem-"sv;
  304. constexpr auto decimal_formats_prefix = "decimalFormats-numberSystem-"sv;
  305. constexpr auto currency_formats_prefix = "currencyFormats-numberSystem-"sv;
  306. constexpr auto percent_formats_prefix = "percentFormats-numberSystem-"sv;
  307. constexpr auto scientific_formats_prefix = "scientificFormats-numberSystem-"sv;
  308. if (key.starts_with(symbols_prefix)) {
  309. auto system = key.substring(symbols_prefix.length());
  310. auto& number_system = ensure_number_system(system);
  311. value.as_object().for_each_member([&](auto const& symbol, JsonValue const& localization) {
  312. auto symbol_index = locale_data.unique_strings.ensure(localization.as_string());
  313. number_system.symbols.set(symbol, symbol_index);
  314. if (!locale_data.numeric_symbols.contains_slow(symbol))
  315. locale_data.numeric_symbols.append(symbol);
  316. });
  317. } else if (key.starts_with(decimal_formats_prefix)) {
  318. auto system = key.substring(decimal_formats_prefix.length());
  319. auto& number_system = ensure_number_system(system);
  320. auto format_object = value.as_object().get("standard"sv);
  321. parse_number_pattern(format_object.as_string().split(';'), locale_data, NumberFormatType::Standard, number_system.decimal_format, &number_system);
  322. auto const& long_format = value.as_object().get("long"sv).as_object().get("decimalFormat"sv);
  323. number_system.decimal_long_formats = parse_number_format(long_format.as_object());
  324. auto const& short_format = value.as_object().get("short"sv).as_object().get("decimalFormat"sv);
  325. number_system.decimal_short_formats = parse_number_format(short_format.as_object());
  326. } else if (key.starts_with(currency_formats_prefix)) {
  327. auto system = key.substring(currency_formats_prefix.length());
  328. auto& number_system = ensure_number_system(system);
  329. auto format_object = value.as_object().get("standard"sv);
  330. parse_number_pattern(format_object.as_string().split(';'), locale_data, NumberFormatType::Standard, number_system.currency_format);
  331. format_object = value.as_object().get("accounting"sv);
  332. parse_number_pattern(format_object.as_string().split(';'), locale_data, NumberFormatType::Standard, number_system.accounting_format);
  333. number_system.currency_unit_formats = parse_number_format(value.as_object());
  334. if (value.as_object().has("short"sv)) {
  335. auto const& short_format = value.as_object().get("short"sv).as_object().get("standard"sv);
  336. number_system.currency_short_formats = parse_number_format(short_format.as_object());
  337. }
  338. } else if (key.starts_with(percent_formats_prefix)) {
  339. auto system = key.substring(percent_formats_prefix.length());
  340. auto& number_system = ensure_number_system(system);
  341. auto format_object = value.as_object().get("standard"sv);
  342. parse_number_pattern(format_object.as_string().split(';'), locale_data, NumberFormatType::Standard, number_system.percent_format);
  343. } else if (key.starts_with(scientific_formats_prefix)) {
  344. auto system = key.substring(scientific_formats_prefix.length());
  345. auto& number_system = ensure_number_system(system);
  346. auto format_object = value.as_object().get("standard"sv);
  347. parse_number_pattern(format_object.as_string().split(';'), locale_data, NumberFormatType::Standard, number_system.scientific_format);
  348. }
  349. });
  350. return {};
  351. }
  352. static ErrorOr<void> parse_units(String locale_units_path, UnicodeLocaleData& locale_data, Locale& locale)
  353. {
  354. LexicalPath units_path(move(locale_units_path));
  355. units_path = units_path.append("units.json"sv);
  356. auto units_file = TRY(Core::File::open(units_path.string(), Core::OpenMode::ReadOnly));
  357. auto units = TRY(JsonValue::from_string(units_file->read_all()));
  358. auto const& main_object = units.as_object().get("main"sv);
  359. auto const& locale_object = main_object.as_object().get(units_path.parent().basename());
  360. auto const& locale_units_object = locale_object.as_object().get("units"sv);
  361. auto const& long_object = locale_units_object.as_object().get("long"sv);
  362. auto const& short_object = locale_units_object.as_object().get("short"sv);
  363. auto const& narrow_object = locale_units_object.as_object().get("narrow"sv);
  364. auto ensure_unit = [&](auto const& unit) -> Unit& {
  365. return locale.units.ensure(unit, [&]() {
  366. auto unit_index = locale_data.unique_strings.ensure(unit);
  367. return Unit { .unit = unit_index };
  368. });
  369. };
  370. auto is_sanctioned_unit = [](StringView unit_name) {
  371. // This is a copy of the units sanctioned for use within ECMA-402. LibUnicode generally tries to
  372. // avoid being directly dependent on ECMA-402, but this rather significantly reduces the amount
  373. // of data generated here, and ECMA-402 is currently the only consumer of this data.
  374. // https://tc39.es/ecma402/#table-sanctioned-simple-unit-identifiers
  375. constexpr auto sanctioned_units = AK::Array { "acre"sv, "bit"sv, "byte"sv, "celsius"sv, "centimeter"sv, "day"sv, "degree"sv, "fahrenheit"sv, "fluid-ounce"sv, "foot"sv, "gallon"sv, "gigabit"sv, "gigabyte"sv, "gram"sv, "hectare"sv, "hour"sv, "inch"sv, "kilobit"sv, "kilobyte"sv, "kilogram"sv, "kilometer"sv, "liter"sv, "megabit"sv, "megabyte"sv, "meter"sv, "mile"sv, "mile-scandinavian"sv, "milliliter"sv, "millimeter"sv, "millisecond"sv, "minute"sv, "month"sv, "ounce"sv, "percent"sv, "petabyte"sv, "pound"sv, "second"sv, "stone"sv, "terabit"sv, "terabyte"sv, "week"sv, "yard"sv, "year"sv };
  376. return find(sanctioned_units.begin(), sanctioned_units.end(), unit_name) != sanctioned_units.end();
  377. };
  378. auto parse_units_object = [&](auto const& units_object, Unicode::Style style) {
  379. constexpr auto unit_pattern_prefix = "unitPattern-count-"sv;
  380. constexpr auto combined_unit_separator = "-per-"sv;
  381. units_object.for_each_member([&](auto const& key, JsonValue const& value) {
  382. auto end_of_category = key.find('-');
  383. if (!end_of_category.has_value())
  384. return;
  385. auto unit_name = key.substring(*end_of_category + 1);
  386. if (!is_sanctioned_unit(unit_name)) {
  387. auto indices = unit_name.find_all(combined_unit_separator);
  388. if (indices.size() != 1)
  389. return;
  390. auto numerator = unit_name.substring_view(0, indices[0]);
  391. auto denominator = unit_name.substring_view(indices[0] + combined_unit_separator.length());
  392. if (!is_sanctioned_unit(numerator) || !is_sanctioned_unit(denominator))
  393. return;
  394. }
  395. value.as_object().for_each_member([&](auto const& unit_key, JsonValue const& pattern_value) {
  396. if (!unit_key.starts_with(unit_pattern_prefix))
  397. return;
  398. auto& unit = ensure_unit(unit_name);
  399. NumberFormat format {};
  400. auto plurality = unit_key.substring_view(unit_pattern_prefix.length());
  401. format.plurality = NumberFormat::plurality_from_string(plurality);
  402. auto zero_format = pattern_value.as_string().replace("{0}"sv, "{number}"sv);
  403. zero_format = parse_identifiers(zero_format, "unitIdentifier"sv, locale_data, format);
  404. format.positive_format_index = locale_data.unique_strings.ensure(zero_format.replace("{number}"sv, "{plusSign}{number}"sv));
  405. format.negative_format_index = locale_data.unique_strings.ensure(zero_format.replace("{number}"sv, "{minusSign}{number}"sv));
  406. format.zero_format_index = locale_data.unique_strings.ensure(move(zero_format));
  407. auto format_index = locale_data.unique_formats.ensure(move(format));
  408. switch (style) {
  409. case Unicode::Style::Long:
  410. unit.long_formats.append(format_index);
  411. break;
  412. case Unicode::Style::Short:
  413. unit.short_formats.append(format_index);
  414. break;
  415. case Unicode::Style::Narrow:
  416. unit.narrow_formats.append(format_index);
  417. break;
  418. default:
  419. VERIFY_NOT_REACHED();
  420. }
  421. });
  422. });
  423. };
  424. parse_units_object(long_object.as_object(), Unicode::Style::Long);
  425. parse_units_object(short_object.as_object(), Unicode::Style::Short);
  426. parse_units_object(narrow_object.as_object(), Unicode::Style::Narrow);
  427. return {};
  428. }
  429. static ErrorOr<void> parse_all_locales(String numbers_path, String units_path, UnicodeLocaleData& locale_data)
  430. {
  431. auto numbers_iterator = TRY(path_to_dir_iterator(move(numbers_path)));
  432. auto units_iterator = TRY(path_to_dir_iterator(move(units_path)));
  433. auto remove_variants_from_path = [&](String path) -> ErrorOr<String> {
  434. auto parsed_locale = TRY(CanonicalLanguageID<StringIndexType>::parse(locale_data.unique_strings, LexicalPath::basename(path)));
  435. StringBuilder builder;
  436. builder.append(locale_data.unique_strings.get(parsed_locale.language));
  437. if (auto script = locale_data.unique_strings.get(parsed_locale.script); !script.is_empty())
  438. builder.appendff("-{}", script);
  439. if (auto region = locale_data.unique_strings.get(parsed_locale.region); !region.is_empty())
  440. builder.appendff("-{}", region);
  441. return builder.build();
  442. };
  443. while (numbers_iterator.has_next()) {
  444. auto numbers_path = TRY(next_path_from_dir_iterator(numbers_iterator));
  445. auto language = TRY(remove_variants_from_path(numbers_path));
  446. auto& locale = locale_data.locales.ensure(language);
  447. TRY(parse_number_systems(numbers_path, locale_data, locale));
  448. }
  449. while (units_iterator.has_next()) {
  450. auto units_path = TRY(next_path_from_dir_iterator(units_iterator));
  451. auto language = TRY(remove_variants_from_path(units_path));
  452. auto& locale = locale_data.locales.ensure(language);
  453. TRY(parse_units(units_path, locale_data, locale));
  454. }
  455. return {};
  456. }
  457. static String format_identifier(StringView owner, String identifier)
  458. {
  459. identifier = identifier.replace("-"sv, "_"sv, true);
  460. if (all_of(identifier, is_ascii_digit))
  461. return String::formatted("{}_{}", owner[0], identifier);
  462. if (is_ascii_lower_alpha(identifier[0]))
  463. return String::formatted("{:c}{}", to_ascii_uppercase(identifier[0]), identifier.substring_view(1));
  464. return identifier;
  465. }
  466. static void generate_unicode_locale_header(Core::File& file, UnicodeLocaleData& locale_data)
  467. {
  468. StringBuilder builder;
  469. SourceGenerator generator { builder };
  470. generator.append(R"~~~(
  471. #pragma once
  472. #include <AK/Optional.h>
  473. #include <AK/StringView.h>
  474. #include <AK/Types.h>
  475. #include <AK/Vector.h>
  476. #include <LibUnicode/Forward.h>
  477. namespace Unicode {
  478. )~~~");
  479. generate_enum(generator, format_identifier, "NumericSymbol"sv, {}, locale_data.numeric_symbols);
  480. generator.append(R"~~~(
  481. namespace Detail {
  482. Optional<StringView> get_number_system_symbol(StringView locale, StringView system, StringView numeric_symbol);
  483. Optional<NumberGroupings> get_number_system_groupings(StringView locale, StringView system);
  484. Optional<NumberFormat> get_standard_number_system_format(StringView locale, StringView system, StandardNumberFormatType type);
  485. Vector<NumberFormat> get_compact_number_system_formats(StringView locale, StringView system, CompactNumberFormatType type);
  486. Vector<Unicode::NumberFormat> get_unit_formats(StringView locale, StringView unit, Style style);
  487. Optional<NumericSymbol> numeric_symbol_from_string(StringView numeric_symbol);
  488. }
  489. }
  490. )~~~");
  491. VERIFY(file.write(generator.as_string_view()));
  492. }
  493. static void generate_unicode_locale_implementation(Core::File& file, UnicodeLocaleData& locale_data)
  494. {
  495. StringBuilder builder;
  496. SourceGenerator generator { builder };
  497. generator.set("string_index_type"sv, s_string_index_type);
  498. generator.set("number_format_index_type"sv, s_number_format_index_type);
  499. generator.set("number_format_list_index_type"sv, s_number_format_list_index_type);
  500. generator.set("numeric_symbols_size", String::number(locale_data.numeric_symbols.size()));
  501. generator.set("identifier_count", String::number(locale_data.max_identifier_count));
  502. generator.append(R"~~~(
  503. #include <AK/Array.h>
  504. #include <AK/BinarySearch.h>
  505. #include <AK/Span.h>
  506. #include <LibUnicode/Locale.h>
  507. #include <LibUnicode/NumberFormat.h>
  508. #include <LibUnicode/UnicodeNumberFormat.h>
  509. namespace Unicode::Detail {
  510. )~~~");
  511. locale_data.unique_strings.generate(generator);
  512. generator.append(R"~~~(
  513. struct NumberFormat {
  514. Unicode::NumberFormat to_unicode_number_format() const {
  515. Unicode::NumberFormat number_format {};
  516. number_format.magnitude = magnitude;
  517. number_format.exponent = exponent;
  518. number_format.plurality = static_cast<Unicode::NumberFormat::Plurality>(plurality);
  519. number_format.zero_format = s_string_list[zero_format];
  520. number_format.positive_format = s_string_list[positive_format];
  521. number_format.negative_format = s_string_list[negative_format];
  522. number_format.identifiers.ensure_capacity(identifiers.size());
  523. for (@string_index_type@ identifier : identifiers)
  524. number_format.identifiers.append(s_string_list[identifier]);
  525. return number_format;
  526. }
  527. u8 magnitude { 0 };
  528. u8 exponent { 0 };
  529. u8 plurality { 0 };
  530. @string_index_type@ zero_format { 0 };
  531. @string_index_type@ positive_format { 0 };
  532. @string_index_type@ negative_format { 0 };
  533. Array<@string_index_type@, @identifier_count@> identifiers {};
  534. };
  535. struct NumberSystem {
  536. @string_index_type@ system { 0 };
  537. Array<@string_index_type@, @numeric_symbols_size@> symbols {};
  538. u8 primary_grouping_size { 0 };
  539. u8 secondary_grouping_size { 0 };
  540. @number_format_index_type@ decimal_format { 0 };
  541. @number_format_list_index_type@ decimal_long_formats { 0 };
  542. @number_format_list_index_type@ decimal_short_formats { 0 };
  543. @number_format_index_type@ currency_format { 0 };
  544. @number_format_index_type@ accounting_format { 0 };
  545. @number_format_list_index_type@ currency_unit_formats { 0 };
  546. @number_format_list_index_type@ currency_short_formats { 0 };
  547. @number_format_index_type@ percent_format { 0 };
  548. @number_format_index_type@ scientific_format { 0 };
  549. };
  550. struct Unit {
  551. @string_index_type@ unit { 0 };
  552. Span<@number_format_index_type@ const> long_formats {};
  553. Span<@number_format_index_type@ const> short_formats {};
  554. Span<@number_format_index_type@ const> narrow_formats {};
  555. };
  556. )~~~");
  557. locale_data.unique_formats.generate(generator, "NumberFormat"sv, "s_number_formats"sv, 10);
  558. locale_data.unique_format_lists.generate(generator, s_number_format_index_type, "s_number_format_lists"sv);
  559. auto append_number_formats = [&](String name, auto const& number_formats) {
  560. generator.set("name"sv, move(name));
  561. generator.set("size"sv, String::number(number_formats.size()));
  562. generator.append(R"~~~(
  563. static constexpr Array<@number_format_index_type@, @size@> @name@ { {)~~~");
  564. bool first = true;
  565. for (auto number_format : number_formats) {
  566. generator.append(first ? " " : ", ");
  567. generator.append(String::number(number_format));
  568. first = false;
  569. }
  570. generator.append(" } };");
  571. };
  572. auto append_number_systems = [&](String name, auto const& number_systems) {
  573. generator.set("name", name);
  574. generator.set("size", String::number(number_systems.size()));
  575. generator.append(R"~~~(
  576. static constexpr Array<NumberSystem, @size@> @name@ { {)~~~");
  577. for (auto const& number_system : number_systems) {
  578. generator.set("system"sv, String::number(number_system.value.system));
  579. generator.set("primary_grouping_size"sv, String::number(number_system.value.primary_grouping_size));
  580. generator.set("secondary_grouping_size"sv, String::number(number_system.value.secondary_grouping_size));
  581. generator.set("decimal_format", String::number(number_system.value.decimal_format));
  582. generator.set("decimal_long_formats"sv, String::number(number_system.value.decimal_long_formats));
  583. generator.set("decimal_short_formats"sv, String::number(number_system.value.decimal_short_formats));
  584. generator.set("currency_format", String::number(number_system.value.currency_format));
  585. generator.set("accounting_format", String::number(number_system.value.accounting_format));
  586. generator.set("currency_unit_formats"sv, String::number(number_system.value.currency_unit_formats));
  587. generator.set("currency_short_formats"sv, String::number(number_system.value.currency_short_formats));
  588. generator.set("percent_format", String::number(number_system.value.percent_format));
  589. generator.set("scientific_format", String::number(number_system.value.scientific_format));
  590. generator.append(R"~~~(
  591. { @system@, {)~~~");
  592. for (auto const& symbol : locale_data.numeric_symbols) {
  593. auto index = number_system.value.symbols.get(symbol).value_or(0);
  594. generator.set("index", String::number(index));
  595. generator.append(" @index@,");
  596. }
  597. generator.append(" }, @primary_grouping_size@, @secondary_grouping_size@, ");
  598. generator.append("@decimal_format@, @decimal_long_formats@, @decimal_short_formats@, ");
  599. generator.append("@currency_format@, @accounting_format@, @currency_unit_formats@, @currency_short_formats@, ");
  600. generator.append("@percent_format@, @scientific_format@ },");
  601. }
  602. generator.append(R"~~~(
  603. } };
  604. )~~~");
  605. };
  606. auto append_units = [&](String name, auto const& units) {
  607. auto format_name = [&](String unit, StringView format) {
  608. unit = unit.replace("-"sv, "_"sv, true);
  609. return String::formatted("{}_{}_{}", name, unit, format);
  610. };
  611. for (auto const& unit : units) {
  612. append_number_formats(format_name(unit.key, "l"sv), unit.value.long_formats);
  613. append_number_formats(format_name(unit.key, "s"sv), unit.value.short_formats);
  614. append_number_formats(format_name(unit.key, "n"sv), unit.value.narrow_formats);
  615. }
  616. generator.set("name", name);
  617. generator.set("size", String::number(units.size()));
  618. generator.append(R"~~~(
  619. static constexpr Array<Unit, @size@> @name@ { {)~~~");
  620. for (auto const& unit : units) {
  621. generator.set("unit"sv, String::number(unit.value.unit));
  622. generator.set("long_formats"sv, format_name(unit.key, "l"sv));
  623. generator.set("short_formats"sv, format_name(unit.key, "s"sv));
  624. generator.set("narrow_formats"sv, format_name(unit.key, "n"sv));
  625. generator.append(R"~~~(
  626. { @unit@, @long_formats@.span(), @short_formats@.span(), @narrow_formats@.span() },)~~~");
  627. }
  628. generator.append(R"~~~(
  629. } };
  630. )~~~");
  631. };
  632. generate_mapping(generator, locale_data.locales, "NumberSystem"sv, "s_number_systems"sv, "s_number_systems_{}", [&](auto const& name, auto const& value) { append_number_systems(name, value.number_systems); });
  633. generate_mapping(generator, locale_data.locales, "Unit"sv, "s_units"sv, "s_units_{}", [&](auto const& name, auto const& value) { append_units(name, value.units); });
  634. auto append_from_string = [&](StringView enum_title, StringView enum_snake, auto const& values) {
  635. HashValueMap<String> hashes;
  636. hashes.ensure_capacity(values.size());
  637. for (auto const& value : values)
  638. hashes.set(value.hash(), format_identifier(enum_title, value));
  639. generate_value_from_string(generator, "{}_from_string"sv, enum_title, enum_snake, move(hashes));
  640. };
  641. append_from_string("NumericSymbol"sv, "numeric_symbol"sv, locale_data.numeric_symbols);
  642. generator.append(R"~~~(
  643. static NumberSystem const* find_number_system(StringView locale, StringView system)
  644. {
  645. auto locale_value = locale_from_string(locale);
  646. if (!locale_value.has_value())
  647. return nullptr;
  648. auto locale_index = to_underlying(*locale_value) - 1; // Subtract 1 because 0 == Locale::None.
  649. auto const& number_systems = s_number_systems.at(locale_index);
  650. for (auto const& number_system : number_systems) {
  651. if (system == s_string_list[number_system.system])
  652. return &number_system;
  653. };
  654. return nullptr;
  655. }
  656. Optional<StringView> get_number_system_symbol(StringView locale, StringView system, StringView symbol)
  657. {
  658. auto symbol_value = numeric_symbol_from_string(symbol);
  659. if (!symbol_value.has_value())
  660. return {};
  661. if (auto const* number_system = find_number_system(locale, system); number_system != nullptr) {
  662. auto symbol_index = to_underlying(*symbol_value);
  663. return s_string_list[number_system->symbols[symbol_index]];
  664. }
  665. return {};
  666. }
  667. Optional<NumberGroupings> get_number_system_groupings(StringView locale, StringView system)
  668. {
  669. if (auto const* number_system = find_number_system(locale, system); number_system != nullptr)
  670. return NumberGroupings { number_system->primary_grouping_size, number_system->secondary_grouping_size };
  671. return {};
  672. }
  673. Optional<Unicode::NumberFormat> get_standard_number_system_format(StringView locale, StringView system, StandardNumberFormatType type)
  674. {
  675. if (auto const* number_system = find_number_system(locale, system); number_system != nullptr) {
  676. @number_format_index_type@ format_index = 0;
  677. switch (type) {
  678. case StandardNumberFormatType::Decimal:
  679. format_index = number_system->decimal_format;
  680. break;
  681. case StandardNumberFormatType::Currency:
  682. format_index = number_system->currency_format;
  683. break;
  684. case StandardNumberFormatType::Accounting:
  685. format_index = number_system->accounting_format;
  686. break;
  687. case StandardNumberFormatType::Percent:
  688. format_index = number_system->percent_format;
  689. break;
  690. case StandardNumberFormatType::Scientific:
  691. format_index = number_system->scientific_format;
  692. break;
  693. }
  694. return s_number_formats[format_index].to_unicode_number_format();
  695. }
  696. return {};
  697. }
  698. Vector<Unicode::NumberFormat> get_compact_number_system_formats(StringView locale, StringView system, CompactNumberFormatType type)
  699. {
  700. Vector<Unicode::NumberFormat> formats;
  701. if (auto const* number_system = find_number_system(locale, system); number_system != nullptr) {
  702. @number_format_list_index_type@ number_format_list_index { 0 };
  703. switch (type) {
  704. case CompactNumberFormatType::DecimalLong:
  705. number_format_list_index = number_system->decimal_long_formats;
  706. break;
  707. case CompactNumberFormatType::DecimalShort:
  708. number_format_list_index = number_system->decimal_short_formats;
  709. break;
  710. case CompactNumberFormatType::CurrencyUnit:
  711. number_format_list_index = number_system->currency_unit_formats;
  712. break;
  713. case CompactNumberFormatType::CurrencyShort:
  714. number_format_list_index = number_system->currency_short_formats;
  715. break;
  716. }
  717. auto number_formats = s_number_format_lists.at(number_format_list_index);
  718. formats.ensure_capacity(number_formats.size());
  719. for (auto number_format : number_formats)
  720. formats.append(s_number_formats[number_format].to_unicode_number_format());
  721. }
  722. return formats;
  723. }
  724. static Unit const* find_units(StringView locale, StringView unit)
  725. {
  726. auto locale_value = locale_from_string(locale);
  727. if (!locale_value.has_value())
  728. return nullptr;
  729. auto locale_index = to_underlying(*locale_value) - 1; // Subtract 1 because 0 == Locale::None.
  730. auto const& locale_units = s_units.at(locale_index);
  731. for (auto const& units : locale_units) {
  732. if (unit == s_string_list[units.unit])
  733. return &units;
  734. };
  735. return nullptr;
  736. }
  737. Vector<Unicode::NumberFormat> get_unit_formats(StringView locale, StringView unit, Style style)
  738. {
  739. Vector<Unicode::NumberFormat> formats;
  740. if (auto const* units = find_units(locale, unit); units != nullptr) {
  741. Span<@number_format_index_type@ const> number_formats;
  742. switch (style) {
  743. case Style::Long:
  744. number_formats = units->long_formats;
  745. break;
  746. case Style::Short:
  747. number_formats = units->short_formats;
  748. break;
  749. case Style::Narrow:
  750. number_formats = units->narrow_formats;
  751. break;
  752. default:
  753. VERIFY_NOT_REACHED();
  754. }
  755. formats.ensure_capacity(number_formats.size());
  756. for (auto number_format : number_formats)
  757. formats.append(s_number_formats[number_format].to_unicode_number_format());
  758. }
  759. return formats;
  760. }
  761. }
  762. )~~~");
  763. VERIFY(file.write(generator.as_string_view()));
  764. }
  765. ErrorOr<int> serenity_main(Main::Arguments arguments)
  766. {
  767. StringView generated_header_path = nullptr;
  768. StringView generated_implementation_path = nullptr;
  769. StringView numbers_path = nullptr;
  770. StringView units_path = nullptr;
  771. Core::ArgsParser args_parser;
  772. args_parser.add_option(generated_header_path, "Path to the Unicode locale header file to generate", "generated-header-path", 'h', "generated-header-path");
  773. args_parser.add_option(generated_implementation_path, "Path to the Unicode locale implementation file to generate", "generated-implementation-path", 'c', "generated-implementation-path");
  774. args_parser.add_option(numbers_path, "Path to cldr-numbers directory", "numbers-path", 'n', "numbers-path");
  775. args_parser.add_option(units_path, "Path to cldr-units directory", "units-path", 'u', "units-path");
  776. args_parser.parse(arguments);
  777. auto open_file = [&](StringView path) -> ErrorOr<NonnullRefPtr<Core::File>> {
  778. if (path.is_empty()) {
  779. args_parser.print_usage(stderr, arguments.argv[0]);
  780. return Error::from_string_literal("Must provide all command line options"sv);
  781. }
  782. return Core::File::open(path, Core::OpenMode::ReadWrite);
  783. };
  784. auto generated_header_file = TRY(open_file(generated_header_path));
  785. auto generated_implementation_file = TRY(open_file(generated_implementation_path));
  786. UnicodeLocaleData locale_data;
  787. TRY(parse_all_locales(numbers_path, units_path, locale_data));
  788. generate_unicode_locale_header(generated_header_file, locale_data);
  789. generate_unicode_locale_implementation(generated_implementation_file, locale_data);
  790. return 0;
  791. }