GenerateUnicodeNumberFormat.cpp 42 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057
  1. /*
  2. * Copyright (c) 2021, Tim Flynn <trflynn89@pm.me>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include "GeneratorUtil.h"
  7. #include <AK/AllOf.h>
  8. #include <AK/Array.h>
  9. #include <AK/CharacterTypes.h>
  10. #include <AK/Find.h>
  11. #include <AK/Format.h>
  12. #include <AK/HashFunctions.h>
  13. #include <AK/HashMap.h>
  14. #include <AK/JsonObject.h>
  15. #include <AK/JsonParser.h>
  16. #include <AK/JsonValue.h>
  17. #include <AK/LexicalPath.h>
  18. #include <AK/QuickSort.h>
  19. #include <AK/SourceGenerator.h>
  20. #include <AK/String.h>
  21. #include <AK/StringBuilder.h>
  22. #include <AK/Traits.h>
  23. #include <AK/Utf8View.h>
  24. #include <LibCore/ArgsParser.h>
  25. #include <LibCore/DirIterator.h>
  26. #include <LibCore/File.h>
  27. #include <LibUnicode/Locale.h>
  28. #include <LibUnicode/NumberFormat.h>
  29. #include <math.h>
  30. using StringIndexType = u16;
  31. constexpr auto s_string_index_type = "u16"sv;
  32. using NumberFormatIndexType = u16;
  33. constexpr auto s_number_format_index_type = "u16"sv;
  34. using NumberFormatListIndexType = u16;
  35. constexpr auto s_number_format_list_index_type = "u16"sv;
  36. using NumericSymbolListIndexType = u8;
  37. constexpr auto s_numeric_symbol_list_index_type = "u8"sv;
  38. using NumberSystemIndexType = u8;
  39. constexpr auto s_number_system_index_type = "u8"sv;
  40. using UnitIndexType = u16;
  41. constexpr auto s_unit_index_type = "u16"sv;
  42. enum class NumberFormatType {
  43. Standard,
  44. Compact,
  45. };
  46. struct NumberFormat : public Unicode::NumberFormat {
  47. using Base = Unicode::NumberFormat;
  48. static Base::Plurality plurality_from_string(StringView plurality)
  49. {
  50. if (plurality == "other"sv)
  51. return Base::Plurality::Other;
  52. if (plurality == "1"sv)
  53. return Base::Plurality::Single;
  54. if (plurality == "zero"sv)
  55. return Base::Plurality::Zero;
  56. if (plurality == "one"sv)
  57. return Base::Plurality::One;
  58. if (plurality == "two"sv)
  59. return Base::Plurality::Two;
  60. if (plurality == "few"sv)
  61. return Base::Plurality::Few;
  62. if (plurality == "many"sv)
  63. return Base::Plurality::Many;
  64. VERIFY_NOT_REACHED();
  65. }
  66. unsigned hash() const
  67. {
  68. auto hash = pair_int_hash(magnitude, exponent);
  69. hash = pair_int_hash(hash, static_cast<u8>(plurality));
  70. hash = pair_int_hash(hash, zero_format_index);
  71. hash = pair_int_hash(hash, positive_format_index);
  72. hash = pair_int_hash(hash, negative_format_index);
  73. for (auto index : identifier_indices)
  74. hash = pair_int_hash(hash, index);
  75. return hash;
  76. }
  77. bool operator==(NumberFormat const& other) const
  78. {
  79. return (magnitude == other.magnitude)
  80. && (exponent == other.exponent)
  81. && (plurality == other.plurality)
  82. && (zero_format_index == other.zero_format_index)
  83. && (positive_format_index == other.positive_format_index)
  84. && (negative_format_index == other.negative_format_index)
  85. && (identifier_indices == other.identifier_indices);
  86. }
  87. StringIndexType zero_format_index { 0 };
  88. StringIndexType positive_format_index { 0 };
  89. StringIndexType negative_format_index { 0 };
  90. Vector<StringIndexType> identifier_indices {};
  91. };
  92. template<>
  93. struct AK::Formatter<NumberFormat> : Formatter<FormatString> {
  94. ErrorOr<void> format(FormatBuilder& builder, NumberFormat const& format)
  95. {
  96. StringBuilder identifier_indices;
  97. identifier_indices.join(", "sv, format.identifier_indices);
  98. return Formatter<FormatString>::format(builder,
  99. "{{ {}, {}, {}, {}, {}, {}, {{ {} }} }}",
  100. format.magnitude,
  101. format.exponent,
  102. static_cast<u8>(format.plurality),
  103. format.zero_format_index,
  104. format.positive_format_index,
  105. format.negative_format_index,
  106. identifier_indices.build());
  107. }
  108. };
  109. template<>
  110. struct AK::Traits<NumberFormat> : public GenericTraits<NumberFormat> {
  111. static unsigned hash(NumberFormat const& f) { return f.hash(); }
  112. };
  113. using NumberFormatList = Vector<NumberFormatIndexType>;
  114. template<>
  115. struct AK::Traits<NumberFormatList> : public GenericTraits<NumberFormatList> {
  116. static unsigned hash(NumberFormatList const& formats)
  117. {
  118. auto hash = int_hash(static_cast<u32>(formats.size()));
  119. for (auto format : formats)
  120. hash = pair_int_hash(hash, format);
  121. return hash;
  122. }
  123. };
  124. using NumericSymbolList = Vector<StringIndexType>;
  125. struct NumberSystem {
  126. unsigned hash() const
  127. {
  128. auto hash = pair_int_hash(system, symbols);
  129. hash = pair_int_hash(hash, primary_grouping_size);
  130. hash = pair_int_hash(hash, secondary_grouping_size);
  131. hash = pair_int_hash(hash, decimal_format);
  132. hash = pair_int_hash(hash, decimal_long_formats);
  133. hash = pair_int_hash(hash, decimal_short_formats);
  134. hash = pair_int_hash(hash, currency_format);
  135. hash = pair_int_hash(hash, accounting_format);
  136. hash = pair_int_hash(hash, currency_unit_formats);
  137. hash = pair_int_hash(hash, currency_short_formats);
  138. hash = pair_int_hash(hash, percent_format);
  139. hash = pair_int_hash(hash, scientific_format);
  140. return hash;
  141. }
  142. bool operator==(NumberSystem const& other) const
  143. {
  144. return (system == other.system)
  145. && (symbols == other.symbols)
  146. && (primary_grouping_size == other.primary_grouping_size)
  147. && (secondary_grouping_size == other.secondary_grouping_size)
  148. && (decimal_format == other.decimal_format)
  149. && (decimal_long_formats == other.decimal_long_formats)
  150. && (decimal_short_formats == other.decimal_short_formats)
  151. && (currency_format == other.currency_format)
  152. && (accounting_format == other.accounting_format)
  153. && (currency_unit_formats == other.currency_unit_formats)
  154. && (currency_short_formats == other.currency_short_formats)
  155. && (percent_format == other.percent_format)
  156. && (scientific_format == other.scientific_format);
  157. }
  158. StringIndexType system { 0 };
  159. NumericSymbolListIndexType symbols { 0 };
  160. u8 primary_grouping_size { 0 };
  161. u8 secondary_grouping_size { 0 };
  162. NumberFormatIndexType decimal_format { 0 };
  163. NumberFormatListIndexType decimal_long_formats { 0 };
  164. NumberFormatListIndexType decimal_short_formats { 0 };
  165. NumberFormatIndexType currency_format { 0 };
  166. NumberFormatIndexType accounting_format { 0 };
  167. NumberFormatListIndexType currency_unit_formats { 0 };
  168. NumberFormatListIndexType currency_short_formats { 0 };
  169. NumberFormatIndexType percent_format { 0 };
  170. NumberFormatIndexType scientific_format { 0 };
  171. };
  172. template<>
  173. struct AK::Formatter<NumberSystem> : Formatter<FormatString> {
  174. ErrorOr<void> format(FormatBuilder& builder, NumberSystem const& system)
  175. {
  176. return Formatter<FormatString>::format(builder,
  177. "{{ {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {} }}",
  178. system.system,
  179. system.symbols,
  180. system.primary_grouping_size,
  181. system.secondary_grouping_size,
  182. system.decimal_format,
  183. system.decimal_long_formats,
  184. system.decimal_short_formats,
  185. system.currency_format,
  186. system.accounting_format,
  187. system.currency_unit_formats,
  188. system.currency_short_formats,
  189. system.percent_format,
  190. system.scientific_format);
  191. }
  192. };
  193. template<>
  194. struct AK::Traits<NumberSystem> : public GenericTraits<NumberSystem> {
  195. static unsigned hash(NumberSystem const& s) { return s.hash(); }
  196. };
  197. struct Unit {
  198. unsigned hash() const
  199. {
  200. auto hash = int_hash(unit);
  201. hash = pair_int_hash(hash, long_formats);
  202. hash = pair_int_hash(hash, short_formats);
  203. hash = pair_int_hash(hash, narrow_formats);
  204. return hash;
  205. }
  206. bool operator==(Unit const& other) const
  207. {
  208. return (unit == other.unit)
  209. && (long_formats == other.long_formats)
  210. && (short_formats == other.short_formats)
  211. && (narrow_formats == other.narrow_formats);
  212. }
  213. StringIndexType unit { 0 };
  214. NumberFormatListIndexType long_formats { 0 };
  215. NumberFormatListIndexType short_formats { 0 };
  216. NumberFormatListIndexType narrow_formats { 0 };
  217. };
  218. template<>
  219. struct AK::Formatter<Unit> : Formatter<FormatString> {
  220. ErrorOr<void> format(FormatBuilder& builder, Unit const& system)
  221. {
  222. return Formatter<FormatString>::format(builder,
  223. "{{ {}, {}, {}, {} }}",
  224. system.unit,
  225. system.long_formats,
  226. system.short_formats,
  227. system.narrow_formats);
  228. }
  229. };
  230. template<>
  231. struct AK::Traits<Unit> : public GenericTraits<Unit> {
  232. static unsigned hash(Unit const& u) { return u.hash(); }
  233. };
  234. struct Locale {
  235. HashMap<String, NumberSystemIndexType> number_systems;
  236. HashMap<String, UnitIndexType> units {};
  237. };
  238. struct UnicodeLocaleData {
  239. UniqueStringStorage<StringIndexType> unique_strings;
  240. UniqueStorage<NumberFormat, NumberFormatIndexType> unique_formats;
  241. UniqueStorage<NumberFormatList, NumberFormatListIndexType> unique_format_lists;
  242. UniqueStorage<NumericSymbolList, NumericSymbolListIndexType> unique_symbols;
  243. UniqueStorage<NumberSystem, NumberSystemIndexType> unique_systems;
  244. UniqueStorage<Unit, UnitIndexType> unique_units;
  245. HashMap<String, Locale> locales;
  246. size_t max_identifier_count { 0 };
  247. };
  248. static String parse_identifiers(String pattern, StringView replacement, UnicodeLocaleData& locale_data, NumberFormat& format)
  249. {
  250. static Utf8View whitespace { "\u0020\u00a0\u200f"sv };
  251. while (true) {
  252. Utf8View utf8_pattern { pattern };
  253. Optional<size_t> start_index;
  254. Optional<size_t> end_index;
  255. bool inside_replacement = false;
  256. for (auto it = utf8_pattern.begin(); it != utf8_pattern.end(); ++it) {
  257. if (*it == '{') {
  258. if (start_index.has_value()) {
  259. end_index = utf8_pattern.byte_offset_of(it);
  260. break;
  261. }
  262. inside_replacement = true;
  263. } else if (*it == '}') {
  264. inside_replacement = false;
  265. } else if (!inside_replacement && !start_index.has_value() && !whitespace.contains(*it)) {
  266. start_index = utf8_pattern.byte_offset_of(it);
  267. }
  268. }
  269. if (!start_index.has_value())
  270. return pattern;
  271. end_index = end_index.value_or(pattern.length());
  272. utf8_pattern = utf8_pattern.substring_view(*start_index, *end_index - *start_index);
  273. utf8_pattern = utf8_pattern.trim(whitespace);
  274. auto identifier = utf8_pattern.as_string().replace("'.'"sv, "."sv);
  275. auto identifier_index = locale_data.unique_strings.ensure(move(identifier));
  276. size_t replacement_index = 0;
  277. if (auto index = format.identifier_indices.find_first_index(identifier_index); index.has_value()) {
  278. replacement_index = *index;
  279. } else {
  280. replacement_index = format.identifier_indices.size();
  281. format.identifier_indices.append(identifier_index);
  282. locale_data.max_identifier_count = max(locale_data.max_identifier_count, format.identifier_indices.size());
  283. }
  284. pattern = String::formatted("{}{{{}:{}}}{}",
  285. *start_index > 0 ? pattern.substring_view(0, *start_index) : ""sv,
  286. replacement,
  287. replacement_index,
  288. pattern.substring_view(*start_index + utf8_pattern.byte_length()));
  289. }
  290. }
  291. static void parse_number_pattern(Vector<String> patterns, UnicodeLocaleData& locale_data, NumberFormatType type, NumberFormat& format, NumberSystem* number_system_for_groupings = nullptr)
  292. {
  293. // https://unicode.org/reports/tr35/tr35-numbers.html#Number_Format_Patterns
  294. // https://cldr.unicode.org/translation/number-currency-formats/number-and-currency-patterns
  295. VERIFY((patterns.size() == 1) || (patterns.size() == 2));
  296. auto replace_patterns = [&](String pattern) {
  297. static HashMap<StringView, StringView> replacements = {
  298. { "{0}"sv, "{number}"sv },
  299. { "{1}"sv, "{currency}"sv },
  300. { "%"sv, "{percentSign}"sv },
  301. { "+"sv, "{plusSign}"sv },
  302. { "-"sv, "{minusSign}"sv },
  303. { "¤"sv, "{currency}"sv }, // U+00A4 Currency Sign
  304. { "E"sv, "{scientificSeparator}"sv },
  305. };
  306. for (auto const& replacement : replacements)
  307. pattern = pattern.replace(replacement.key, replacement.value, true);
  308. if (auto start_number_index = pattern.find_any_of("#0"sv, String::SearchDirection::Forward); start_number_index.has_value()) {
  309. auto end_number_index = *start_number_index + 1;
  310. for (; end_number_index < pattern.length(); ++end_number_index) {
  311. auto ch = pattern[end_number_index];
  312. if ((ch != '#') && (ch != '0') && (ch != ',') && (ch != '.'))
  313. break;
  314. }
  315. if (number_system_for_groupings) {
  316. auto number_pattern = pattern.substring_view(*start_number_index, end_number_index - *start_number_index);
  317. auto group_separators = number_pattern.find_all(","sv);
  318. VERIFY((group_separators.size() == 1) || (group_separators.size() == 2));
  319. auto decimal = number_pattern.find('.');
  320. VERIFY(decimal.has_value());
  321. if (group_separators.size() == 1) {
  322. number_system_for_groupings->primary_grouping_size = *decimal - group_separators[0] - 1;
  323. number_system_for_groupings->secondary_grouping_size = number_system_for_groupings->primary_grouping_size;
  324. } else {
  325. number_system_for_groupings->primary_grouping_size = *decimal - group_separators[1] - 1;
  326. number_system_for_groupings->secondary_grouping_size = group_separators[1] - group_separators[0] - 1;
  327. }
  328. }
  329. pattern = String::formatted("{}{{number}}{}",
  330. *start_number_index > 0 ? pattern.substring_view(0, *start_number_index) : ""sv,
  331. pattern.substring_view(end_number_index));
  332. // This is specifically handled here rather than in the replacements HashMap above so
  333. // that we do not errantly replace zeroes in number patterns.
  334. if (pattern.contains(*replacements.get("E"sv)))
  335. pattern = pattern.replace("0"sv, "{scientificExponent}"sv);
  336. }
  337. if (type == NumberFormatType::Compact)
  338. return parse_identifiers(move(pattern), "compactIdentifier"sv, locale_data, format);
  339. return pattern;
  340. };
  341. auto zero_format = replace_patterns(move(patterns[0]));
  342. format.positive_format_index = locale_data.unique_strings.ensure(String::formatted("{{plusSign}}{}", zero_format));
  343. if (patterns.size() == 2) {
  344. auto negative_format = replace_patterns(move(patterns[1]));
  345. format.negative_format_index = locale_data.unique_strings.ensure(move(negative_format));
  346. } else {
  347. format.negative_format_index = locale_data.unique_strings.ensure(String::formatted("{{minusSign}}{}", zero_format));
  348. }
  349. format.zero_format_index = locale_data.unique_strings.ensure(move(zero_format));
  350. }
  351. static void parse_number_pattern(Vector<String> patterns, UnicodeLocaleData& locale_data, NumberFormatType type, NumberFormatIndexType& format_index, NumberSystem* number_system_for_groupings = nullptr)
  352. {
  353. NumberFormat format {};
  354. parse_number_pattern(move(patterns), locale_data, type, format, number_system_for_groupings);
  355. format_index = locale_data.unique_formats.ensure(move(format));
  356. }
  357. static ErrorOr<void> parse_number_systems(String locale_numbers_path, UnicodeLocaleData& locale_data, Locale& locale)
  358. {
  359. LexicalPath numbers_path(move(locale_numbers_path));
  360. numbers_path = numbers_path.append("numbers.json"sv);
  361. auto numbers_file = TRY(Core::File::open(numbers_path.string(), Core::OpenMode::ReadOnly));
  362. auto numbers = TRY(JsonValue::from_string(numbers_file->read_all()));
  363. auto const& main_object = numbers.as_object().get("main"sv);
  364. auto const& locale_object = main_object.as_object().get(numbers_path.parent().basename());
  365. auto const& locale_numbers_object = locale_object.as_object().get("numbers"sv);
  366. HashMap<String, NumberSystem> number_systems;
  367. auto ensure_number_system = [&](auto const& system) -> NumberSystem& {
  368. return number_systems.ensure(system, [&]() {
  369. auto system_index = locale_data.unique_strings.ensure(system);
  370. return NumberSystem { .system = system_index };
  371. });
  372. };
  373. auto parse_number_format = [&](auto const& format_object) {
  374. Vector<NumberFormatIndexType> result;
  375. result.ensure_capacity(format_object.size());
  376. format_object.for_each_member([&](auto const& key, JsonValue const& value) {
  377. auto split_key = key.split_view('-');
  378. if (split_key.size() != 3)
  379. return;
  380. auto patterns = value.as_string().split(';');
  381. NumberFormat format {};
  382. if (auto type = split_key[0].template to_uint<u64>(); type.has_value()) {
  383. VERIFY(*type % 10 == 0);
  384. format.magnitude = static_cast<u8>(log10(*type));
  385. if (patterns[0] != "0"sv) {
  386. auto number_of_zeroes_in_pattern = patterns[0].count("0"sv);
  387. VERIFY(format.magnitude >= number_of_zeroes_in_pattern);
  388. format.exponent = format.magnitude + 1 - number_of_zeroes_in_pattern;
  389. }
  390. } else {
  391. VERIFY(split_key[0] == "unitPattern"sv);
  392. }
  393. format.plurality = NumberFormat::plurality_from_string(split_key[2]);
  394. parse_number_pattern(move(patterns), locale_data, NumberFormatType::Compact, format);
  395. auto format_index = locale_data.unique_formats.ensure(move(format));
  396. result.append(format_index);
  397. });
  398. return locale_data.unique_format_lists.ensure(move(result));
  399. };
  400. auto numeric_symbol_from_string = [&](StringView numeric_symbol) -> Optional<Unicode::NumericSymbol> {
  401. if (numeric_symbol == "decimal"sv)
  402. return Unicode::NumericSymbol::Decimal;
  403. if (numeric_symbol == "exponential"sv)
  404. return Unicode::NumericSymbol::Exponential;
  405. if (numeric_symbol == "group"sv)
  406. return Unicode::NumericSymbol::Group;
  407. if (numeric_symbol == "infinity"sv)
  408. return Unicode::NumericSymbol::Infinity;
  409. if (numeric_symbol == "minusSign"sv)
  410. return Unicode::NumericSymbol::MinusSign;
  411. if (numeric_symbol == "nan"sv)
  412. return Unicode::NumericSymbol::NaN;
  413. if (numeric_symbol == "percentSign"sv)
  414. return Unicode::NumericSymbol::PercentSign;
  415. if (numeric_symbol == "plusSign"sv)
  416. return Unicode::NumericSymbol::PlusSign;
  417. return {};
  418. };
  419. locale_numbers_object.as_object().for_each_member([&](auto const& key, JsonValue const& value) {
  420. constexpr auto symbols_prefix = "symbols-numberSystem-"sv;
  421. constexpr auto decimal_formats_prefix = "decimalFormats-numberSystem-"sv;
  422. constexpr auto currency_formats_prefix = "currencyFormats-numberSystem-"sv;
  423. constexpr auto percent_formats_prefix = "percentFormats-numberSystem-"sv;
  424. constexpr auto scientific_formats_prefix = "scientificFormats-numberSystem-"sv;
  425. if (key.starts_with(symbols_prefix)) {
  426. auto system = key.substring(symbols_prefix.length());
  427. auto& number_system = ensure_number_system(system);
  428. NumericSymbolList symbols;
  429. value.as_object().for_each_member([&](auto const& symbol, JsonValue const& localization) {
  430. auto numeric_symbol = numeric_symbol_from_string(symbol);
  431. if (!numeric_symbol.has_value())
  432. return;
  433. if (to_underlying(*numeric_symbol) >= symbols.size())
  434. symbols.resize(to_underlying(*numeric_symbol) + 1);
  435. auto symbol_index = locale_data.unique_strings.ensure(localization.as_string());
  436. symbols[to_underlying(*numeric_symbol)] = symbol_index;
  437. });
  438. number_system.symbols = locale_data.unique_symbols.ensure(move(symbols));
  439. } else if (key.starts_with(decimal_formats_prefix)) {
  440. auto system = key.substring(decimal_formats_prefix.length());
  441. auto& number_system = ensure_number_system(system);
  442. auto format_object = value.as_object().get("standard"sv);
  443. parse_number_pattern(format_object.as_string().split(';'), locale_data, NumberFormatType::Standard, number_system.decimal_format, &number_system);
  444. auto const& long_format = value.as_object().get("long"sv).as_object().get("decimalFormat"sv);
  445. number_system.decimal_long_formats = parse_number_format(long_format.as_object());
  446. auto const& short_format = value.as_object().get("short"sv).as_object().get("decimalFormat"sv);
  447. number_system.decimal_short_formats = parse_number_format(short_format.as_object());
  448. } else if (key.starts_with(currency_formats_prefix)) {
  449. auto system = key.substring(currency_formats_prefix.length());
  450. auto& number_system = ensure_number_system(system);
  451. auto format_object = value.as_object().get("standard"sv);
  452. parse_number_pattern(format_object.as_string().split(';'), locale_data, NumberFormatType::Standard, number_system.currency_format);
  453. format_object = value.as_object().get("accounting"sv);
  454. parse_number_pattern(format_object.as_string().split(';'), locale_data, NumberFormatType::Standard, number_system.accounting_format);
  455. number_system.currency_unit_formats = parse_number_format(value.as_object());
  456. if (value.as_object().has("short"sv)) {
  457. auto const& short_format = value.as_object().get("short"sv).as_object().get("standard"sv);
  458. number_system.currency_short_formats = parse_number_format(short_format.as_object());
  459. }
  460. } else if (key.starts_with(percent_formats_prefix)) {
  461. auto system = key.substring(percent_formats_prefix.length());
  462. auto& number_system = ensure_number_system(system);
  463. auto format_object = value.as_object().get("standard"sv);
  464. parse_number_pattern(format_object.as_string().split(';'), locale_data, NumberFormatType::Standard, number_system.percent_format);
  465. } else if (key.starts_with(scientific_formats_prefix)) {
  466. auto system = key.substring(scientific_formats_prefix.length());
  467. auto& number_system = ensure_number_system(system);
  468. auto format_object = value.as_object().get("standard"sv);
  469. parse_number_pattern(format_object.as_string().split(';'), locale_data, NumberFormatType::Standard, number_system.scientific_format);
  470. }
  471. });
  472. for (auto& number_system : number_systems) {
  473. auto system_index = locale_data.unique_systems.ensure(move(number_system.value));
  474. locale.number_systems.set(number_system.key, system_index);
  475. }
  476. return {};
  477. }
  478. static ErrorOr<void> parse_units(String locale_units_path, UnicodeLocaleData& locale_data, Locale& locale)
  479. {
  480. LexicalPath units_path(move(locale_units_path));
  481. units_path = units_path.append("units.json"sv);
  482. auto units_file = TRY(Core::File::open(units_path.string(), Core::OpenMode::ReadOnly));
  483. auto locale_units = TRY(JsonValue::from_string(units_file->read_all()));
  484. auto const& main_object = locale_units.as_object().get("main"sv);
  485. auto const& locale_object = main_object.as_object().get(units_path.parent().basename());
  486. auto const& locale_units_object = locale_object.as_object().get("units"sv);
  487. auto const& long_object = locale_units_object.as_object().get("long"sv);
  488. auto const& short_object = locale_units_object.as_object().get("short"sv);
  489. auto const& narrow_object = locale_units_object.as_object().get("narrow"sv);
  490. HashMap<String, Unit> units;
  491. auto ensure_unit = [&](auto const& unit) -> Unit& {
  492. return units.ensure(unit, [&]() {
  493. auto unit_index = locale_data.unique_strings.ensure(unit);
  494. return Unit { .unit = unit_index };
  495. });
  496. };
  497. auto is_sanctioned_unit = [](StringView unit_name) {
  498. // This is a copy of the units sanctioned for use within ECMA-402. LibUnicode generally tries to
  499. // avoid being directly dependent on ECMA-402, but this rather significantly reduces the amount
  500. // of data generated here, and ECMA-402 is currently the only consumer of this data.
  501. // https://tc39.es/ecma402/#table-sanctioned-simple-unit-identifiers
  502. constexpr auto sanctioned_units = AK::Array { "acre"sv, "bit"sv, "byte"sv, "celsius"sv, "centimeter"sv, "day"sv, "degree"sv, "fahrenheit"sv, "fluid-ounce"sv, "foot"sv, "gallon"sv, "gigabit"sv, "gigabyte"sv, "gram"sv, "hectare"sv, "hour"sv, "inch"sv, "kilobit"sv, "kilobyte"sv, "kilogram"sv, "kilometer"sv, "liter"sv, "megabit"sv, "megabyte"sv, "meter"sv, "mile"sv, "mile-scandinavian"sv, "milliliter"sv, "millimeter"sv, "millisecond"sv, "minute"sv, "month"sv, "ounce"sv, "percent"sv, "petabyte"sv, "pound"sv, "second"sv, "stone"sv, "terabit"sv, "terabyte"sv, "week"sv, "yard"sv, "year"sv };
  503. return find(sanctioned_units.begin(), sanctioned_units.end(), unit_name) != sanctioned_units.end();
  504. };
  505. auto parse_units_object = [&](auto const& units_object, Unicode::Style style) {
  506. constexpr auto unit_pattern_prefix = "unitPattern-count-"sv;
  507. constexpr auto combined_unit_separator = "-per-"sv;
  508. units_object.for_each_member([&](auto const& key, JsonValue const& value) {
  509. auto end_of_category = key.find('-');
  510. if (!end_of_category.has_value())
  511. return;
  512. auto unit_name = key.substring(*end_of_category + 1);
  513. if (!is_sanctioned_unit(unit_name)) {
  514. auto indices = unit_name.find_all(combined_unit_separator);
  515. if (indices.size() != 1)
  516. return;
  517. auto numerator = unit_name.substring_view(0, indices[0]);
  518. auto denominator = unit_name.substring_view(indices[0] + combined_unit_separator.length());
  519. if (!is_sanctioned_unit(numerator) || !is_sanctioned_unit(denominator))
  520. return;
  521. }
  522. auto& unit = ensure_unit(unit_name);
  523. NumberFormatList formats;
  524. value.as_object().for_each_member([&](auto const& unit_key, JsonValue const& pattern_value) {
  525. if (!unit_key.starts_with(unit_pattern_prefix))
  526. return;
  527. NumberFormat format {};
  528. auto plurality = unit_key.substring_view(unit_pattern_prefix.length());
  529. format.plurality = NumberFormat::plurality_from_string(plurality);
  530. auto zero_format = pattern_value.as_string().replace("{0}"sv, "{number}"sv);
  531. zero_format = parse_identifiers(zero_format, "unitIdentifier"sv, locale_data, format);
  532. format.positive_format_index = locale_data.unique_strings.ensure(zero_format.replace("{number}"sv, "{plusSign}{number}"sv));
  533. format.negative_format_index = locale_data.unique_strings.ensure(zero_format.replace("{number}"sv, "{minusSign}{number}"sv));
  534. format.zero_format_index = locale_data.unique_strings.ensure(move(zero_format));
  535. formats.append(locale_data.unique_formats.ensure(move(format)));
  536. });
  537. auto number_format_list_index = locale_data.unique_format_lists.ensure(move(formats));
  538. switch (style) {
  539. case Unicode::Style::Long:
  540. unit.long_formats = number_format_list_index;
  541. break;
  542. case Unicode::Style::Short:
  543. unit.short_formats = number_format_list_index;
  544. break;
  545. case Unicode::Style::Narrow:
  546. unit.narrow_formats = number_format_list_index;
  547. break;
  548. default:
  549. VERIFY_NOT_REACHED();
  550. }
  551. });
  552. };
  553. parse_units_object(long_object.as_object(), Unicode::Style::Long);
  554. parse_units_object(short_object.as_object(), Unicode::Style::Short);
  555. parse_units_object(narrow_object.as_object(), Unicode::Style::Narrow);
  556. for (auto& unit : units) {
  557. auto unit_index = locale_data.unique_units.ensure(move(unit.value));
  558. locale.units.set(unit.key, unit_index);
  559. }
  560. return {};
  561. }
  562. static ErrorOr<void> parse_all_locales(String numbers_path, String units_path, UnicodeLocaleData& locale_data)
  563. {
  564. auto numbers_iterator = TRY(path_to_dir_iterator(move(numbers_path)));
  565. auto units_iterator = TRY(path_to_dir_iterator(move(units_path)));
  566. auto remove_variants_from_path = [&](String path) -> ErrorOr<String> {
  567. auto parsed_locale = TRY(CanonicalLanguageID<StringIndexType>::parse(locale_data.unique_strings, LexicalPath::basename(path)));
  568. StringBuilder builder;
  569. builder.append(locale_data.unique_strings.get(parsed_locale.language));
  570. if (auto script = locale_data.unique_strings.get(parsed_locale.script); !script.is_empty())
  571. builder.appendff("-{}", script);
  572. if (auto region = locale_data.unique_strings.get(parsed_locale.region); !region.is_empty())
  573. builder.appendff("-{}", region);
  574. return builder.build();
  575. };
  576. while (numbers_iterator.has_next()) {
  577. auto numbers_path = TRY(next_path_from_dir_iterator(numbers_iterator));
  578. auto language = TRY(remove_variants_from_path(numbers_path));
  579. auto& locale = locale_data.locales.ensure(language);
  580. TRY(parse_number_systems(numbers_path, locale_data, locale));
  581. }
  582. while (units_iterator.has_next()) {
  583. auto units_path = TRY(next_path_from_dir_iterator(units_iterator));
  584. auto language = TRY(remove_variants_from_path(units_path));
  585. auto& locale = locale_data.locales.ensure(language);
  586. TRY(parse_units(units_path, locale_data, locale));
  587. }
  588. return {};
  589. }
  590. static void generate_unicode_locale_header(Core::File& file, UnicodeLocaleData&)
  591. {
  592. StringBuilder builder;
  593. SourceGenerator generator { builder };
  594. generator.append(R"~~~(
  595. #pragma once
  596. #include <AK/Optional.h>
  597. #include <AK/StringView.h>
  598. #include <AK/Vector.h>
  599. #include <LibUnicode/Forward.h>
  600. namespace Unicode {
  601. )~~~");
  602. generator.append(R"~~~(
  603. namespace Detail {
  604. Optional<StringView> get_number_system_symbol(StringView locale, StringView system, Unicode::NumericSymbol symbol);
  605. Optional<NumberGroupings> get_number_system_groupings(StringView locale, StringView system);
  606. Optional<NumberFormat> get_standard_number_system_format(StringView locale, StringView system, StandardNumberFormatType type);
  607. Vector<NumberFormat> get_compact_number_system_formats(StringView locale, StringView system, CompactNumberFormatType type);
  608. Vector<Unicode::NumberFormat> get_unit_formats(StringView locale, StringView unit, Style style);
  609. Optional<NumericSymbol> numeric_symbol_from_string(StringView numeric_symbol);
  610. }
  611. }
  612. )~~~");
  613. VERIFY(file.write(generator.as_string_view()));
  614. }
  615. static void generate_unicode_locale_implementation(Core::File& file, UnicodeLocaleData& locale_data)
  616. {
  617. StringBuilder builder;
  618. SourceGenerator generator { builder };
  619. generator.set("string_index_type"sv, s_string_index_type);
  620. generator.set("number_format_index_type"sv, s_number_format_index_type);
  621. generator.set("number_format_list_index_type"sv, s_number_format_list_index_type);
  622. generator.set("numeric_symbol_list_index_type"sv, s_numeric_symbol_list_index_type);
  623. generator.set("identifier_count", String::number(locale_data.max_identifier_count));
  624. generator.append(R"~~~(
  625. #include <AK/Array.h>
  626. #include <AK/BinarySearch.h>
  627. #include <AK/Span.h>
  628. #include <LibUnicode/Locale.h>
  629. #include <LibUnicode/NumberFormat.h>
  630. #include <LibUnicode/UnicodeNumberFormat.h>
  631. namespace Unicode::Detail {
  632. )~~~");
  633. locale_data.unique_strings.generate(generator);
  634. generator.append(R"~~~(
  635. struct NumberFormat {
  636. Unicode::NumberFormat to_unicode_number_format() const {
  637. Unicode::NumberFormat number_format {};
  638. number_format.magnitude = magnitude;
  639. number_format.exponent = exponent;
  640. number_format.plurality = static_cast<Unicode::NumberFormat::Plurality>(plurality);
  641. number_format.zero_format = s_string_list[zero_format];
  642. number_format.positive_format = s_string_list[positive_format];
  643. number_format.negative_format = s_string_list[negative_format];
  644. number_format.identifiers.ensure_capacity(identifiers.size());
  645. for (@string_index_type@ identifier : identifiers)
  646. number_format.identifiers.append(s_string_list[identifier]);
  647. return number_format;
  648. }
  649. u8 magnitude { 0 };
  650. u8 exponent { 0 };
  651. u8 plurality { 0 };
  652. @string_index_type@ zero_format { 0 };
  653. @string_index_type@ positive_format { 0 };
  654. @string_index_type@ negative_format { 0 };
  655. Array<@string_index_type@, @identifier_count@> identifiers {};
  656. };
  657. struct NumberSystem {
  658. @string_index_type@ system { 0 };
  659. @numeric_symbol_list_index_type@ symbols { 0 };
  660. u8 primary_grouping_size { 0 };
  661. u8 secondary_grouping_size { 0 };
  662. @number_format_index_type@ decimal_format { 0 };
  663. @number_format_list_index_type@ decimal_long_formats { 0 };
  664. @number_format_list_index_type@ decimal_short_formats { 0 };
  665. @number_format_index_type@ currency_format { 0 };
  666. @number_format_index_type@ accounting_format { 0 };
  667. @number_format_list_index_type@ currency_unit_formats { 0 };
  668. @number_format_list_index_type@ currency_short_formats { 0 };
  669. @number_format_index_type@ percent_format { 0 };
  670. @number_format_index_type@ scientific_format { 0 };
  671. };
  672. struct Unit {
  673. @string_index_type@ unit { 0 };
  674. @number_format_list_index_type@ long_formats { 0 };
  675. @number_format_list_index_type@ short_formats { 0 };
  676. @number_format_list_index_type@ narrow_formats { 0 };
  677. };
  678. )~~~");
  679. locale_data.unique_formats.generate(generator, "NumberFormat"sv, "s_number_formats"sv, 10);
  680. locale_data.unique_format_lists.generate(generator, s_number_format_index_type, "s_number_format_lists"sv);
  681. locale_data.unique_symbols.generate(generator, s_string_index_type, "s_numeric_symbol_lists"sv);
  682. locale_data.unique_systems.generate(generator, "NumberSystem"sv, "s_number_systems"sv, 10);
  683. locale_data.unique_units.generate(generator, "Unit"sv, "s_units"sv, 10);
  684. auto append_map = [&](String name, auto type, auto const& map) {
  685. generator.set("name", name);
  686. generator.set("type", type);
  687. generator.set("size", String::number(map.size()));
  688. generator.append(R"~~~(
  689. static constexpr Array<@type@, @size@> @name@ { {)~~~");
  690. bool first = true;
  691. for (auto const& item : map) {
  692. generator.append(first ? " " : ", ");
  693. generator.append(String::number(item.value));
  694. first = false;
  695. }
  696. generator.append(" } };");
  697. };
  698. generate_mapping(generator, locale_data.locales, s_number_system_index_type, "s_locale_number_systems"sv, "s_number_systems_{}", [&](auto const& name, auto const& value) { append_map(name, s_number_system_index_type, value.number_systems); });
  699. generate_mapping(generator, locale_data.locales, s_unit_index_type, "s_locale_units"sv, "s_units_{}", [&](auto const& name, auto const& value) { append_map(name, s_unit_index_type, value.units); });
  700. generator.append(R"~~~(
  701. static NumberSystem const* find_number_system(StringView locale, StringView system)
  702. {
  703. auto locale_value = locale_from_string(locale);
  704. if (!locale_value.has_value())
  705. return nullptr;
  706. auto locale_index = to_underlying(*locale_value) - 1; // Subtract 1 because 0 == Locale::None.
  707. auto const& number_systems = s_locale_number_systems.at(locale_index);
  708. for (auto system_index : number_systems) {
  709. auto const& number_system = s_number_systems.at(system_index);
  710. if (system == s_string_list[number_system.system])
  711. return &number_system;
  712. };
  713. return nullptr;
  714. }
  715. Optional<StringView> get_number_system_symbol(StringView locale, StringView system, Unicode::NumericSymbol symbol)
  716. {
  717. if (auto const* number_system = find_number_system(locale, system); number_system != nullptr) {
  718. auto symbols = s_numeric_symbol_lists.at(number_system->symbols);
  719. auto symbol_index = to_underlying(symbol);
  720. if (symbol_index >= symbols.size())
  721. return {};
  722. return s_string_list[symbols[symbol_index]];
  723. }
  724. return {};
  725. }
  726. Optional<NumberGroupings> get_number_system_groupings(StringView locale, StringView system)
  727. {
  728. if (auto const* number_system = find_number_system(locale, system); number_system != nullptr)
  729. return NumberGroupings { number_system->primary_grouping_size, number_system->secondary_grouping_size };
  730. return {};
  731. }
  732. Optional<Unicode::NumberFormat> get_standard_number_system_format(StringView locale, StringView system, StandardNumberFormatType type)
  733. {
  734. if (auto const* number_system = find_number_system(locale, system); number_system != nullptr) {
  735. @number_format_index_type@ format_index = 0;
  736. switch (type) {
  737. case StandardNumberFormatType::Decimal:
  738. format_index = number_system->decimal_format;
  739. break;
  740. case StandardNumberFormatType::Currency:
  741. format_index = number_system->currency_format;
  742. break;
  743. case StandardNumberFormatType::Accounting:
  744. format_index = number_system->accounting_format;
  745. break;
  746. case StandardNumberFormatType::Percent:
  747. format_index = number_system->percent_format;
  748. break;
  749. case StandardNumberFormatType::Scientific:
  750. format_index = number_system->scientific_format;
  751. break;
  752. }
  753. return s_number_formats[format_index].to_unicode_number_format();
  754. }
  755. return {};
  756. }
  757. Vector<Unicode::NumberFormat> get_compact_number_system_formats(StringView locale, StringView system, CompactNumberFormatType type)
  758. {
  759. Vector<Unicode::NumberFormat> formats;
  760. if (auto const* number_system = find_number_system(locale, system); number_system != nullptr) {
  761. @number_format_list_index_type@ number_format_list_index { 0 };
  762. switch (type) {
  763. case CompactNumberFormatType::DecimalLong:
  764. number_format_list_index = number_system->decimal_long_formats;
  765. break;
  766. case CompactNumberFormatType::DecimalShort:
  767. number_format_list_index = number_system->decimal_short_formats;
  768. break;
  769. case CompactNumberFormatType::CurrencyUnit:
  770. number_format_list_index = number_system->currency_unit_formats;
  771. break;
  772. case CompactNumberFormatType::CurrencyShort:
  773. number_format_list_index = number_system->currency_short_formats;
  774. break;
  775. }
  776. auto number_formats = s_number_format_lists.at(number_format_list_index);
  777. formats.ensure_capacity(number_formats.size());
  778. for (auto number_format : number_formats)
  779. formats.append(s_number_formats[number_format].to_unicode_number_format());
  780. }
  781. return formats;
  782. }
  783. static Unit const* find_units(StringView locale, StringView unit)
  784. {
  785. auto locale_value = locale_from_string(locale);
  786. if (!locale_value.has_value())
  787. return nullptr;
  788. auto locale_index = to_underlying(*locale_value) - 1; // Subtract 1 because 0 == Locale::None.
  789. auto const& locale_units = s_locale_units.at(locale_index);
  790. for (auto unit_index : locale_units) {
  791. auto const& units = s_units.at(unit_index);
  792. if (unit == s_string_list[units.unit])
  793. return &units;
  794. };
  795. return nullptr;
  796. }
  797. Vector<Unicode::NumberFormat> get_unit_formats(StringView locale, StringView unit, Style style)
  798. {
  799. Vector<Unicode::NumberFormat> formats;
  800. if (auto const* units = find_units(locale, unit); units != nullptr) {
  801. @number_format_list_index_type@ number_format_list_index { 0 };
  802. switch (style) {
  803. case Style::Long:
  804. number_format_list_index = units->long_formats;
  805. break;
  806. case Style::Short:
  807. number_format_list_index = units->short_formats;
  808. break;
  809. case Style::Narrow:
  810. number_format_list_index = units->narrow_formats;
  811. break;
  812. default:
  813. VERIFY_NOT_REACHED();
  814. }
  815. auto number_formats = s_number_format_lists.at(number_format_list_index);
  816. formats.ensure_capacity(number_formats.size());
  817. for (auto number_format : number_formats)
  818. formats.append(s_number_formats[number_format].to_unicode_number_format());
  819. }
  820. return formats;
  821. }
  822. }
  823. )~~~");
  824. VERIFY(file.write(generator.as_string_view()));
  825. }
  826. ErrorOr<int> serenity_main(Main::Arguments arguments)
  827. {
  828. StringView generated_header_path = nullptr;
  829. StringView generated_implementation_path = nullptr;
  830. StringView numbers_path = nullptr;
  831. StringView units_path = nullptr;
  832. Core::ArgsParser args_parser;
  833. args_parser.add_option(generated_header_path, "Path to the Unicode locale header file to generate", "generated-header-path", 'h', "generated-header-path");
  834. args_parser.add_option(generated_implementation_path, "Path to the Unicode locale implementation file to generate", "generated-implementation-path", 'c', "generated-implementation-path");
  835. args_parser.add_option(numbers_path, "Path to cldr-numbers directory", "numbers-path", 'n', "numbers-path");
  836. args_parser.add_option(units_path, "Path to cldr-units directory", "units-path", 'u', "units-path");
  837. args_parser.parse(arguments);
  838. auto open_file = [&](StringView path) -> ErrorOr<NonnullRefPtr<Core::File>> {
  839. if (path.is_empty()) {
  840. args_parser.print_usage(stderr, arguments.argv[0]);
  841. return Error::from_string_literal("Must provide all command line options"sv);
  842. }
  843. return Core::File::open(path, Core::OpenMode::ReadWrite);
  844. };
  845. auto generated_header_file = TRY(open_file(generated_header_path));
  846. auto generated_implementation_file = TRY(open_file(generated_implementation_path));
  847. UnicodeLocaleData locale_data;
  848. TRY(parse_all_locales(numbers_path, units_path, locale_data));
  849. generate_unicode_locale_header(generated_header_file, locale_data);
  850. generate_unicode_locale_implementation(generated_implementation_file, locale_data);
  851. return 0;
  852. }