GenerateEncodingIndexes.cpp 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312
  1. /*
  2. * Copyright (c) 2024, Simon Wanner <simon@skyrising.xyz>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <AK/Array.h>
  7. #include <AK/JsonObject.h>
  8. #include <AK/NumericLimits.h>
  9. #include <AK/SourceGenerator.h>
  10. #include <AK/StringBuilder.h>
  11. #include <AK/StringView.h>
  12. #include <AK/Vector.h>
  13. #include <LibCore/ArgsParser.h>
  14. #include <LibCore/File.h>
  15. #include <LibMain/Main.h>
  16. namespace {
// One encoding index flattened into a list of code points, plus the flags
// that control which accessor functions get generated for it.
struct LookupTable {
    // Number of leading null entries that were skipped before the first stored value.
    u32 first_pointer;
    // Largest value appended while building code_points; decides whether the
    // generated array uses u16 or u32 elements.
    u32 max_code_point;
    // Code points in pointer order. Null entries that follow the first stored
    // value are represented as 0xfffd.
    Vector<u32> code_points;
    // Emit index_<name>_code_point() for this table.
    bool generate_accessor;
    // Emit code_point_<name>_index() for this table.
    bool generate_inverse_accessor;
};
// Everything the generators need to emit the header and implementation files.
struct LookupTables {
    // The "gb18030-ranges" array from the input JSON. Held by reference, so the
    // parsed JSON document must outlive this struct.
    JsonArray const& gb18030_ranges;
    // Index tables keyed by the identifier used in the generated symbol names.
    OrderedHashMap<StringView, LookupTable> indexes;
};
// Selects whether prepare_table() marks a table for pointer -> code point
// accessor generation (index_<name>_code_point).
enum class GenerateAccessor {
    No,
    Yes,
};
// Selects whether prepare_table() marks a table for code point -> pointer
// accessor generation (code_point_<name>_index).
enum class GenerateInverseAccessor {
    No,
    Yes,
};
  36. LookupTable prepare_table(JsonArray const& data, GenerateAccessor generate_accessor = GenerateAccessor::No, GenerateInverseAccessor generate_inverse_accessor = GenerateInverseAccessor::No)
  37. {
  38. Vector<u32> code_points;
  39. code_points.ensure_capacity(data.size());
  40. u32 max = 0;
  41. u32 first_pointer = 0;
  42. for (auto const& entry : data.values()) {
  43. if (entry.is_null()) {
  44. if (code_points.is_empty()) {
  45. first_pointer++;
  46. } else {
  47. code_points.append(0xfffd);
  48. max = AK::max(max, code_points.last());
  49. }
  50. } else {
  51. code_points.append(entry.as_integer<u32>());
  52. max = AK::max(max, code_points.last());
  53. }
  54. }
  55. if (generate_accessor == GenerateAccessor::Yes) {
  56. while (code_points.last() == 0xfffd)
  57. code_points.take_last();
  58. } else {
  59. VERIFY(first_pointer == 0);
  60. }
  61. return { first_pointer, max, move(code_points), generate_accessor == GenerateAccessor::Yes, generate_inverse_accessor == GenerateInverseAccessor::Yes };
  62. }
  63. void generate_table(SourceGenerator generator, StringView name, LookupTable& table)
  64. {
  65. generator.set("name", name);
  66. generator.set("value_type", table.max_code_point > NumericLimits<u16>::max() ? "u32" : "u16");
  67. generator.set("first_pointer", String::number(table.first_pointer));
  68. generator.set("size", String::number(table.code_points.size()));
  69. if (table.first_pointer > 0) {
  70. generator.appendln("static constexpr u32 s_@name@_index_first_pointer = @first_pointer@;");
  71. }
  72. generator.append("static constexpr Array<@value_type@, @size@> s_@name@_index {\n ");
  73. for (size_t i = 0; i < table.code_points.size(); i++) {
  74. generator.append(MUST(String::formatted("{:#04x}", table.code_points[i])));
  75. if (i != table.code_points.size() - 1)
  76. generator.append(i % 16 == 15 ? ",\n "sv : ", "sv);
  77. }
  78. generator.appendln("\n};");
  79. if (table.generate_accessor)
  80. generator.appendln("Optional<u32> index_@name@_code_point(u32 pointer);");
  81. if (table.generate_inverse_accessor)
  82. generator.appendln("Optional<u32> code_point_@name@_index(u32 code_point);");
  83. }
  84. ErrorOr<void> generate_header_file(LookupTables& tables, Core::File& file)
  85. {
  86. StringBuilder builder;
  87. SourceGenerator generator { builder };
  88. generator.set("gb18030_ranges_size", String::number(tables.gb18030_ranges.size()));
  89. generator.append(R"~~~(
  90. #pragma once
  91. #include <AK/Array.h>
  92. #include <AK/Types.h>
  93. namespace TextCodec {
  94. struct Gb18030RangeEntry {
  95. u32 pointer;
  96. u32 code_point;
  97. };
  98. static constexpr Array<Gb18030RangeEntry, @gb18030_ranges_size@> s_gb18030_ranges { {
  99. )~~~");
  100. for (auto const& range : tables.gb18030_ranges.values()) {
  101. generator.appendln(MUST(String::formatted(" {{ {}, {:#04x} }},", range.as_array()[0].as_integer<u32>(), range.as_array()[1].as_integer<u32>())));
  102. }
  103. generator.appendln("} };\n");
  104. for (auto e : tables.indexes) {
  105. generate_table(generator.fork(), e.key, e.value);
  106. }
  107. generator.append("\n");
  108. generator.appendln("}");
  109. TRY(file.write_until_depleted(generator.as_string_view().bytes()));
  110. return {};
  111. }
  112. void generate_table_accessor(SourceGenerator generator, StringView name, LookupTable& table)
  113. {
  114. generator.set("name", name);
  115. generator.set("first_pointer", String::number(table.first_pointer));
  116. generator.set("size", String::number(table.code_points.size()));
  117. if (table.first_pointer > 0) {
  118. generator.append(R"~~~(
  119. Optional<u32> index_@name@_code_point(u32 pointer)
  120. {
  121. if (pointer < s_@name@_index_first_pointer || pointer - s_@name@_index_first_pointer >= s_@name@_index.size())
  122. return {};
  123. auto value = s_@name@_index[pointer - s_@name@_index_first_pointer];
  124. if (value == 0xfffd)
  125. return {};
  126. return value;
  127. }
  128. )~~~");
  129. } else {
  130. generator.append(R"~~~(
  131. Optional<u32> index_@name@_code_point(u32 pointer)
  132. {
  133. if (pointer >= s_@name@_index.size())
  134. return {};
  135. auto value = s_@name@_index[pointer];
  136. if (value == 0xfffd)
  137. return {};
  138. return value;
  139. }
  140. )~~~");
  141. }
  142. }
  143. void generate_inverse_table_accessor(SourceGenerator generator, StringView name, LookupTable& table)
  144. {
  145. generator.set("name", name);
  146. generator.set("first_pointer", String::number(table.first_pointer));
  147. generator.set("size", String::number(table.code_points.size()));
  148. // FIXME - Doing a linear search here is really slow, should be generating
  149. // some kind of reverse lookup table.
  150. if (table.first_pointer > 0) {
  151. generator.append(R"~~~(
  152. Optional<u32> code_point_@name@_index(u32 code_point)
  153. {
  154. for (u32 i = 0; i < s_@name@_index.size(); ++i) {
  155. if (s_@name@_index[i] == code_point) {
  156. return s_@name@_index_first_pointer + i;
  157. }
  158. }
  159. return {};
  160. }
  161. )~~~");
  162. } else {
  163. generator.append(R"~~~(
  164. Optional<u32> code_point_@name@_index(u32 code_point)
  165. {
  166. for (u32 i = 0; i < s_@name@_index.size(); ++i) {
  167. if (s_@name@_index[i] == code_point) {
  168. return i;
  169. }
  170. }
  171. return {};
  172. }
  173. )~~~");
  174. }
  175. }
  176. ErrorOr<void> generate_implementation_file(LookupTables& tables, Core::File& file)
  177. {
  178. StringBuilder builder;
  179. SourceGenerator generator { builder };
  180. generator.append(R"~~~(
  181. #include <LibTextCodec/LookupTables.h>
  182. namespace TextCodec {
  183. )~~~");
  184. for (auto& [key, table] : tables.indexes) {
  185. if (table.generate_accessor)
  186. generate_table_accessor(generator.fork(), key, table);
  187. if (table.generate_inverse_accessor)
  188. generate_inverse_table_accessor(generator.fork(), key, table);
  189. }
  190. generator.appendln("\n}");
  191. TRY(file.write_until_depleted(generator.as_string_view().bytes()));
  192. return {};
  193. }
  194. } // end anonymous namespace
  195. ErrorOr<int> serenity_main(Main::Arguments arguments)
  196. {
  197. StringView generated_header_path;
  198. StringView generated_implementation_path;
  199. StringView json_path;
  200. Core::ArgsParser args_parser;
  201. args_parser.add_option(generated_header_path, "Path to the lookup table header file to generate", "generated-header-path", 'h', "generated-header-path");
  202. args_parser.add_option(generated_implementation_path, "Path to the lookup table implementation file to generate", "generated-implementation-path", 'c', "generated-implementation-path");
  203. args_parser.add_option(json_path, "Path to the JSON file to read from", "json-path", 'j', "json-path");
  204. args_parser.parse(arguments);
  205. auto json_file = TRY(Core::File::open(json_path, Core::File::OpenMode::Read));
  206. auto json_data = TRY(json_file->read_until_eof());
  207. auto data = TRY(JsonValue::from_string(json_data)).as_object();
  208. auto gb18030_table = prepare_table(data.get("gb18030"sv)->as_array(), GenerateAccessor::Yes, GenerateInverseAccessor::Yes);
  209. // FIXME: Encoding specification is not updated to GB-18030-2022 yet (https://github.com/whatwg/encoding/issues/312)
  210. // NOTE: See https://commits.webkit.org/264918@main
  211. gb18030_table.code_points[7182] = 0xfe10;
  212. gb18030_table.code_points[7183] = 0xfe12;
  213. gb18030_table.code_points[7184] = 0xfe11;
  214. gb18030_table.code_points[7185] = 0xfe13;
  215. gb18030_table.code_points[7186] = 0xfe14;
  216. gb18030_table.code_points[7187] = 0xfe15;
  217. gb18030_table.code_points[7188] = 0xfe16;
  218. gb18030_table.code_points[7201] = 0xfe17;
  219. gb18030_table.code_points[7202] = 0xfe18;
  220. gb18030_table.code_points[7208] = 0xfe19;
  221. gb18030_table.code_points[23775] = 0x9fb4;
  222. gb18030_table.code_points[23783] = 0x9fb5;
  223. gb18030_table.code_points[23788] = 0x9fb6;
  224. gb18030_table.code_points[23789] = 0x9fb7;
  225. gb18030_table.code_points[23795] = 0x9fb8;
  226. gb18030_table.code_points[23812] = 0x9fb9;
  227. gb18030_table.code_points[23829] = 0x9fba;
  228. gb18030_table.code_points[23845] = 0x9fbb;
  229. LookupTables tables {
  230. .gb18030_ranges = data.get("gb18030-ranges"sv)->as_array(),
  231. .indexes = {
  232. { "gb18030"sv, move(gb18030_table) },
  233. { "big5"sv, prepare_table(data.get("big5"sv)->as_array(), GenerateAccessor::Yes) },
  234. { "jis0208"sv, prepare_table(data.get("jis0208"sv)->as_array(), GenerateAccessor::Yes, GenerateInverseAccessor::Yes) },
  235. { "jis0212"sv, prepare_table(data.get("jis0212"sv)->as_array(), GenerateAccessor::Yes) },
  236. { "euc_kr"sv, prepare_table(data.get("euc-kr"sv)->as_array(), GenerateAccessor::Yes, GenerateInverseAccessor::Yes) },
  237. { "ibm866"sv, prepare_table(data.get("ibm866"sv)->as_array()) },
  238. { "iso_2022_jp_katakana"sv, prepare_table(data.get("iso-2022-jp-katakana"sv)->as_array(), GenerateAccessor::Yes) },
  239. { "iso_8859_2"sv, prepare_table(data.get("iso-8859-2"sv)->as_array()) },
  240. { "iso_8859_3"sv, prepare_table(data.get("iso-8859-3"sv)->as_array()) },
  241. { "iso_8859_4"sv, prepare_table(data.get("iso-8859-4"sv)->as_array()) },
  242. { "iso_8859_5"sv, prepare_table(data.get("iso-8859-5"sv)->as_array()) },
  243. { "iso_8859_6"sv, prepare_table(data.get("iso-8859-6"sv)->as_array()) },
  244. { "iso_8859_7"sv, prepare_table(data.get("iso-8859-7"sv)->as_array()) },
  245. { "iso_8859_8"sv, prepare_table(data.get("iso-8859-8"sv)->as_array()) },
  246. { "iso_8859_10"sv, prepare_table(data.get("iso-8859-10"sv)->as_array()) },
  247. { "iso_8859_13"sv, prepare_table(data.get("iso-8859-13"sv)->as_array()) },
  248. { "iso_8859_14"sv, prepare_table(data.get("iso-8859-14"sv)->as_array()) },
  249. { "iso_8859_15"sv, prepare_table(data.get("iso-8859-15"sv)->as_array()) },
  250. { "iso_8859_16"sv, prepare_table(data.get("iso-8859-16"sv)->as_array()) },
  251. { "koi8_r"sv, prepare_table(data.get("koi8-r"sv)->as_array()) },
  252. { "koi8_u"sv, prepare_table(data.get("koi8-u"sv)->as_array()) },
  253. { "macintosh"sv, prepare_table(data.get("macintosh"sv)->as_array()) },
  254. { "windows_874"sv, prepare_table(data.get("windows-874"sv)->as_array()) },
  255. { "windows_1250"sv, prepare_table(data.get("windows-1250"sv)->as_array()) },
  256. { "windows_1251"sv, prepare_table(data.get("windows-1251"sv)->as_array()) },
  257. { "windows_1252"sv, prepare_table(data.get("windows-1252"sv)->as_array()) },
  258. { "windows_1253"sv, prepare_table(data.get("windows-1253"sv)->as_array()) },
  259. { "windows_1254"sv, prepare_table(data.get("windows-1254"sv)->as_array()) },
  260. { "windows_1255"sv, prepare_table(data.get("windows-1255"sv)->as_array()) },
  261. { "windows_1256"sv, prepare_table(data.get("windows-1256"sv)->as_array()) },
  262. { "windows_1257"sv, prepare_table(data.get("windows-1257"sv)->as_array()) },
  263. { "windows_1258"sv, prepare_table(data.get("windows-1258"sv)->as_array()) },
  264. { "x_mac_cyrillic"sv, prepare_table(data.get("x-mac-cyrillic"sv)->as_array()) },
  265. },
  266. };
  267. auto generated_header_file = TRY(Core::File::open(generated_header_path, Core::File::OpenMode::Write));
  268. auto generated_implementation_file = TRY(Core::File::open(generated_implementation_path, Core::File::OpenMode::Write));
  269. TRY(generate_header_file(tables, *generated_header_file));
  270. TRY(generate_implementation_file(tables, *generated_implementation_file));
  271. return 0;
  272. }