GenerateEmojiData.cpp 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487
  1. /*
  2. * Copyright (c) 2022, Tim Flynn <trflynn89@serenityos.org>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include "GeneratorUtil.h"
  7. #include <AK/AnyOf.h>
  8. #include <AK/ByteString.h>
  9. #include <AK/QuickSort.h>
  10. #include <AK/SourceGenerator.h>
  11. #include <AK/StringUtils.h>
  12. #include <AK/Types.h>
  13. #include <LibCore/ArgsParser.h>
  14. #include <LibCore/Directory.h>
  15. #include <LibFileSystem/FileSystem.h>
  16. #include <LibUnicode/Emoji.h>
  17. struct Emoji {
  18. size_t name { 0 };
  19. Optional<size_t> image_path;
  20. Unicode::EmojiGroup group;
  21. ByteString subgroup;
  22. u32 display_order { 0 };
  23. Vector<u32> code_points;
  24. ByteString encoded_code_points;
  25. ByteString status;
  26. size_t code_point_array_index { 0 };
  27. };
  28. struct EmojiData {
  29. UniqueStringStorage unique_strings;
  30. Vector<Emoji> emojis;
  31. Vector<String> emoji_file_list;
  32. };
  33. static void set_image_path_for_emoji(StringView emoji_resource_path, EmojiData& emoji_data, Emoji& emoji)
  34. {
  35. StringBuilder builder;
  36. for (auto code_point : emoji.code_points) {
  37. if (code_point == 0xfe0f)
  38. continue;
  39. if (!builder.is_empty())
  40. builder.append('_');
  41. builder.appendff("U+{:X}", code_point);
  42. }
  43. auto file = ByteString::formatted("{}.png", builder.to_byte_string());
  44. auto path = ByteString::formatted("{}/{}", emoji_resource_path, file);
  45. if (!FileSystem::exists(path))
  46. return;
  47. emoji.image_path = emoji_data.unique_strings.ensure(move(file));
  48. }
  49. static ErrorOr<void> parse_emoji_test_data(Core::InputBufferedFile& file, EmojiData& emoji_data)
  50. {
  51. static constexpr auto group_header = "# group: "sv;
  52. static constexpr auto subgroup_header = "# subgroup: "sv;
  53. Array<u8, 1024> buffer;
  54. Unicode::EmojiGroup group;
  55. ByteString subgroup;
  56. u32 display_order { 0 };
  57. while (TRY(file.can_read_line())) {
  58. auto line = TRY(file.read_line(buffer));
  59. if (line.is_empty())
  60. continue;
  61. if (line.starts_with('#')) {
  62. if (line.starts_with(group_header)) {
  63. auto name = line.substring_view(group_header.length());
  64. group = Unicode::emoji_group_from_string(name);
  65. } else if (line.starts_with(subgroup_header)) {
  66. subgroup = line.substring_view(subgroup_header.length());
  67. }
  68. continue;
  69. }
  70. auto status_index = line.find(';');
  71. VERIFY(status_index.has_value());
  72. auto emoji_and_name_index = line.find('#', *status_index);
  73. VERIFY(emoji_and_name_index.has_value());
  74. Emoji emoji {};
  75. emoji.group = group;
  76. emoji.subgroup = subgroup;
  77. emoji.display_order = display_order++;
  78. auto code_points = line.substring_view(0, *status_index).split_view(' ');
  79. TRY(emoji.code_points.try_ensure_capacity(code_points.size()));
  80. for (auto code_point : code_points) {
  81. auto value = AK::StringUtils::convert_to_uint_from_hex<u32>(code_point);
  82. VERIFY(value.has_value());
  83. emoji.code_points.unchecked_append(*value);
  84. }
  85. auto emoji_and_name = line.substring_view(*emoji_and_name_index + 1);
  86. auto emoji_and_name_spaces = emoji_and_name.find_all(" "sv);
  87. VERIFY(emoji_and_name_spaces.size() > 2);
  88. auto name = emoji_and_name.substring_view(emoji_and_name_spaces[2]).trim_whitespace();
  89. emoji.name = emoji_data.unique_strings.ensure(name.to_titlecase_string());
  90. emoji.encoded_code_points = emoji_and_name.substring_view(0, emoji_and_name_spaces[1]).trim_whitespace();
  91. emoji.status = line.substring_view(*status_index + 1, *emoji_and_name_index - *status_index - 1).trim_whitespace();
  92. TRY(emoji_data.emojis.try_append(move(emoji)));
  93. }
  94. return {};
  95. }
  96. static ErrorOr<void> parse_emoji_serenity_data(Core::InputBufferedFile& file, EmojiData& emoji_data)
  97. {
  98. static constexpr auto code_point_header = "U+"sv;
  99. Array<u8, 1024> buffer;
  100. auto display_order = static_cast<u32>(emoji_data.emojis.size()) + 1u;
  101. while (TRY(file.can_read_line())) {
  102. auto line = TRY(file.read_line(buffer));
  103. if (line.is_empty())
  104. continue;
  105. auto index = line.find(code_point_header);
  106. if (!index.has_value())
  107. continue;
  108. line = line.substring_view(*index);
  109. StringBuilder builder;
  110. Emoji emoji {};
  111. emoji.group = Unicode::EmojiGroup::SerenityOS;
  112. emoji.display_order = display_order++;
  113. TRY(line.for_each_split_view(' ', SplitBehavior::Nothing, [&](auto segment) -> ErrorOr<void> {
  114. if (segment.starts_with(code_point_header)) {
  115. segment = segment.substring_view(code_point_header.length());
  116. auto code_point = AK::StringUtils::convert_to_uint_from_hex<u32>(segment);
  117. VERIFY(code_point.has_value());
  118. TRY(emoji.code_points.try_append(*code_point));
  119. } else {
  120. if (!builder.is_empty())
  121. TRY(builder.try_append(' '));
  122. TRY(builder.try_append(segment));
  123. }
  124. return {};
  125. }));
  126. auto name = builder.to_byte_string();
  127. if (!any_of(name, is_ascii_lower_alpha))
  128. name = name.to_titlecase();
  129. emoji.name = emoji_data.unique_strings.ensure(move(name));
  130. TRY(emoji_data.emojis.try_append(move(emoji)));
  131. }
  132. return {};
  133. }
  134. static ErrorOr<void> parse_emoji_file_list(Core::InputBufferedFile& file, EmojiData& emoji_data)
  135. {
  136. HashTable<String> seen_emojis;
  137. Array<u8, 1024> buffer;
  138. while (TRY(file.can_read_line())) {
  139. auto line = TRY(file.read_line(buffer));
  140. if (line.is_empty())
  141. continue;
  142. if (seen_emojis.contains(line)) {
  143. warnln("\x1b[1;31mError!\x1b[0m Duplicate emoji \x1b[35m{}\x1b[0m listed in emoji-file-list.txt.", line);
  144. return Error::from_errno(EEXIST);
  145. }
  146. emoji_data.emoji_file_list.append(TRY(String::from_utf8(line)));
  147. seen_emojis.set(emoji_data.emoji_file_list.last());
  148. }
  149. return {};
  150. }
  151. static ErrorOr<void> validate_emoji(StringView emoji_resource_path, EmojiData& emoji_data)
  152. {
  153. TRY(Core::Directory::for_each_entry(emoji_resource_path, Core::DirIterator::SkipDots, [&](auto& entry, auto&) -> ErrorOr<IterationDecision> {
  154. auto lexical_path = LexicalPath(entry.name);
  155. if (lexical_path.extension() != "png")
  156. return IterationDecision::Continue;
  157. auto title = lexical_path.title();
  158. if (!title.starts_with("U+"sv))
  159. return IterationDecision::Continue;
  160. Vector<u32> code_points;
  161. TRY(title.for_each_split_view('_', SplitBehavior::Nothing, [&](auto segment) -> ErrorOr<void> {
  162. auto code_point = AK::StringUtils::convert_to_uint_from_hex<u32>(segment.substring_view(2));
  163. VERIFY(code_point.has_value());
  164. TRY(code_points.try_append(*code_point));
  165. return {};
  166. }));
  167. auto it = emoji_data.emojis.find_if([&](auto const& emoji) {
  168. return emoji.code_points == code_points;
  169. });
  170. if (it == emoji_data.emojis.end()) {
  171. warnln("\x1b[1;31mError!\x1b[0m Emoji data for \x1b[35m{}\x1b[0m not found. Please check emoji-test.txt and emoji-serenity.txt.", entry.name);
  172. return Error::from_errno(ENOENT);
  173. }
  174. if (!emoji_data.emoji_file_list.contains_slow(lexical_path.string().view())) {
  175. warnln("\x1b[1;31mError!\x1b[0m Emoji entry for \x1b[35m{}\x1b[0m not found. Please check emoji-file-list.txt.", lexical_path);
  176. return Error::from_errno(ENOENT);
  177. }
  178. return IterationDecision::Continue;
  179. }));
  180. return {};
  181. }
  182. static ErrorOr<void> generate_emoji_data_header(Core::InputBufferedFile& file, EmojiData const&)
  183. {
  184. StringBuilder builder;
  185. SourceGenerator generator { builder };
  186. TRY(file.write_until_depleted(generator.as_string_view().bytes()));
  187. return {};
  188. }
  189. static ErrorOr<void> generate_emoji_data_implementation(Core::InputBufferedFile& file, EmojiData const& emoji_data)
  190. {
  191. StringBuilder builder;
  192. SourceGenerator generator { builder };
  193. generator.set("string_index_type"sv, emoji_data.unique_strings.type_that_fits());
  194. generator.set("emojis_size"sv, ByteString::number(emoji_data.emojis.size()));
  195. generator.append(R"~~~(
  196. #include <AK/Array.h>
  197. #include <AK/BinarySearch.h>
  198. #include <AK/Span.h>
  199. #include <AK/StringView.h>
  200. #include <AK/Types.h>
  201. #include <LibUnicode/Emoji.h>
  202. #include <LibUnicode/EmojiData.h>
  203. namespace Unicode {
  204. )~~~");
  205. emoji_data.unique_strings.generate(generator);
  206. size_t total_code_point_count { 0 };
  207. for (auto const& emoji : emoji_data.emojis) {
  208. total_code_point_count += emoji.code_points.size();
  209. }
  210. generator.set("total_code_point_count", ByteString::number(total_code_point_count));
  211. generator.append(R"~~~(
  212. static constexpr Array<u32, @total_code_point_count@> s_emoji_code_points { {)~~~");
  213. bool first = true;
  214. for (auto const& emoji : emoji_data.emojis) {
  215. for (auto code_point : emoji.code_points) {
  216. generator.append(first ? " "sv : ", "sv);
  217. generator.append(ByteString::formatted("{:#x}", code_point));
  218. first = false;
  219. }
  220. }
  221. generator.append(" } };"sv);
  222. generator.append(R"~~~(
  223. struct EmojiData {
  224. Emoji to_unicode_emoji() const
  225. {
  226. Emoji emoji {};
  227. emoji.name = decode_string(name);
  228. if (image_path != 0)
  229. emoji.image_path = decode_string(image_path);
  230. emoji.group = static_cast<EmojiGroup>(group);
  231. emoji.display_order = display_order;
  232. emoji.code_points = code_points();
  233. return emoji;
  234. }
  235. constexpr ReadonlySpan<u32> code_points() const
  236. {
  237. return ReadonlySpan<u32>(s_emoji_code_points.data() + code_point_start, code_point_count);
  238. }
  239. @string_index_type@ name { 0 };
  240. @string_index_type@ image_path { 0 };
  241. u8 group { 0 };
  242. u32 display_order { 0 };
  243. size_t code_point_start { 0 };
  244. size_t code_point_count { 0 };
  245. };
  246. )~~~");
  247. generator.append(R"~~~(
  248. static constexpr Array<EmojiData, @emojis_size@> s_emojis { {)~~~");
  249. for (auto const& emoji : emoji_data.emojis) {
  250. generator.set("name"sv, ByteString::number(emoji.name));
  251. generator.set("image_path"sv, ByteString::number(emoji.image_path.value_or(0)));
  252. generator.set("group"sv, ByteString::number(to_underlying(emoji.group)));
  253. generator.set("display_order"sv, ByteString::number(emoji.display_order));
  254. generator.set("code_point_start"sv, ByteString::number(emoji.code_point_array_index));
  255. generator.set("code_point_count"sv, ByteString::number(emoji.code_points.size()));
  256. generator.append(R"~~~(
  257. { @name@, @image_path@, @group@, @display_order@, @code_point_start@, @code_point_count@ },)~~~");
  258. }
  259. generator.append(R"~~~(
  260. } };
  261. struct EmojiCodePointComparator {
  262. constexpr int operator()(ReadonlySpan<u32> code_points, EmojiData const& emoji)
  263. {
  264. auto emoji_code_points = emoji.code_points();
  265. if (code_points.size() != emoji_code_points.size())
  266. return static_cast<int>(code_points.size()) - static_cast<int>(emoji_code_points.size());
  267. for (size_t i = 0; i < code_points.size(); ++i) {
  268. if (code_points[i] != emoji_code_points[i])
  269. return static_cast<int>(code_points[i]) - static_cast<int>(emoji_code_points[i]);
  270. }
  271. return 0;
  272. }
  273. };
  274. Optional<Emoji> find_emoji_for_code_points(ReadonlySpan<u32> code_points)
  275. {
  276. if (auto const* emoji = binary_search(s_emojis, code_points, nullptr, EmojiCodePointComparator {}))
  277. return emoji->to_unicode_emoji();
  278. return {};
  279. }
  280. }
  281. )~~~");
  282. TRY(file.write_until_depleted(generator.as_string_view().bytes()));
  283. return {};
  284. }
  285. static ErrorOr<void> generate_emoji_installation(Core::InputBufferedFile& file, EmojiData const& emoji_data)
  286. {
  287. StringBuilder builder;
  288. SourceGenerator generator { builder };
  289. auto current_group = Unicode::EmojiGroup::Unknown;
  290. StringView current_subgroup;
  291. for (auto const& emoji : emoji_data.emojis) {
  292. if (!emoji.image_path.has_value())
  293. continue;
  294. if (emoji.group == Unicode::EmojiGroup::SerenityOS)
  295. continue; // SerenityOS emojis are in emoji-serenity.txt
  296. if (current_group != emoji.group) {
  297. if (!builder.is_empty())
  298. generator.append("\n"sv);
  299. generator.set("group"sv, Unicode::emoji_group_to_string(emoji.group));
  300. generator.append("# group: @group@\n");
  301. current_group = emoji.group;
  302. }
  303. if (current_subgroup != emoji.subgroup) {
  304. generator.set("subgroup"sv, emoji.subgroup);
  305. generator.append("\n# subgroup: @subgroup@\n");
  306. current_subgroup = emoji.subgroup;
  307. }
  308. generator.set("emoji"sv, emoji.encoded_code_points);
  309. generator.set("name"sv, emoji_data.unique_strings.get(emoji.name));
  310. generator.set("status"sv, emoji.status);
  311. generator.append("@emoji@"sv);
  312. generator.append(" - "sv);
  313. generator.append(ByteString::join(" "sv, emoji.code_points, "U+{:X}"sv));
  314. generator.append(" @name@ (@status@)\n"sv);
  315. }
  316. TRY(file.write_until_depleted(generator.as_string_view().bytes()));
  317. return {};
  318. }
  319. ErrorOr<int> serenity_main(Main::Arguments arguments)
  320. {
  321. StringView generated_header_path;
  322. StringView generated_implementation_path;
  323. StringView generated_installation_path;
  324. StringView emoji_test_path;
  325. StringView emoji_serenity_path;
  326. StringView emoji_file_list_path;
  327. StringView emoji_resource_path;
  328. Core::ArgsParser args_parser;
  329. args_parser.add_option(generated_header_path, "Path to the Unicode Data header file to generate", "generated-header-path", 'h', "generated-header-path");
  330. args_parser.add_option(generated_implementation_path, "Path to the Unicode Data implementation file to generate", "generated-implementation-path", 'c', "generated-implementation-path");
  331. args_parser.add_option(generated_installation_path, "Path to the emoji.txt file to generate", "generated-installation-path", 'i', "generated-installation-path");
  332. args_parser.add_option(emoji_test_path, "Path to emoji-test.txt file", "emoji-test-path", 'e', "emoji-test-path");
  333. args_parser.add_option(emoji_serenity_path, "Path to emoji-serenity.txt file", "emoji-serenity-path", 's', "emoji-serenity-path");
  334. args_parser.add_option(emoji_file_list_path, "Path to the emoji-file-list.txt file", "emoji-file-list-path", 'f', "emoji-file-list-path");
  335. args_parser.add_option(emoji_resource_path, "Path to the /res/emoji directory", "emoji-resource-path", 'r', "emoji-resource-path");
  336. args_parser.parse(arguments);
  337. VERIFY(!emoji_resource_path.is_empty() && FileSystem::exists(emoji_resource_path));
  338. auto emoji_test_file = TRY(open_file(emoji_test_path, Core::File::OpenMode::Read));
  339. EmojiData emoji_data {};
  340. TRY(parse_emoji_test_data(*emoji_test_file, emoji_data));
  341. if (!emoji_serenity_path.is_empty() && !emoji_file_list_path.is_empty()) {
  342. auto emoji_serenity_file = TRY(open_file(emoji_serenity_path, Core::File::OpenMode::Read));
  343. TRY(parse_emoji_serenity_data(*emoji_serenity_file, emoji_data));
  344. auto emoji_file_list_file = TRY(open_file(emoji_file_list_path, Core::File::OpenMode::Read));
  345. TRY(parse_emoji_file_list(*emoji_file_list_file, emoji_data));
  346. TRY(validate_emoji(emoji_resource_path, emoji_data));
  347. }
  348. for (auto& emoji : emoji_data.emojis)
  349. set_image_path_for_emoji(emoji_resource_path, emoji_data, emoji);
  350. if (!generated_installation_path.is_empty()) {
  351. TRY(Core::Directory::create(LexicalPath { generated_installation_path }.parent(), Core::Directory::CreateDirectories::Yes));
  352. auto generated_installation_file = TRY(open_file(generated_installation_path, Core::File::OpenMode::Write));
  353. TRY(generate_emoji_installation(*generated_installation_file, emoji_data));
  354. }
  355. if (!generated_header_path.is_empty()) {
  356. auto generated_header_file = TRY(open_file(generated_header_path, Core::File::OpenMode::Write));
  357. TRY(generate_emoji_data_header(*generated_header_file, emoji_data));
  358. }
  359. if (!generated_implementation_path.is_empty()) {
  360. quick_sort(emoji_data.emojis, [](auto const& lhs, auto const& rhs) {
  361. if (lhs.code_points.size() != rhs.code_points.size())
  362. return lhs.code_points.size() < rhs.code_points.size();
  363. for (size_t i = 0; i < lhs.code_points.size(); ++i) {
  364. if (lhs.code_points[i] < rhs.code_points[i])
  365. return true;
  366. if (lhs.code_points[i] > rhs.code_points[i])
  367. return false;
  368. }
  369. return false;
  370. });
  371. size_t code_point_array_index { 0 };
  372. for (auto& emoji : emoji_data.emojis) {
  373. emoji.code_point_array_index = code_point_array_index;
  374. code_point_array_index += emoji.code_points.size();
  375. }
  376. auto generated_implementation_file = TRY(open_file(generated_implementation_path, Core::File::OpenMode::Write));
  377. TRY(generate_emoji_data_implementation(*generated_implementation_file, emoji_data));
  378. }
  379. return 0;
  380. }