Encoder.cpp 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327
  1. /*
  2. * Copyright (c) 2024, Ben Jilks <benjyjilks@gmail.com>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <AK/BinarySearch.h>
  7. #include <AK/Error.h>
  8. #include <AK/Utf8View.h>
  9. #include <LibTextCodec/Decoder.h>
  10. #include <LibTextCodec/Encoder.h>
  11. #include <LibTextCodec/LookupTables.h>
  12. namespace TextCodec {
  13. namespace {
  14. UTF8Encoder s_utf8_encoder;
  15. GB18030Encoder s_gb18030_encoder;
  16. GB18030Encoder s_gbk_encoder(GB18030Encoder::IsGBK::Yes);
  17. Big5Encoder s_big5_encoder;
  18. EUCJPEncoder s_euc_jp_encoder;
  19. EUCKREncoder s_euc_kr_encoder;
  20. }
  21. Optional<Encoder&> encoder_for_exact_name(StringView encoding)
  22. {
  23. if (encoding.equals_ignoring_ascii_case("utf-8"sv))
  24. return s_utf8_encoder;
  25. if (encoding.equals_ignoring_ascii_case("big5"sv))
  26. return s_big5_encoder;
  27. if (encoding.equals_ignoring_ascii_case("euc-jp"sv))
  28. return s_euc_jp_encoder;
  29. if (encoding.equals_ignoring_ascii_case("euc-kr"sv))
  30. return s_euc_kr_encoder;
  31. if (encoding.equals_ignoring_ascii_case("gb18030"sv))
  32. return s_gb18030_encoder;
  33. if (encoding.equals_ignoring_ascii_case("gbk"sv))
  34. return s_gbk_encoder;
  35. dbgln("TextCodec: No encoder implemented for encoding '{}'", encoding);
  36. return {};
  37. }
  38. Optional<Encoder&> encoder_for(StringView label)
  39. {
  40. auto encoding = get_standardized_encoding(label);
  41. return encoding.has_value() ? encoder_for_exact_name(encoding.value()) : Optional<Encoder&> {};
  42. }
  43. // https://encoding.spec.whatwg.org/#utf-8-encoder
  44. ErrorOr<void> UTF8Encoder::process(Utf8View input, Function<ErrorOr<void>(u8)> on_byte)
  45. {
  46. ReadonlyBytes bytes { input.bytes(), input.byte_length() };
  47. for (auto byte : bytes)
  48. TRY(on_byte(byte));
  49. return {};
  50. }
  51. // https://encoding.spec.whatwg.org/#euc-jp-encoder
  52. ErrorOr<void> EUCJPEncoder::process(Utf8View input, Function<ErrorOr<void>(u8)> on_byte)
  53. {
  54. for (auto item : input) {
  55. // 1. If code point is end-of-queue, return finished.
  56. // 2. If code point is an ASCII code point, return a byte whose value is code point.
  57. if (is_ascii(item)) {
  58. TRY(on_byte(static_cast<u8>(item)));
  59. continue;
  60. }
  61. // 3. If code point is U+00A5, return byte 0x5C.
  62. if (item == 0x00A5) {
  63. TRY(on_byte(static_cast<u8>(0x5C)));
  64. continue;
  65. }
  66. // 4. If code point is U+203E, return byte 0x7E.
  67. if (item == 0x203E) {
  68. TRY(on_byte(static_cast<u8>(0x7E)));
  69. continue;
  70. }
  71. // 5. If code point is in the range U+FF61 to U+FF9F, inclusive, return two bytes whose values are 0x8E and code point − 0xFF61 + 0xA1.
  72. if (item >= 0xFF61 && item <= 0xFF9F) {
  73. TRY(on_byte(0x8E));
  74. TRY(on_byte(static_cast<u8>(item - 0xFF61 + 0xA1)));
  75. continue;
  76. }
  77. // 6. If code point is U+2212, set it to U+FF0D.
  78. if (item == 0x2212)
  79. item = 0xFF0D;
  80. // 7. Let pointer be the index pointer for code point in index jis0208.
  81. auto pointer = code_point_jis0208_index(item);
  82. // 8. If pointer is null, return error with code point.
  83. if (!pointer.has_value()) {
  84. // TODO: Report error.
  85. continue;
  86. }
  87. // 9. Let lead be pointer / 94 + 0xA1.
  88. auto lead = *pointer / 94 + 0xA1;
  89. // 10. Let trail be pointer % 94 + 0xA1.
  90. auto trail = *pointer % 94 + 0xA1;
  91. // 11. Return two bytes whose values are lead and trail.
  92. TRY(on_byte(static_cast<u8>(lead)));
  93. TRY(on_byte(static_cast<u8>(trail)));
  94. }
  95. return {};
  96. }
  97. // https://encoding.spec.whatwg.org/#euc-kr-encoder
  98. ErrorOr<void> EUCKREncoder::process(Utf8View input, Function<ErrorOr<void>(u8)> on_byte)
  99. {
  100. for (u32 item : input) {
  101. // 1. If code point is end-of-queue, return finished.
  102. // 2. If code point is an ASCII code point, return a byte whose value is code point.
  103. if (is_ascii(item)) {
  104. TRY(on_byte(static_cast<u8>(item)));
  105. continue;
  106. }
  107. // 3. Let pointer be the index pointer for code point in index EUC-KR.
  108. auto pointer = code_point_euc_kr_index(item);
  109. // 4. If pointer is null, return error with code point.
  110. if (!pointer.has_value()) {
  111. // TODO: Report error.
  112. continue;
  113. }
  114. // 5. Let lead be pointer / 190 + 0x81.
  115. auto lead = *pointer / 190 + 0x81;
  116. // 6. Let trail be pointer % 190 + 0x41.
  117. auto trail = *pointer % 190 + 0x41;
  118. // 7. Return two bytes whose values are lead and trail.
  119. TRY(on_byte(static_cast<u8>(lead)));
  120. TRY(on_byte(static_cast<u8>(trail)));
  121. }
  122. return {};
  123. }
  124. // https://encoding.spec.whatwg.org/#index-big5-pointer
  125. static Optional<u32> index_big5_pointer(u32 code_point)
  126. {
  127. // 1. Let index be index Big5 excluding all entries whose pointer is less than (0xA1 - 0x81) × 157.
  128. auto start_index = (0xA1 - 0x81) * 157 - s_big5_index_first_pointer;
  129. // 2. If code point is U+2550, U+255E, U+2561, U+256A, U+5341, or U+5345, return the last pointer
  130. // corresponding to code point in index.
  131. if (Array<u32, 6> { 0x2550, 0x255E, 0x2561, 0x256A, 0x5341, 0x5345 }.contains_slow(code_point)) {
  132. for (u32 i = s_big5_index.size() - 1; i >= start_index; --i) {
  133. if (s_big5_index[i] == code_point) {
  134. return s_big5_index_first_pointer + i;
  135. }
  136. }
  137. return {};
  138. }
  139. // 3. Return the index pointer for code point in index.
  140. for (u32 i = start_index; i < s_big5_index.size(); ++i) {
  141. if (s_big5_index[i] == code_point) {
  142. return s_big5_index_first_pointer + i;
  143. }
  144. }
  145. return {};
  146. }
  147. // https://encoding.spec.whatwg.org/#big5-encoder
  148. ErrorOr<void> Big5Encoder::process(Utf8View input, Function<ErrorOr<void>(u8)> on_byte)
  149. {
  150. for (u32 item : input) {
  151. // 1. If code point is end-of-queue, return finished.
  152. // 2. If code point is an ASCII code point, return a byte whose value is code point.
  153. if (is_ascii(item)) {
  154. TRY(on_byte(static_cast<u8>(item)));
  155. continue;
  156. }
  157. // 3. Let pointer be the index Big5 pointer for code point.
  158. auto pointer = index_big5_pointer(item);
  159. // 4. If pointer is null, return error with code point.
  160. if (!pointer.has_value()) {
  161. // TODO: Report error.
  162. continue;
  163. }
  164. // 5. Let lead be pointer / 157 + 0x81.
  165. auto lead = *pointer / 157 + 0x81;
  166. // 6. Let trail be pointer % 157.
  167. auto trail = *pointer % 157;
  168. // 7. Let offset be 0x40 if trail is less than 0x3F, otherwise 0x62.
  169. auto offset = 0x62;
  170. if (trail < 0x3f)
  171. offset = 0x40;
  172. // 8. Return two bytes whose values are lead and trail + offset.
  173. TRY(on_byte(static_cast<u8>(lead)));
  174. TRY(on_byte(static_cast<u8>(trail + offset)));
  175. }
  176. return {};
  177. }
  178. // https://encoding.spec.whatwg.org/#index-gb18030-ranges-pointer
  179. static u32 index_gb18030_ranges_pointer(u32 code_point)
  180. {
  181. // 1. If code point is U+E7C7, return pointer 7457.
  182. if (code_point == 0xe7c7)
  183. return 7457;
  184. // 2. Let offset be the last code point in index gb18030 ranges that is less than
  185. // or equal to code point and let pointer offset be its corresponding pointer.
  186. size_t last_index;
  187. binary_search(s_gb18030_ranges, code_point, &last_index, [](auto const code_point, auto const& entry) {
  188. return code_point - entry.code_point;
  189. });
  190. auto offset = s_gb18030_ranges[last_index].code_point;
  191. auto pointer_offset = s_gb18030_ranges[last_index].pointer;
  192. // 3. Return a pointer whose value is pointer offset + code point − offset.
  193. return pointer_offset + code_point - offset;
  194. }
  195. GB18030Encoder::GB18030Encoder(IsGBK is_gbk)
  196. : m_is_gbk(is_gbk)
  197. {
  198. }
  199. // https://encoding.spec.whatwg.org/#gb18030-encoder
  200. ErrorOr<void> GB18030Encoder::process(Utf8View input, Function<ErrorOr<void>(u8)> on_byte)
  201. {
  202. bool gbk = (m_is_gbk == IsGBK::Yes);
  203. for (u32 item : input) {
  204. // 1. If code point is end-of-queue, return finished.
  205. // 2. If code point is an ASCII code point, return a byte whose value is code point.
  206. if (is_ascii(item)) {
  207. TRY(on_byte(static_cast<u8>(item)));
  208. continue;
  209. }
  210. // 3. If code point is U+E5E5, return error with code point.
  211. if (item == 0xE5E5) {
  212. // TODO: Report error.
  213. continue;
  214. }
  215. // 4. If is GBK is true and code point is U+20AC, return byte 0x80.
  216. if (gbk && item == 0x20AC) {
  217. TRY(on_byte(0x80));
  218. continue;
  219. }
  220. // 5. Let pointer be the index pointer for code point in index gb18030.
  221. auto pointer = code_point_gb18030_index(item);
  222. // 6. If pointer is non-null, then:
  223. if (pointer.has_value()) {
  224. // 1. Let lead be pointer / 190 + 0x81.
  225. auto lead = *pointer / 190 + 0x81;
  226. // 2. Let trail be pointer % 190.
  227. auto trail = *pointer % 190;
  228. // 3. Let offset be 0x40 if trail is less than 0x3F, otherwise 0x41.
  229. auto offset = 0x41;
  230. if (trail < 0x3f)
  231. offset = 0x40;
  232. // 4. Return two bytes whose values are lead and trail + offset.
  233. TRY(on_byte(static_cast<u8>(lead)));
  234. TRY(on_byte(static_cast<u8>(trail + offset)));
  235. continue;
  236. }
  237. // 7. If is GBK is true, return error with code point.
  238. if (gbk) {
  239. // TODO: Report error.
  240. continue;
  241. }
  242. // 8. Set pointer to the index gb18030 ranges pointer for code point.
  243. pointer = index_gb18030_ranges_pointer(item);
  244. // 9. Let byte1 be pointer / (10 × 126 × 10).
  245. auto byte1 = *pointer / (10 * 126 * 10);
  246. // 10. Set pointer to pointer % (10 × 126 × 10).
  247. pointer = *pointer % (10 * 126 * 10);
  248. // 11. Let byte2 be pointer / (10 × 126).
  249. auto byte2 = *pointer / (10 * 126);
  250. // 12. Set pointer to pointer % (10 × 126).
  251. pointer = *pointer % (10 * 126);
  252. // 13. Let byte3 be pointer / 10.
  253. auto byte3 = *pointer / 10;
  254. // 14. Let byte4 be pointer % 10.
  255. auto byte4 = *pointer % 10;
  256. // 15. Return four bytes whose values are byte1 + 0x81, byte2 + 0x30, byte3 + 0x81, byte4 + 0x30.
  257. TRY(on_byte(static_cast<u8>(byte1 + 0x81)));
  258. TRY(on_byte(static_cast<u8>(byte2 + 0x30)));
  259. TRY(on_byte(static_cast<u8>(byte3 + 0x81)));
  260. TRY(on_byte(static_cast<u8>(byte4 + 0x30)));
  261. }
  262. return {};
  263. }
  264. }