BrotliDictionary.cpp

/*
 * Copyright (c) 2022, Michiel Visser <opensource@webmichiel.nl>
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */

#include <AK/Types.h>
#include <LibCompress/BrotliDictionary.h>

// Include the 119.9 KiB of dictionary data from a binary file
extern u8 const brotli_dictionary_data[];

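// Define the symbol and embed the dictionary blob with .incbin below; the section
// directives and symbol naming differ between Mach-O, Emscripten, and ELF targets.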
#if defined(AK_OS_MACOS)
asm(".const_data\n"
    ".globl _brotli_dictionary_data\n"
    "_brotli_dictionary_data:\n");
#elif defined(AK_OS_EMSCRIPTEN)
asm(".section .data, \"\",@\n"
    ".global brotli_dictionary_data\n"
    "brotli_dictionary_data:\n");
#else
asm(".section .rodata\n"
    ".global brotli_dictionary_data\n"
    "brotli_dictionary_data:\n");
#endif
asm(".incbin \"" __FILE__ ".dict.bin\"\n"
#if (!defined(AK_OS_WINDOWS) && !defined(AK_OS_EMSCRIPTEN))
    ".previous\n");
#else
    );
#endif

namespace Compress {

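// Number of word-index bits and byte offset into brotli_dictionary_data for each word
// length 4..24, per the Brotli static dictionary layout (RFC 7932, section 8).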
static size_t const bits_by_length[25] {
    0, 0, 0, 0, 10, 10, 11, 11, 10, 10, 10, 10, 10, 9, 9, 8, 7, 7, 8, 7, 7, 6, 6, 5, 5
};

static size_t const offset_by_length[25] {
    0, 0, 0, 0, 0, 4096, 9216, 21504, 35840, 44032, 53248, 63488, 74752, 87040, 93696, 100864,
    104704, 106752, 108928, 113536, 115968, 118528, 119872, 121280, 122016
};

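// "Fermentation" is Brotli's uppercasing transform: it flips the case of an ASCII
// lowercase letter, tweaks the trailing bytes of multi-byte UTF-8 sequences, and
// returns the number of bytes consumed.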
static int ferment(Bytes word, size_t pos)
{
    if (word[pos] < 192) {
        if (word[pos] >= 97 && word[pos] <= 122) {
            word[pos] = word[pos] ^ 32;
        }
        return 1;
    } else if (word[pos] < 224) {
        if (pos + 1 < word.size()) {
            word[pos + 1] = word[pos + 1] ^ 32;
        }
        return 2;
    } else {
        if (pos + 2 < word.size()) {
            word[pos + 2] = word[pos + 2] ^ 5;
        }
        return 3;
    }
}

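// Apply the transform to only the first character of the word, or to every character.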
static void ferment_first(Bytes word)
{
    if (word.size() > 0) {
        ferment(word, 0);
    }
}

[[maybe_unused]] static void ferment_all(Bytes word)
{
    size_t i = 0;
    while (i < word.size()) {
        i += ferment(word, i);
    }
}

using BrotliDictionary::TransformationOperation::FermentAll;
using BrotliDictionary::TransformationOperation::FermentFirst;
using BrotliDictionary::TransformationOperation::Identity;
using BrotliDictionary::TransformationOperation::OmitFirst;
using BrotliDictionary::TransformationOperation::OmitLast;

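// The 121 word transformations from RFC 7932, Appendix B: each is a prefix, an
// operation applied to the base word, and a suffix.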
constexpr static BrotliDictionary::Transformation transformations[121] {
    // ID Prefix Transform Suffix
    // -- ------ --------- ------
    { ""sv, Identity, 0, ""sv }, // 0 "" Identity ""
    { ""sv, Identity, 0, " "sv }, // 1 "" Identity " "
    { " "sv, Identity, 0, " "sv }, // 2 " " Identity " "
    { ""sv, OmitFirst, 1, ""sv }, // 3 "" OmitFirst1 ""
    { ""sv, FermentFirst, 0, " "sv }, // 4 "" FermentFirst " "
    { ""sv, Identity, 0, " the "sv }, // 5 "" Identity " the "
    { " "sv, Identity, 0, ""sv }, // 6 " " Identity ""
    { "s "sv, Identity, 0, " "sv }, // 7 "s " Identity " "
    { ""sv, Identity, 0, " of "sv }, // 8 "" Identity " of "
    { ""sv, FermentFirst, 0, ""sv }, // 9 "" FermentFirst ""
    { ""sv, Identity, 0, " and "sv }, // 10 "" Identity " and "
    { ""sv, OmitFirst, 2, ""sv }, // 11 "" OmitFirst2 ""
    { ""sv, OmitLast, 1, ""sv }, // 12 "" OmitLast1 ""
    { ", "sv, Identity, 0, " "sv }, // 13 ", " Identity " "
    { ""sv, Identity, 0, ", "sv }, // 14 "" Identity ", "
    { " "sv, FermentFirst, 0, " "sv }, // 15 " " FermentFirst " "
    { ""sv, Identity, 0, " in "sv }, // 16 "" Identity " in "
    { ""sv, Identity, 0, " to "sv }, // 17 "" Identity " to "
    { "e "sv, Identity, 0, " "sv }, // 18 "e " Identity " "
    { ""sv, Identity, 0, "\""sv }, // 19 "" Identity "\""
    { ""sv, Identity, 0, "."sv }, // 20 "" Identity "."
    { ""sv, Identity, 0, "\">"sv }, // 21 "" Identity "\">"
    { ""sv, Identity, 0, "\n"sv }, // 22 "" Identity "\n"
    { ""sv, OmitLast, 3, ""sv }, // 23 "" OmitLast3 ""
    { ""sv, Identity, 0, "]"sv }, // 24 "" Identity "]"
    { ""sv, Identity, 0, " for "sv }, // 25 "" Identity " for "
    { ""sv, OmitFirst, 3, ""sv }, // 26 "" OmitFirst3 ""
    { ""sv, OmitLast, 2, ""sv }, // 27 "" OmitLast2 ""
    { ""sv, Identity, 0, " a "sv }, // 28 "" Identity " a "
    { ""sv, Identity, 0, " that "sv }, // 29 "" Identity " that "
    { " "sv, FermentFirst, 0, ""sv }, // 30 " " FermentFirst ""
    { ""sv, Identity, 0, ". "sv }, // 31 "" Identity ". "
    { "."sv, Identity, 0, ""sv }, // 32 "." Identity ""
    { " "sv, Identity, 0, ", "sv }, // 33 " " Identity ", "
    { ""sv, OmitFirst, 4, ""sv }, // 34 "" OmitFirst4 ""
    { ""sv, Identity, 0, " with "sv }, // 35 "" Identity " with "
    { ""sv, Identity, 0, "'"sv }, // 36 "" Identity "'"
    { ""sv, Identity, 0, " from "sv }, // 37 "" Identity " from "
    { ""sv, Identity, 0, " by "sv }, // 38 "" Identity " by "
    { ""sv, OmitFirst, 5, ""sv }, // 39 "" OmitFirst5 ""
    { ""sv, OmitFirst, 6, ""sv }, // 40 "" OmitFirst6 ""
    { " the "sv, Identity, 0, ""sv }, // 41 " the " Identity ""
    { ""sv, OmitLast, 4, ""sv }, // 42 "" OmitLast4 ""
    { ""sv, Identity, 0, ". The "sv }, // 43 "" Identity ". The "
    { ""sv, FermentAll, 0, ""sv }, // 44 "" FermentAll ""
    { ""sv, Identity, 0, " on "sv }, // 45 "" Identity " on "
    { ""sv, Identity, 0, " as "sv }, // 46 "" Identity " as "
    { ""sv, Identity, 0, " is "sv }, // 47 "" Identity " is "
    { ""sv, OmitLast, 7, ""sv }, // 48 "" OmitLast7 ""
    { ""sv, OmitLast, 1, "ing "sv }, // 49 "" OmitLast1 "ing "
    { ""sv, Identity, 0, "\n\t"sv }, // 50 "" Identity "\n\t"
    { ""sv, Identity, 0, ":"sv }, // 51 "" Identity ":"
    { " "sv, Identity, 0, ". "sv }, // 52 " " Identity ". "
    { ""sv, Identity, 0, "ed "sv }, // 53 "" Identity "ed "
    { ""sv, OmitFirst, 9, ""sv }, // 54 "" OmitFirst9 ""
    { ""sv, OmitFirst, 7, ""sv }, // 55 "" OmitFirst7 ""
    { ""sv, OmitLast, 6, ""sv }, // 56 "" OmitLast6 ""
    { ""sv, Identity, 0, "("sv }, // 57 "" Identity "("
    { ""sv, FermentFirst, 0, ", "sv }, // 58 "" FermentFirst ", "
    { ""sv, OmitLast, 8, ""sv }, // 59 "" OmitLast8 ""
    { ""sv, Identity, 0, " at "sv }, // 60 "" Identity " at "
    { ""sv, Identity, 0, "ly "sv }, // 61 "" Identity "ly "
    { " the "sv, Identity, 0, " of "sv }, // 62 " the " Identity " of "
    { ""sv, OmitLast, 5, ""sv }, // 63 "" OmitLast5 ""
    { ""sv, OmitLast, 9, ""sv }, // 64 "" OmitLast9 ""
    { " "sv, FermentFirst, 0, ", "sv }, // 65 " " FermentFirst ", "
    { ""sv, FermentFirst, 0, "\""sv }, // 66 "" FermentFirst "\""
    { "."sv, Identity, 0, "("sv }, // 67 "." Identity "("
    { ""sv, FermentAll, 0, " "sv }, // 68 "" FermentAll " "
    { ""sv, FermentFirst, 0, "\">"sv }, // 69 "" FermentFirst "\">"
    { ""sv, Identity, 0, "=\""sv }, // 70 "" Identity "=\""
    { " "sv, Identity, 0, "."sv }, // 71 " " Identity "."
    { ".com/"sv, Identity, 0, ""sv }, // 72 ".com/" Identity ""
    { " the "sv, Identity, 0, " of the "sv }, // 73 " the " Identity " of the "
    { ""sv, FermentFirst, 0, "'"sv }, // 74 "" FermentFirst "'"
    { ""sv, Identity, 0, ". This "sv }, // 75 "" Identity ". This "
    { ""sv, Identity, 0, ","sv }, // 76 "" Identity ","
    { "."sv, Identity, 0, " "sv }, // 77 "." Identity " "
    { ""sv, FermentFirst, 0, "("sv }, // 78 "" FermentFirst "("
    { ""sv, FermentFirst, 0, "."sv }, // 79 "" FermentFirst "."
    { ""sv, Identity, 0, " not "sv }, // 80 "" Identity " not "
    { " "sv, Identity, 0, "=\""sv }, // 81 " " Identity "=\""
    { ""sv, Identity, 0, "er "sv }, // 82 "" Identity "er "
    { " "sv, FermentAll, 0, " "sv }, // 83 " " FermentAll " "
    { ""sv, Identity, 0, "al "sv }, // 84 "" Identity "al "
    { " "sv, FermentAll, 0, ""sv }, // 85 " " FermentAll ""
    { ""sv, Identity, 0, "='"sv }, // 86 "" Identity "='"
    { ""sv, FermentAll, 0, "\""sv }, // 87 "" FermentAll "\""
    { ""sv, FermentFirst, 0, ". "sv }, // 88 "" FermentFirst ". "
    { " "sv, Identity, 0, "("sv }, // 89 " " Identity "("
    { ""sv, Identity, 0, "ful "sv }, // 90 "" Identity "ful "
    { " "sv, FermentFirst, 0, ". "sv }, // 91 " " FermentFirst ". "
    { ""sv, Identity, 0, "ive "sv }, // 92 "" Identity "ive "
    { ""sv, Identity, 0, "less "sv }, // 93 "" Identity "less "
    { ""sv, FermentAll, 0, "'"sv }, // 94 "" FermentAll "'"
    { ""sv, Identity, 0, "est "sv }, // 95 "" Identity "est "
    { " "sv, FermentFirst, 0, "."sv }, // 96 " " FermentFirst "."
    { ""sv, FermentAll, 0, "\">"sv }, // 97 "" FermentAll "\">"
    { " "sv, Identity, 0, "='"sv }, // 98 " " Identity "='"
    { ""sv, FermentFirst, 0, ","sv }, // 99 "" FermentFirst ","
    { ""sv, Identity, 0, "ize "sv }, // 100 "" Identity "ize "
    { ""sv, FermentAll, 0, "."sv }, // 101 "" FermentAll "."
    { "\xc2\xa0"sv, Identity, 0, ""sv }, // 102 "\xc2\xa0" Identity ""
    { " "sv, Identity, 0, ","sv }, // 103 " " Identity ","
    { ""sv, FermentFirst, 0, "=\""sv }, // 104 "" FermentFirst "=\""
    { ""sv, FermentAll, 0, "=\""sv }, // 105 "" FermentAll "=\""
    { ""sv, Identity, 0, "ous "sv }, // 106 "" Identity "ous "
    { ""sv, FermentAll, 0, ", "sv }, // 107 "" FermentAll ", "
    { ""sv, FermentFirst, 0, "='"sv }, // 108 "" FermentFirst "='"
    { " "sv, FermentFirst, 0, ","sv }, // 109 " " FermentFirst ","
    { " "sv, FermentAll, 0, "=\""sv }, // 110 " " FermentAll "=\""
    { " "sv, FermentAll, 0, ", "sv }, // 111 " " FermentAll ", "
    { ""sv, FermentAll, 0, ","sv }, // 112 "" FermentAll ","
    { ""sv, FermentAll, 0, "("sv }, // 113 "" FermentAll "("
    { ""sv, FermentAll, 0, ". "sv }, // 114 "" FermentAll ". "
    { " "sv, FermentAll, 0, "."sv }, // 115 " " FermentAll "."
    { ""sv, FermentAll, 0, "='"sv }, // 116 "" FermentAll "='"
    { " "sv, FermentAll, 0, ". "sv }, // 117 " " FermentAll ". "
    { " "sv, FermentFirst, 0, "=\""sv }, // 118 " " FermentFirst "=\""
    { " "sv, FermentAll, 0, "='"sv }, // 119 " " FermentAll "='"
    { " "sv, FermentFirst, 0, "='"sv }, // 120 " " FermentFirst "='"
};

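// Looks up a static dictionary word addressed by (index, length); the decoder takes this
// path when a backward reference with copy length 4..24 reaches beyond the available data.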
ErrorOr<ByteBuffer> BrotliDictionary::lookup_word(size_t index, size_t length)
{
    if (length < 4 || length > 24)
        return Error::from_string_literal("invalid dictionary lookup length");

  203. size_t word_index = index % (1 << bits_by_length[length]);
  204. ReadonlyBytes base_word { brotli_dictionary_data + offset_by_length[length] + (word_index * length), length };
  205. size_t transform_id = index >> bits_by_length[length];
  206. if (transform_id >= 121)
  207. return Error::from_string_literal("invalid dictionary transformation");
  208. auto transformation = transformations[transform_id];
  209. ByteBuffer bb;
  210. bb.append(transformation.prefix.bytes());
  211. size_t prefix_length = bb.size();
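    // Fermentation below is applied only to the copied base word, never to the prefix.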
    switch (transformation.operation) {
    case TransformationOperation::Identity:
        bb.append(base_word);
        break;
    case TransformationOperation::FermentFirst:
        bb.append(base_word);
        ferment_first(bb.bytes().slice(prefix_length));
        break;
    case TransformationOperation::FermentAll:
        bb.append(base_word);
        ferment_all(bb.bytes().slice(prefix_length));
        break;
    case TransformationOperation::OmitFirst:
        if (transformation.operation_data < base_word.size())
            bb.append(base_word.slice(transformation.operation_data));
        break;
    case TransformationOperation::OmitLast:
        if (transformation.operation_data < base_word.size())
            bb.append(base_word.slice(0, base_word.size() - transformation.operation_data));
        break;
    }

    bb.append(transformation.suffix.bytes());
    return bb;
}

}