Deflate.cpp 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450
  1. /*
  2. * Copyright (c) 2020, the SerenityOS developers
  3. * All rights reserved.
  4. *
  5. * Redistribution and use in source and binary forms, with or without
  6. * modification, are permitted provided that the following conditions are met:
  7. *
  8. * 1. Redistributions of source code must retain the above copyright notice, this
  9. * list of conditions and the following disclaimer.
  10. *
  11. * 2. Redistributions in binary form must reproduce the above copyright notice,
  12. * this list of conditions and the following disclaimer in the documentation
  13. * and/or other materials provided with the distribution.
  14. *
  15. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  16. * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  17. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  18. * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
  19. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  20. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  21. * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  22. * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  23. * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  24. * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  25. */
  26. #include <AK/Assertions.h>
  27. #include <AK/BinarySearch.h>
  28. #include <AK/FixedArray.h>
  29. #include <AK/LogStream.h>
  30. #include <AK/MemoryStream.h>
  31. #include <LibCompress/Deflate.h>
  32. namespace Compress {
  33. const DeflateDecompressor::CanonicalCode& DeflateDecompressor::CanonicalCode::fixed_literal_codes()
  34. {
  35. static CanonicalCode code;
  36. static bool initialized = false;
  37. if (initialized)
  38. return code;
  39. FixedArray<u8> data { 288 };
  40. data.bytes().slice(0, 144 - 0).fill(8);
  41. data.bytes().slice(144, 256 - 144).fill(9);
  42. data.bytes().slice(256, 280 - 256).fill(7);
  43. data.bytes().slice(280, 288 - 280).fill(8);
  44. code = CanonicalCode::from_bytes(data).value();
  45. initialized = true;
  46. return code;
  47. }
  48. const DeflateDecompressor::CanonicalCode& DeflateDecompressor::CanonicalCode::fixed_distance_codes()
  49. {
  50. static CanonicalCode code;
  51. static bool initialized = false;
  52. if (initialized)
  53. return code;
  54. FixedArray<u8> data { 32 };
  55. data.bytes().fill(5);
  56. code = CanonicalCode::from_bytes(data).value();
  57. initialized = true;
  58. return code;
  59. }
  60. Optional<DeflateDecompressor::CanonicalCode> DeflateDecompressor::CanonicalCode::from_bytes(ReadonlyBytes bytes)
  61. {
  62. // FIXME: I can't quite follow the algorithm here, but it seems to work.
  63. CanonicalCode code;
  64. auto next_code = 0;
  65. for (size_t code_length = 1; code_length <= 15; ++code_length) {
  66. next_code <<= 1;
  67. auto start_bit = 1 << code_length;
  68. for (size_t symbol = 0; symbol < bytes.size(); ++symbol) {
  69. if (bytes[symbol] != code_length)
  70. continue;
  71. if (next_code > start_bit)
  72. return {};
  73. code.m_symbol_codes.append(start_bit | next_code);
  74. code.m_symbol_values.append(symbol);
  75. next_code++;
  76. }
  77. }
  78. if (next_code != (1 << 15)) {
  79. return {};
  80. }
  81. return code;
  82. }
  83. u32 DeflateDecompressor::CanonicalCode::read_symbol(InputBitStream& stream) const
  84. {
  85. u32 code_bits = 1;
  86. for (;;) {
  87. code_bits = code_bits << 1 | stream.read_bits(1);
  88. // FIXME: This seems really inefficent, this could be an index into an array instead.
  89. size_t index;
  90. if (AK::binary_search(m_symbol_codes.span(), code_bits, AK::integral_compare<u32>, &index))
  91. return m_symbol_values[index];
  92. }
  93. }
  94. DeflateDecompressor::CompressedBlock::CompressedBlock(DeflateDecompressor& decompressor, CanonicalCode literal_codes, Optional<CanonicalCode> distance_codes)
  95. : m_decompressor(decompressor)
  96. , m_literal_codes(literal_codes)
  97. , m_distance_codes(distance_codes)
  98. {
  99. }
  100. bool DeflateDecompressor::CompressedBlock::try_read_more()
  101. {
  102. if (m_eof == true)
  103. return false;
  104. const auto symbol = m_literal_codes.read_symbol(m_decompressor.m_input_stream);
  105. if (symbol < 256) {
  106. m_decompressor.m_output_stream << static_cast<u8>(symbol);
  107. return true;
  108. } else if (symbol == 256) {
  109. m_eof = true;
  110. return false;
  111. } else {
  112. // FIXME: This assertion depends on user input.
  113. ASSERT(m_distance_codes.has_value());
  114. const auto run_length = m_decompressor.decode_run_length(symbol);
  115. const auto distance = m_decompressor.decode_distance(m_distance_codes.value().read_symbol(m_decompressor.m_input_stream));
  116. size_t nread = 0;
  117. while (nread < run_length) {
  118. const auto nreserved = min(run_length - nread, m_decompressor.m_output_stream.remaining_contigous_space());
  119. auto bytes = m_decompressor.m_output_stream.reserve_contigous_space(nreserved);
  120. nread += m_decompressor.m_output_stream.read(bytes, distance + nread + nreserved);
  121. }
  122. return true;
  123. }
  124. }
  125. DeflateDecompressor::UncompressedBlock::UncompressedBlock(DeflateDecompressor& decompressor, size_t length)
  126. : m_decompressor(decompressor)
  127. , m_bytes_remaining(length)
  128. {
  129. }
  130. bool DeflateDecompressor::UncompressedBlock::try_read_more()
  131. {
  132. if (m_bytes_remaining == 0)
  133. return false;
  134. const auto nread = min(m_bytes_remaining, m_decompressor.m_output_stream.remaining_contigous_space());
  135. m_bytes_remaining -= nread;
  136. m_decompressor.m_input_stream >> m_decompressor.m_output_stream.reserve_contigous_space(nread);
  137. return true;
  138. }
  139. DeflateDecompressor::DeflateDecompressor(InputStream& stream)
  140. : m_input_stream(stream)
  141. {
  142. }
  143. DeflateDecompressor::~DeflateDecompressor()
  144. {
  145. if (m_state == State::ReadingCompressedBlock)
  146. m_compressed_block.~CompressedBlock();
  147. if (m_state == State::ReadingUncompressedBlock)
  148. m_uncompressed_block.~UncompressedBlock();
  149. }
  150. size_t DeflateDecompressor::read(Bytes bytes)
  151. {
  152. // FIXME: There are surely a ton of bugs because we don't check for read errors
  153. // very often.
  154. if (m_state == State::Idle) {
  155. if (m_read_final_bock)
  156. return 0;
  157. m_read_final_bock = m_input_stream.read_bit();
  158. const auto block_type = m_input_stream.read_bits(2);
  159. if (block_type == 0b00) {
  160. m_input_stream.align_to_byte_boundary();
  161. LittleEndian<u16> length, negated_length;
  162. m_input_stream >> length >> negated_length;
  163. if ((length ^ 0xffff) != negated_length) {
  164. set_fatal_error();
  165. return 0;
  166. }
  167. m_state = State::ReadingUncompressedBlock;
  168. new (&m_uncompressed_block) UncompressedBlock(*this, length);
  169. return read(bytes);
  170. }
  171. if (block_type == 0b01) {
  172. m_state = State::ReadingCompressedBlock;
  173. new (&m_compressed_block) CompressedBlock(*this, CanonicalCode::fixed_literal_codes(), CanonicalCode::fixed_distance_codes());
  174. return read(bytes);
  175. }
  176. if (block_type == 0b10) {
  177. CanonicalCode literal_codes;
  178. Optional<CanonicalCode> distance_codes;
  179. decode_codes(literal_codes, distance_codes);
  180. m_state = State::ReadingCompressedBlock;
  181. new (&m_compressed_block) CompressedBlock(*this, literal_codes, distance_codes);
  182. return read(bytes);
  183. }
  184. set_fatal_error();
  185. return 0;
  186. }
  187. if (m_state == State::ReadingCompressedBlock) {
  188. auto nread = m_output_stream.read(bytes);
  189. while (nread < bytes.size() && m_compressed_block.try_read_more()) {
  190. nread += m_output_stream.read(bytes.slice(nread));
  191. }
  192. if (nread == bytes.size())
  193. return nread;
  194. m_compressed_block.~CompressedBlock();
  195. m_state = State::Idle;
  196. return nread + read(bytes.slice(nread));
  197. }
  198. if (m_state == State::ReadingUncompressedBlock) {
  199. auto nread = m_output_stream.read(bytes);
  200. while (nread < bytes.size() && m_uncompressed_block.try_read_more()) {
  201. nread += m_output_stream.read(bytes.slice(nread));
  202. }
  203. if (nread == bytes.size())
  204. return nread;
  205. m_uncompressed_block.~UncompressedBlock();
  206. m_state = State::Idle;
  207. return nread + read(bytes.slice(nread));
  208. }
  209. ASSERT_NOT_REACHED();
  210. }
  211. bool DeflateDecompressor::read_or_error(Bytes bytes)
  212. {
  213. if (read(bytes) < bytes.size()) {
  214. set_fatal_error();
  215. return false;
  216. }
  217. return true;
  218. }
  219. bool DeflateDecompressor::discard_or_error(size_t count)
  220. {
  221. u8 buffer[4096];
  222. size_t ndiscarded = 0;
  223. while (ndiscarded < count) {
  224. if (eof()) {
  225. set_fatal_error();
  226. return false;
  227. }
  228. ndiscarded += read({ buffer, min<size_t>(count - ndiscarded, 4096) });
  229. }
  230. return true;
  231. }
  232. bool DeflateDecompressor::eof() const { return m_state == State::Idle && m_read_final_bock; }
  233. ByteBuffer DeflateDecompressor::decompress_all(ReadonlyBytes bytes)
  234. {
  235. InputMemoryStream memory_stream { bytes };
  236. InputBitStream bit_stream { memory_stream };
  237. DeflateDecompressor deflate_stream { bit_stream };
  238. auto buffer = ByteBuffer::create_uninitialized(4096);
  239. size_t nread = 0;
  240. while (!deflate_stream.eof()) {
  241. nread += deflate_stream.read(buffer.bytes().slice(nread));
  242. if (buffer.size() - nread < 4096)
  243. buffer.grow(buffer.size() + 4096);
  244. }
  245. buffer.trim(nread);
  246. return buffer;
  247. }
  248. u32 DeflateDecompressor::decode_run_length(u32 symbol)
  249. {
  250. // FIXME: I can't quite follow the algorithm here, but it seems to work.
  251. if (symbol <= 264)
  252. return symbol - 254;
  253. if (symbol <= 284) {
  254. auto extra_bits = (symbol - 261) / 4;
  255. return (((symbol - 265) % 4 + 4) << extra_bits) + 3 + m_input_stream.read_bits(extra_bits);
  256. }
  257. if (symbol == 285)
  258. return 258;
  259. ASSERT_NOT_REACHED();
  260. }
  261. u32 DeflateDecompressor::decode_distance(u32 symbol)
  262. {
  263. // FIXME: I can't quite follow the algorithm here, but it seems to work.
  264. if (symbol <= 3)
  265. return symbol + 1;
  266. if (symbol <= 29) {
  267. auto extra_bits = (symbol / 2) - 1;
  268. return ((symbol % 2 + 2) << extra_bits) + 1 + m_input_stream.read_bits(extra_bits);
  269. }
  270. ASSERT_NOT_REACHED();
  271. }
  272. void DeflateDecompressor::decode_codes(CanonicalCode& literal_code, Optional<CanonicalCode>& distance_code)
  273. {
  274. auto literal_code_count = m_input_stream.read_bits(5) + 257;
  275. auto distance_code_count = m_input_stream.read_bits(5) + 1;
  276. auto code_length_count = m_input_stream.read_bits(4) + 4;
  277. // First we have to extract the code lengths of the code that was used to encode the code lengths of
  278. // the code that was used to encode the block.
  279. u8 code_lengths_code_lengths[19] = { 0 };
  280. for (size_t i = 0; i < code_length_count; ++i) {
  281. static const size_t indices[] { 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 };
  282. code_lengths_code_lengths[indices[i]] = m_input_stream.read_bits(3);
  283. }
  284. // Now we can extract the code that was used to encode the code lengths of the code that was used to
  285. // encode the block.
  286. auto code_length_code_result = CanonicalCode::from_bytes({ code_lengths_code_lengths, sizeof(code_lengths_code_lengths) });
  287. if (!code_length_code_result.has_value()) {
  288. set_fatal_error();
  289. return;
  290. }
  291. const auto code_length_code = code_length_code_result.value();
  292. // Next we extract the code lengths of the code that was used to encode the block.
  293. Vector<u8> code_lengths;
  294. while (code_lengths.size() < literal_code_count + distance_code_count) {
  295. auto symbol = code_length_code.read_symbol(m_input_stream);
  296. if (symbol <= 15) {
  297. code_lengths.append(static_cast<u8>(symbol));
  298. continue;
  299. } else if (symbol == 17) {
  300. auto nrepeat = 3 + m_input_stream.read_bits(3);
  301. for (size_t j = 0; j < nrepeat; ++j)
  302. code_lengths.append(0);
  303. continue;
  304. } else if (symbol == 18) {
  305. auto nrepeat = 11 + m_input_stream.read_bits(7);
  306. for (size_t j = 0; j < nrepeat; ++j)
  307. code_lengths.append(0);
  308. continue;
  309. } else {
  310. ASSERT(symbol == 16);
  311. if (code_lengths.is_empty()) {
  312. set_fatal_error();
  313. return;
  314. }
  315. auto nrepeat = 3 + m_input_stream.read_bits(2);
  316. for (size_t j = 0; j < nrepeat; ++j)
  317. code_lengths.append(code_lengths.last());
  318. }
  319. }
  320. if (code_lengths.size() != literal_code_count + distance_code_count) {
  321. set_fatal_error();
  322. return;
  323. }
  324. // Now we extract the code that was used to encode literals and lengths in the block.
  325. auto literal_code_result = CanonicalCode::from_bytes(code_lengths.span().trim(literal_code_count));
  326. if (!literal_code_result.has_value()) {
  327. set_fatal_error();
  328. return;
  329. }
  330. literal_code = literal_code_result.value();
  331. // Now we extract the code that was used to encode distances in the block.
  332. if (distance_code_count == 1) {
  333. auto length = code_lengths[literal_code_count];
  334. if (length == 0) {
  335. return;
  336. } else if (length != 1) {
  337. set_fatal_error();
  338. return;
  339. }
  340. }
  341. auto distance_code_result = CanonicalCode::from_bytes(code_lengths.span().slice(literal_code_count));
  342. if (!distance_code_result.has_value()) {
  343. set_fatal_error();
  344. return;
  345. }
  346. distance_code = distance_code_result.value();
  347. }
  348. }