Parser.cpp 39 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257
  1. /*
  2. * Copyright (c) 2021, Matthew Olsson <mattco@serenityos.org>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <AK/BitStream.h>
  7. #include <AK/MemoryStream.h>
  8. #include <AK/ScopeGuard.h>
  9. #include <AK/TypeCasts.h>
  10. #include <LibPDF/CommonNames.h>
  11. #include <LibPDF/Document.h>
  12. #include <LibPDF/Filter.h>
  13. #include <LibPDF/Parser.h>
  14. #include <LibTextCodec/Decoder.h>
  15. #include <ctype.h>
  16. namespace PDF {
  17. template<typename T, typename... Args>
  18. static NonnullRefPtr<T> make_object(Args... args) requires(IsBaseOf<Object, T>)
  19. {
  20. return adopt_ref(*new T(forward<Args>(args)...));
  21. }
  22. Vector<Command> Parser::parse_graphics_commands(ReadonlyBytes bytes)
  23. {
  24. auto parser = adopt_ref(*new Parser(bytes));
  25. return parser->parse_graphics_commands();
  26. }
  27. Parser::Parser(Badge<Document>, ReadonlyBytes bytes)
  28. : m_reader(bytes)
  29. {
  30. }
  31. Parser::Parser(ReadonlyBytes bytes)
  32. : m_reader(bytes)
  33. {
  34. }
  35. void Parser::set_document(WeakPtr<Document> const& document)
  36. {
  37. m_document = document;
  38. }
  39. bool Parser::initialize()
  40. {
  41. if (!parse_header())
  42. return {};
  43. const auto result = initialize_linearization_dict();
  44. if (result == LinearizationResult::Error)
  45. return {};
  46. if (result == LinearizationResult::NotLinearized)
  47. return initialize_non_linearized_xref_table();
  48. bool is_linearized = m_linearization_dictionary.has_value();
  49. if (is_linearized) {
  50. // The file may have been linearized at one point, but could have been updated afterwards,
  51. // which means it is no longer a linearized PDF file.
  52. is_linearized = m_linearization_dictionary.value().length_of_file == m_reader.bytes().size();
  53. if (!is_linearized) {
  54. // FIXME: The file shouldn't be treated as linearized, yet the xref tables are still
  55. // split. This might take some tweaking to ensure correct behavior, which can be
  56. // implemented later.
  57. TODO();
  58. }
  59. }
  60. if (is_linearized)
  61. return initialize_linearized_xref_table();
  62. return initialize_non_linearized_xref_table();
  63. }
  64. Value Parser::parse_object_with_index(u32 index)
  65. {
  66. VERIFY(m_xref_table->has_object(index));
  67. auto byte_offset = m_xref_table->byte_offset_for_object(index);
  68. m_reader.move_to(byte_offset);
  69. auto indirect_value = parse_indirect_value();
  70. VERIFY(indirect_value);
  71. VERIFY(indirect_value->index() == index);
  72. return indirect_value->value();
  73. }
  74. bool Parser::parse_header()
  75. {
  76. // FIXME: Do something with the version?
  77. m_reader.set_reading_forwards();
  78. if (m_reader.remaining() == 0)
  79. return false;
  80. m_reader.move_to(0);
  81. if (m_reader.remaining() < 8 || !m_reader.matches("%PDF-"))
  82. return false;
  83. m_reader.move_by(5);
  84. char major_ver = m_reader.read();
  85. if (major_ver != '1' && major_ver != '2')
  86. return false;
  87. if (m_reader.read() != '.')
  88. return false;
  89. char minor_ver = m_reader.read();
  90. if (minor_ver < '0' || minor_ver > '7')
  91. return false;
  92. consume_eol();
  93. // Parse optional high-byte comment, which signifies a binary file
  94. // FIXME: Do something with this?
  95. auto comment = parse_comment();
  96. if (!comment.is_empty()) {
  97. auto binary = comment.length() >= 4;
  98. if (binary) {
  99. for (size_t i = 0; i < comment.length() && binary; i++)
  100. binary = static_cast<u8>(comment[i]) > 128;
  101. }
  102. }
  103. return true;
  104. }
  105. Parser::LinearizationResult Parser::initialize_linearization_dict()
  106. {
  107. // parse_header() is called immediately before this, so we are at the right location
  108. auto dict_value = m_document->resolve(parse_indirect_value());
  109. if (!dict_value.has<NonnullRefPtr<Object>>())
  110. return LinearizationResult::Error;
  111. auto dict_object = dict_value.get<NonnullRefPtr<Object>>();
  112. if (!dict_object->is_dict())
  113. return LinearizationResult::NotLinearized;
  114. auto dict = object_cast<DictObject>(dict_object);
  115. if (!dict->contains(CommonNames::Linearized))
  116. return LinearizationResult::NotLinearized;
  117. if (!dict->contains(CommonNames::L, CommonNames::H, CommonNames::O, CommonNames::E, CommonNames::N, CommonNames::T))
  118. return LinearizationResult::Error;
  119. auto length_of_file = dict->get_value(CommonNames::L);
  120. auto hint_table = dict->get_value(CommonNames::H);
  121. auto first_page_object_number = dict->get_value(CommonNames::O);
  122. auto offset_of_first_page_end = dict->get_value(CommonNames::E);
  123. auto number_of_pages = dict->get_value(CommonNames::N);
  124. auto offset_of_main_xref_table = dict->get_value(CommonNames::T);
  125. auto first_page = dict->get(CommonNames::P).value_or({});
  126. // Validation
  127. if (!length_of_file.has_u32()
  128. || !hint_table.has<NonnullRefPtr<Object>>()
  129. || !first_page_object_number.has_u32()
  130. || !number_of_pages.has_u16()
  131. || !offset_of_main_xref_table.has_u32()
  132. || (!first_page.has<Empty>() && !first_page.has_u32())) {
  133. return LinearizationResult::Error;
  134. }
  135. auto hint_table_object = hint_table.get<NonnullRefPtr<Object>>();
  136. if (!hint_table_object->is_array())
  137. return LinearizationResult::Error;
  138. auto hint_table_array = object_cast<ArrayObject>(hint_table_object);
  139. auto hint_table_size = hint_table_array->size();
  140. if (hint_table_size != 2 && hint_table_size != 4)
  141. return LinearizationResult::Error;
  142. auto primary_hint_stream_offset = hint_table_array->at(0);
  143. auto primary_hint_stream_length = hint_table_array->at(1);
  144. Value overflow_hint_stream_offset;
  145. Value overflow_hint_stream_length;
  146. if (hint_table_size == 4) {
  147. overflow_hint_stream_offset = hint_table_array->at(2);
  148. overflow_hint_stream_length = hint_table_array->at(3);
  149. }
  150. if (!primary_hint_stream_offset.has_u32()
  151. || !primary_hint_stream_length.has_u32()
  152. || (!overflow_hint_stream_offset.has<Empty>() && !overflow_hint_stream_offset.has_u32())
  153. || (!overflow_hint_stream_length.has<Empty>() && !overflow_hint_stream_length.has_u32())) {
  154. return LinearizationResult::Error;
  155. }
  156. m_linearization_dictionary = LinearizationDictionary {
  157. length_of_file.get_u32(),
  158. primary_hint_stream_offset.get_u32(),
  159. primary_hint_stream_length.get_u32(),
  160. overflow_hint_stream_offset.has<Empty>() ? NumericLimits<u32>::max() : overflow_hint_stream_offset.get_u32(),
  161. overflow_hint_stream_length.has<Empty>() ? NumericLimits<u32>::max() : overflow_hint_stream_length.get_u32(),
  162. first_page_object_number.get_u32(),
  163. offset_of_first_page_end.get_u32(),
  164. number_of_pages.get_u16(),
  165. offset_of_main_xref_table.get_u32(),
  166. first_page.has<Empty>() ? NumericLimits<u32>::max() : first_page.get_u32(),
  167. };
  168. return LinearizationResult::Linearized;
  169. }
  170. bool Parser::initialize_linearized_xref_table()
  171. {
  172. // The linearization parameter dictionary has just been parsed, and the xref table
  173. // comes immediately after it. We are in the correct spot.
  174. if (!m_reader.matches("xref"))
  175. return false;
  176. m_xref_table = parse_xref_table();
  177. if (!m_xref_table)
  178. return false;
  179. m_trailer = parse_file_trailer();
  180. if (!m_trailer)
  181. return false;
  182. // Also parse the main xref table and merge into the first-page xref table. Note
  183. // that we don't use the main xref table offset from the linearization dict because
  184. // for some reason, it specified the offset of the whitespace after the object
  185. // index start and length? So it's much easier to do it this way.
  186. auto main_xref_table_offset = m_trailer->get_value(CommonNames::Prev).to_int();
  187. m_reader.move_to(main_xref_table_offset);
  188. auto main_xref_table = parse_xref_table();
  189. if (!main_xref_table)
  190. return false;
  191. return m_xref_table->merge(move(*main_xref_table));
  192. }
  193. bool Parser::initialize_hint_tables()
  194. {
  195. auto linearization_dict = m_linearization_dictionary.value();
  196. auto primary_offset = linearization_dict.primary_hint_stream_offset;
  197. auto overflow_offset = linearization_dict.overflow_hint_stream_offset;
  198. auto parse_hint_table = [&](size_t offset) -> RefPtr<StreamObject> {
  199. m_reader.move_to(offset);
  200. auto stream_indirect_value = parse_indirect_value();
  201. if (!stream_indirect_value)
  202. return {};
  203. auto stream_value = stream_indirect_value->value();
  204. if (!stream_value.has<NonnullRefPtr<Object>>())
  205. return {};
  206. auto stream_object = stream_value.get<NonnullRefPtr<Object>>();
  207. if (!stream_object->is_stream())
  208. return {};
  209. return object_cast<StreamObject>(stream_object);
  210. };
  211. auto primary_hint_stream = parse_hint_table(primary_offset);
  212. if (!primary_hint_stream)
  213. return false;
  214. RefPtr<StreamObject> overflow_hint_stream;
  215. if (overflow_offset != NumericLimits<u32>::max())
  216. overflow_hint_stream = parse_hint_table(overflow_offset);
  217. ByteBuffer possible_merged_stream_buffer;
  218. ReadonlyBytes hint_stream_bytes;
  219. if (overflow_hint_stream) {
  220. auto primary_size = primary_hint_stream->bytes().size();
  221. auto overflow_size = overflow_hint_stream->bytes().size();
  222. auto total_size = primary_size + overflow_size;
  223. auto buffer_result = ByteBuffer::create_uninitialized(total_size);
  224. if (!buffer_result.has_value())
  225. return false;
  226. possible_merged_stream_buffer = buffer_result.release_value();
  227. auto ok = !possible_merged_stream_buffer.try_append(primary_hint_stream->bytes()).is_error();
  228. ok = ok && !possible_merged_stream_buffer.try_append(overflow_hint_stream->bytes()).is_error();
  229. if (!ok)
  230. return false;
  231. hint_stream_bytes = possible_merged_stream_buffer.bytes();
  232. } else {
  233. hint_stream_bytes = primary_hint_stream->bytes();
  234. }
  235. auto hint_table = parse_page_offset_hint_table(hint_stream_bytes);
  236. if (!hint_table.has_value())
  237. return false;
  238. dbgln("hint table: {}", hint_table.value());
  239. auto hint_table_entries = parse_all_page_offset_hint_table_entries(hint_table.value(), hint_stream_bytes);
  240. if (!hint_table_entries.has_value())
  241. return false;
  242. auto entries = hint_table_entries.value();
  243. dbgln("hint table entries size: {}", entries.size());
  244. for (auto& entry : entries)
  245. dbgln("{}", entry);
  246. return true;
  247. }
  248. bool Parser::initialize_non_linearized_xref_table()
  249. {
  250. m_reader.move_to(m_reader.bytes().size() - 1);
  251. if (!navigate_to_before_eof_marker())
  252. return false;
  253. if (!navigate_to_after_startxref())
  254. return false;
  255. if (m_reader.done())
  256. return false;
  257. m_reader.set_reading_forwards();
  258. auto xref_offset_value = parse_number();
  259. if (!xref_offset_value.has<int>())
  260. return false;
  261. auto xref_offset = xref_offset_value.get<int>();
  262. m_reader.move_to(xref_offset);
  263. m_xref_table = parse_xref_table();
  264. if (!m_xref_table)
  265. return false;
  266. m_trailer = parse_file_trailer();
  267. return m_trailer;
  268. }
  269. RefPtr<XRefTable> Parser::parse_xref_table()
  270. {
  271. if (!m_reader.matches("xref"))
  272. return {};
  273. m_reader.move_by(4);
  274. if (!consume_eol())
  275. return {};
  276. auto table = adopt_ref(*new XRefTable());
  277. while (true) {
  278. if (m_reader.matches("trailer"))
  279. return table;
  280. Vector<XRefEntry> entries;
  281. auto starting_index_value = parse_number();
  282. auto starting_index = starting_index_value.get<int>();
  283. auto object_count_value = parse_number();
  284. auto object_count = object_count_value.get<int>();
  285. for (int i = 0; i < object_count; i++) {
  286. auto offset_string = String(m_reader.bytes().slice(m_reader.offset(), 10));
  287. m_reader.move_by(10);
  288. if (!consume(' '))
  289. return {};
  290. auto generation_string = String(m_reader.bytes().slice(m_reader.offset(), 5));
  291. m_reader.move_by(5);
  292. if (!consume(' '))
  293. return {};
  294. auto letter = m_reader.read();
  295. if (letter != 'n' && letter != 'f')
  296. return {};
  297. // The line ending sequence can be one of the following:
  298. // SP CR, SP LF, or CR LF
  299. if (m_reader.matches(' ')) {
  300. consume();
  301. auto ch = consume();
  302. if (ch != '\r' && ch != '\n')
  303. return {};
  304. } else {
  305. if (!m_reader.matches("\r\n"))
  306. return {};
  307. m_reader.move_by(2);
  308. }
  309. auto offset = strtol(offset_string.characters(), nullptr, 10);
  310. auto generation = strtol(generation_string.characters(), nullptr, 10);
  311. entries.append({ offset, static_cast<u16>(generation), letter == 'n' });
  312. }
  313. table->add_section({ starting_index, object_count, entries });
  314. }
  315. }
  316. RefPtr<DictObject> Parser::parse_file_trailer()
  317. {
  318. if (!m_reader.matches("trailer"))
  319. return {};
  320. m_reader.move_by(7);
  321. consume_whitespace();
  322. auto dict = parse_dict();
  323. if (!dict)
  324. return {};
  325. if (!m_reader.matches("startxref"))
  326. return {};
  327. m_reader.move_by(9);
  328. consume_whitespace();
  329. m_reader.move_until([&](auto) { return matches_eol(); });
  330. VERIFY(consume_eol());
  331. if (!m_reader.matches("%%EOF"))
  332. return {};
  333. m_reader.move_by(5);
  334. consume_whitespace();
  335. return dict;
  336. }
  337. Optional<Parser::PageOffsetHintTable> Parser::parse_page_offset_hint_table(ReadonlyBytes hint_stream_bytes)
  338. {
  339. if (hint_stream_bytes.size() < sizeof(PageOffsetHintTable))
  340. return {};
  341. size_t offset = 0;
  342. auto read_u32 = [&] {
  343. u32 data = reinterpret_cast<const u32*>(hint_stream_bytes.data() + offset)[0];
  344. offset += 4;
  345. return AK::convert_between_host_and_big_endian(data);
  346. };
  347. auto read_u16 = [&] {
  348. u16 data = reinterpret_cast<const u16*>(hint_stream_bytes.data() + offset)[0];
  349. offset += 2;
  350. return AK::convert_between_host_and_big_endian(data);
  351. };
  352. PageOffsetHintTable hint_table {
  353. read_u32(),
  354. read_u32(),
  355. read_u16(),
  356. read_u32(),
  357. read_u16(),
  358. read_u32(),
  359. read_u16(),
  360. read_u32(),
  361. read_u16(),
  362. read_u16(),
  363. read_u16(),
  364. read_u16(),
  365. read_u16(),
  366. };
  367. // Verify that all of the bits_required_for_xyz fields are <= 32, since all of the numeric
  368. // fields in PageOffsetHintTableEntry are u32
  369. VERIFY(hint_table.bits_required_for_object_number <= 32);
  370. VERIFY(hint_table.bits_required_for_page_length <= 32);
  371. VERIFY(hint_table.bits_required_for_content_stream_offsets <= 32);
  372. VERIFY(hint_table.bits_required_for_content_stream_length <= 32);
  373. VERIFY(hint_table.bits_required_for_number_of_shared_obj_refs <= 32);
  374. VERIFY(hint_table.bits_required_for_greatest_shared_obj_identifier <= 32);
  375. VERIFY(hint_table.bits_required_for_fraction_numerator <= 32);
  376. return hint_table;
  377. }
  378. Optional<Vector<Parser::PageOffsetHintTableEntry>> Parser::parse_all_page_offset_hint_table_entries(PageOffsetHintTable const& hint_table, ReadonlyBytes hint_stream_bytes)
  379. {
  380. InputMemoryStream input_stream(hint_stream_bytes);
  381. input_stream.seek(sizeof(PageOffsetHintTable));
  382. if (input_stream.has_any_error())
  383. return {};
  384. InputBitStream bit_stream(input_stream);
  385. auto number_of_pages = m_linearization_dictionary.value().number_of_pages;
  386. Vector<PageOffsetHintTableEntry> entries;
  387. for (size_t i = 0; i < number_of_pages; i++)
  388. entries.append(PageOffsetHintTableEntry {});
  389. auto bits_required_for_object_number = hint_table.bits_required_for_object_number;
  390. auto bits_required_for_page_length = hint_table.bits_required_for_page_length;
  391. auto bits_required_for_content_stream_offsets = hint_table.bits_required_for_content_stream_offsets;
  392. auto bits_required_for_content_stream_length = hint_table.bits_required_for_content_stream_length;
  393. auto bits_required_for_number_of_shared_obj_refs = hint_table.bits_required_for_number_of_shared_obj_refs;
  394. auto bits_required_for_greatest_shared_obj_identifier = hint_table.bits_required_for_greatest_shared_obj_identifier;
  395. auto bits_required_for_fraction_numerator = hint_table.bits_required_for_fraction_numerator;
  396. auto parse_int_entry = [&](u32 PageOffsetHintTableEntry::*field, u32 bit_size) {
  397. if (bit_size <= 0)
  398. return;
  399. for (int i = 0; i < number_of_pages; i++) {
  400. auto& entry = entries[i];
  401. entry.*field = bit_stream.read_bits(bit_size);
  402. }
  403. };
  404. auto parse_vector_entry = [&](Vector<u32> PageOffsetHintTableEntry::*field, u32 bit_size) {
  405. if (bit_size <= 0)
  406. return;
  407. for (int page = 1; page < number_of_pages; page++) {
  408. auto number_of_shared_objects = entries[page].number_of_shared_objects;
  409. Vector<u32> items;
  410. items.ensure_capacity(number_of_shared_objects);
  411. for (size_t i = 0; i < number_of_shared_objects; i++)
  412. items.unchecked_append(bit_stream.read_bits(bit_size));
  413. entries[page].*field = move(items);
  414. }
  415. };
  416. parse_int_entry(&PageOffsetHintTableEntry::objects_in_page_number, bits_required_for_object_number);
  417. parse_int_entry(&PageOffsetHintTableEntry::page_length_number, bits_required_for_page_length);
  418. parse_int_entry(&PageOffsetHintTableEntry::number_of_shared_objects, bits_required_for_number_of_shared_obj_refs);
  419. parse_vector_entry(&PageOffsetHintTableEntry::shared_object_identifiers, bits_required_for_greatest_shared_obj_identifier);
  420. parse_vector_entry(&PageOffsetHintTableEntry::shared_object_location_numerators, bits_required_for_fraction_numerator);
  421. parse_int_entry(&PageOffsetHintTableEntry::page_content_stream_offset_number, bits_required_for_content_stream_offsets);
  422. parse_int_entry(&PageOffsetHintTableEntry::page_content_stream_length_number, bits_required_for_content_stream_length);
  423. return entries;
  424. }
  425. bool Parser::navigate_to_before_eof_marker()
  426. {
  427. m_reader.set_reading_backwards();
  428. while (!m_reader.done()) {
  429. m_reader.move_until([&](auto) { return matches_eol(); });
  430. if (m_reader.done())
  431. return false;
  432. consume_eol();
  433. if (!m_reader.matches("%%EOF"))
  434. continue;
  435. m_reader.move_by(5);
  436. if (!matches_eol())
  437. continue;
  438. consume_eol();
  439. return true;
  440. }
  441. return false;
  442. }
  443. bool Parser::navigate_to_after_startxref()
  444. {
  445. m_reader.set_reading_backwards();
  446. while (!m_reader.done()) {
  447. m_reader.move_until([&](auto) { return matches_eol(); });
  448. auto offset = m_reader.offset() + 1;
  449. consume_eol();
  450. if (!m_reader.matches("startxref"))
  451. continue;
  452. m_reader.move_by(9);
  453. if (!matches_eol())
  454. continue;
  455. m_reader.move_to(offset);
  456. return true;
  457. }
  458. return false;
  459. }
  460. bool Parser::sloppy_is_linearized()
  461. {
  462. ScopeGuard guard([&] {
  463. m_reader.move_to(0);
  464. m_reader.set_reading_forwards();
  465. });
  466. auto limit = min(1024ul, m_reader.bytes().size() - 1);
  467. m_reader.move_to(limit);
  468. m_reader.set_reading_backwards();
  469. while (!m_reader.done()) {
  470. m_reader.move_until('/');
  471. if (m_reader.matches("/Linearized"))
  472. return true;
  473. m_reader.move_by(1);
  474. }
  475. return false;
  476. }
  477. String Parser::parse_comment()
  478. {
  479. if (!m_reader.matches('%'))
  480. return {};
  481. consume();
  482. auto comment_start_offset = m_reader.offset();
  483. m_reader.move_until([&](auto) {
  484. return matches_eol();
  485. });
  486. String str = StringView(m_reader.bytes().slice(comment_start_offset, m_reader.offset() - comment_start_offset));
  487. consume_eol();
  488. consume_whitespace();
  489. return str;
  490. }
  491. Value Parser::parse_value()
  492. {
  493. parse_comment();
  494. if (m_reader.matches("null")) {
  495. m_reader.move_by(4);
  496. consume_whitespace();
  497. return Value(nullptr);
  498. }
  499. if (m_reader.matches("true")) {
  500. m_reader.move_by(4);
  501. consume_whitespace();
  502. return Value(true);
  503. }
  504. if (m_reader.matches("false")) {
  505. m_reader.move_by(5);
  506. consume_whitespace();
  507. return Value(false);
  508. }
  509. if (matches_number())
  510. return parse_possible_indirect_value_or_ref();
  511. if (m_reader.matches('/'))
  512. return parse_name();
  513. if (m_reader.matches("<<")) {
  514. auto dict = parse_dict();
  515. if (!dict)
  516. return {};
  517. if (m_reader.matches("stream"))
  518. return parse_stream(dict.release_nonnull());
  519. return dict;
  520. }
  521. if (m_reader.matches_any('(', '<'))
  522. return parse_string();
  523. if (m_reader.matches('['))
  524. return parse_array();
  525. dbgln("tried to parse value, but found char {} ({}) at offset {}", m_reader.peek(), static_cast<u8>(m_reader.peek()), m_reader.offset());
  526. VERIFY_NOT_REACHED();
  527. }
  528. Value Parser::parse_possible_indirect_value_or_ref()
  529. {
  530. auto first_number = parse_number();
  531. if (!first_number.has<int>() || !matches_number())
  532. return first_number;
  533. m_reader.save();
  534. auto second_number = parse_number();
  535. if (!second_number.has<int>()) {
  536. m_reader.load();
  537. return first_number;
  538. }
  539. if (m_reader.matches('R')) {
  540. m_reader.discard();
  541. consume();
  542. consume_whitespace();
  543. return Value(Reference(first_number.get<int>(), second_number.get<int>()));
  544. }
  545. if (m_reader.matches("obj")) {
  546. m_reader.discard();
  547. return parse_indirect_value(first_number.get<int>(), second_number.get<int>());
  548. }
  549. m_reader.load();
  550. return first_number;
  551. }
  552. RefPtr<IndirectValue> Parser::parse_indirect_value(int index, int generation)
  553. {
  554. if (!m_reader.matches("obj"))
  555. return {};
  556. m_reader.move_by(3);
  557. if (matches_eol())
  558. consume_eol();
  559. auto value = parse_value();
  560. if (!m_reader.matches("endobj"))
  561. return {};
  562. consume(6);
  563. consume_whitespace();
  564. return make_object<IndirectValue>(index, generation, value);
  565. }
  566. RefPtr<IndirectValue> Parser::parse_indirect_value()
  567. {
  568. auto first_number = parse_number();
  569. if (!first_number.has<int>())
  570. return {};
  571. auto second_number = parse_number();
  572. if (!second_number.has<int>())
  573. return {};
  574. return parse_indirect_value(first_number.get<int>(), second_number.get<int>());
  575. }
  576. Value Parser::parse_number()
  577. {
  578. size_t start_offset = m_reader.offset();
  579. bool is_float = false;
  580. if (m_reader.matches('+') || m_reader.matches('-'))
  581. consume();
  582. while (!m_reader.done()) {
  583. if (m_reader.matches('.')) {
  584. if (is_float)
  585. break;
  586. is_float = true;
  587. consume();
  588. } else if (isdigit(m_reader.peek())) {
  589. consume();
  590. } else {
  591. break;
  592. }
  593. }
  594. consume_whitespace();
  595. auto string = String(m_reader.bytes().slice(start_offset, m_reader.offset() - start_offset));
  596. float f = strtof(string.characters(), nullptr);
  597. if (is_float)
  598. return Value(f);
  599. VERIFY(floorf(f) == f);
  600. return Value(static_cast<int>(f));
  601. }
  602. RefPtr<NameObject> Parser::parse_name()
  603. {
  604. if (!consume('/'))
  605. return {};
  606. StringBuilder builder;
  607. while (true) {
  608. if (!matches_regular_character())
  609. break;
  610. if (m_reader.matches('#')) {
  611. int hex_value = 0;
  612. for (int i = 0; i < 2; i++) {
  613. auto ch = consume();
  614. if (!isxdigit(ch))
  615. return {};
  616. hex_value *= 16;
  617. if (ch <= '9') {
  618. hex_value += ch - '0';
  619. } else {
  620. hex_value += ch - 'A' + 10;
  621. }
  622. }
  623. builder.append(static_cast<char>(hex_value));
  624. continue;
  625. }
  626. builder.append(consume());
  627. }
  628. consume_whitespace();
  629. return make_object<NameObject>(builder.to_string());
  630. }
  631. RefPtr<StringObject> Parser::parse_string()
  632. {
  633. ScopeGuard guard([&] { consume_whitespace(); });
  634. String string;
  635. bool is_binary_string;
  636. if (m_reader.matches('(')) {
  637. string = parse_literal_string();
  638. is_binary_string = false;
  639. } else {
  640. string = parse_hex_string();
  641. is_binary_string = true;
  642. }
  643. if (string.is_null())
  644. return {};
  645. if (string.bytes().starts_with(Array<u8, 2> { 0xfe, 0xff })) {
  646. // The string is encoded in UTF16-BE
  647. string = TextCodec::decoder_for("utf-16be")->to_utf8(string.substring(2));
  648. } else if (string.bytes().starts_with(Array<u8, 3> { 239, 187, 191 })) {
  649. // The string is encoded in UTF-8. This is the default anyways, but if these bytes
  650. // are explicitly included, we have to trim them
  651. string = string.substring(3);
  652. }
  653. return make_object<StringObject>(string, is_binary_string);
  654. }
  655. String Parser::parse_literal_string()
  656. {
  657. if (!consume('('))
  658. return {};
  659. StringBuilder builder;
  660. auto opened_parens = 0;
  661. while (true) {
  662. if (m_reader.matches('(')) {
  663. opened_parens++;
  664. builder.append(consume());
  665. } else if (m_reader.matches(')')) {
  666. consume();
  667. if (opened_parens == 0)
  668. break;
  669. opened_parens--;
  670. builder.append(')');
  671. } else if (m_reader.matches('\\')) {
  672. consume();
  673. if (matches_eol()) {
  674. consume_eol();
  675. continue;
  676. }
  677. if (m_reader.done())
  678. return {};
  679. auto ch = consume();
  680. switch (ch) {
  681. case 'n':
  682. builder.append('\n');
  683. break;
  684. case 'r':
  685. builder.append('\r');
  686. break;
  687. case 't':
  688. builder.append('\t');
  689. break;
  690. case 'b':
  691. builder.append('\b');
  692. break;
  693. case 'f':
  694. builder.append('\f');
  695. break;
  696. case '(':
  697. builder.append('(');
  698. break;
  699. case ')':
  700. builder.append(')');
  701. break;
  702. case '\\':
  703. builder.append('\\');
  704. break;
  705. default: {
  706. if (ch >= '0' && ch <= '7') {
  707. int octal_value = ch - '0';
  708. for (int i = 0; i < 2; i++) {
  709. auto octal_ch = consume();
  710. if (octal_ch < '0' || octal_ch > '7')
  711. break;
  712. octal_value = octal_value * 8 + (octal_ch - '0');
  713. }
  714. builder.append(static_cast<char>(octal_value));
  715. } else {
  716. builder.append(ch);
  717. }
  718. }
  719. }
  720. } else if (matches_eol()) {
  721. consume_eol();
  722. builder.append('\n');
  723. } else {
  724. builder.append(consume());
  725. }
  726. }
  727. if (opened_parens != 0)
  728. return {};
  729. return builder.to_string();
  730. }
  731. String Parser::parse_hex_string()
  732. {
  733. if (!consume('<'))
  734. return {};
  735. StringBuilder builder;
  736. while (true) {
  737. if (m_reader.matches('>')) {
  738. consume();
  739. return builder.to_string();
  740. } else {
  741. int hex_value = 0;
  742. for (int i = 0; i < 2; i++) {
  743. auto ch = consume();
  744. if (ch == '>') {
  745. // The hex string contains an odd number of characters, and the last character
  746. // is assumed to be '0'
  747. consume();
  748. hex_value *= 16;
  749. builder.append(static_cast<char>(hex_value));
  750. return builder.to_string();
  751. }
  752. if (!isxdigit(ch))
  753. return {};
  754. hex_value *= 16;
  755. if (ch <= '9') {
  756. hex_value += ch - '0';
  757. } else {
  758. hex_value += ch - 'A' + 10;
  759. }
  760. }
  761. builder.append(static_cast<char>(hex_value));
  762. }
  763. }
  764. }
  765. RefPtr<ArrayObject> Parser::parse_array()
  766. {
  767. if (!consume('['))
  768. return {};
  769. consume_whitespace();
  770. Vector<Value> values;
  771. while (!m_reader.matches(']')) {
  772. auto value = parse_value();
  773. if (value.has<Empty>())
  774. return {};
  775. values.append(value);
  776. }
  777. if (!consume(']'))
  778. return {};
  779. consume_whitespace();
  780. return make_object<ArrayObject>(values);
  781. }
  782. RefPtr<DictObject> Parser::parse_dict()
  783. {
  784. if (!consume('<') || !consume('<'))
  785. return {};
  786. consume_whitespace();
  787. HashMap<FlyString, Value> map;
  788. while (true) {
  789. if (m_reader.matches(">>"))
  790. break;
  791. auto name = parse_name();
  792. if (!name)
  793. return {};
  794. auto value = parse_value();
  795. if (value.has<Empty>())
  796. return {};
  797. map.set(name->name(), value);
  798. }
  799. if (!consume('>') || !consume('>'))
  800. return {};
  801. consume_whitespace();
  802. return make_object<DictObject>(map);
  803. }
  804. RefPtr<DictObject> Parser::conditionally_parse_page_tree_node(u32 object_index, bool& ok)
  805. {
  806. ok = true;
  807. VERIFY(m_xref_table->has_object(object_index));
  808. auto byte_offset = m_xref_table->byte_offset_for_object(object_index);
  809. m_reader.move_to(byte_offset);
  810. parse_number();
  811. parse_number();
  812. if (!m_reader.matches("obj")) {
  813. ok = false;
  814. return {};
  815. }
  816. m_reader.move_by(3);
  817. consume_whitespace();
  818. if (!consume('<') || !consume('<'))
  819. return {};
  820. consume_whitespace();
  821. HashMap<FlyString, Value> map;
  822. while (true) {
  823. if (m_reader.matches(">>"))
  824. break;
  825. auto name = parse_name();
  826. if (!name) {
  827. ok = false;
  828. return {};
  829. }
  830. auto name_string = name->name();
  831. if (!name_string.is_one_of(CommonNames::Type, CommonNames::Parent, CommonNames::Kids, CommonNames::Count)) {
  832. // This is a page, not a page tree node
  833. return {};
  834. }
  835. auto value = parse_value();
  836. if (value.has<Empty>()) {
  837. ok = false;
  838. return {};
  839. }
  840. if (name_string == CommonNames::Type) {
  841. if (!value.has<NonnullRefPtr<Object>>())
  842. return {};
  843. auto type_object = value.get<NonnullRefPtr<Object>>();
  844. if (!type_object->is_name())
  845. return {};
  846. auto type_name = object_cast<NameObject>(type_object);
  847. if (type_name->name() != CommonNames::Pages)
  848. return {};
  849. }
  850. map.set(name->name(), value);
  851. }
  852. if (!consume('>') || !consume('>'))
  853. return {};
  854. consume_whitespace();
  855. return make_object<DictObject>(map);
  856. }
  857. RefPtr<StreamObject> Parser::parse_stream(NonnullRefPtr<DictObject> dict)
  858. {
  859. if (!m_reader.matches("stream"))
  860. return {};
  861. m_reader.move_by(6);
  862. if (!consume_eol())
  863. return {};
  864. ReadonlyBytes bytes;
  865. auto maybe_length = dict->get(CommonNames::Length);
  866. if (maybe_length.has_value() && (!maybe_length->has<Reference>() || m_xref_table)) {
  867. // The PDF writer has kindly provided us with the direct length of the stream
  868. m_reader.save();
  869. auto length = m_document->resolve_to<int>(maybe_length.value());
  870. m_reader.load();
  871. bytes = m_reader.bytes().slice(m_reader.offset(), length);
  872. m_reader.move_by(length);
  873. consume_whitespace();
  874. } else {
  875. // We have to look for the endstream keyword
  876. auto stream_start = m_reader.offset();
  877. while (true) {
  878. m_reader.move_until([&](auto) { return matches_eol(); });
  879. auto potential_stream_end = m_reader.offset();
  880. consume_eol();
  881. if (!m_reader.matches("endstream"))
  882. continue;
  883. bytes = m_reader.bytes().slice(stream_start, potential_stream_end - stream_start);
  884. break;
  885. }
  886. }
  887. m_reader.move_by(9);
  888. consume_whitespace();
  889. if (dict->contains(CommonNames::Filter)) {
  890. auto filter_type = dict->get_name(m_document, CommonNames::Filter)->name();
  891. auto maybe_bytes = Filter::decode(bytes, filter_type);
  892. if (!maybe_bytes.has_value())
  893. return {};
  894. return make_object<EncodedStreamObject>(dict, move(maybe_bytes.value()));
  895. }
  896. return make_object<PlainTextStreamObject>(dict, bytes);
  897. }
  898. Vector<Command> Parser::parse_graphics_commands()
  899. {
  900. Vector<Command> commands;
  901. Vector<Value> command_args;
  902. constexpr static auto is_command_char = [](char ch) {
  903. return isalpha(ch) || ch == '*' || ch == '\'';
  904. };
  905. while (!m_reader.done()) {
  906. auto ch = m_reader.peek();
  907. if (is_command_char(ch)) {
  908. auto command_start = m_reader.offset();
  909. while (is_command_char(ch)) {
  910. consume();
  911. if (m_reader.done())
  912. break;
  913. ch = m_reader.peek();
  914. }
  915. auto command_string = StringView(m_reader.bytes().slice(command_start, m_reader.offset() - command_start));
  916. auto command_type = Command::command_type_from_symbol(command_string);
  917. commands.append(Command(command_type, move(command_args)));
  918. command_args = Vector<Value>();
  919. consume_whitespace();
  920. continue;
  921. }
  922. command_args.append(parse_value());
  923. }
  924. return commands;
  925. }
  926. bool Parser::matches_eol() const
  927. {
  928. return m_reader.matches_any(0xa, 0xd);
  929. }
  930. bool Parser::matches_whitespace() const
  931. {
  932. return matches_eol() || m_reader.matches_any(0, 0x9, 0xc, ' ');
  933. }
  934. bool Parser::matches_number() const
  935. {
  936. if (m_reader.done())
  937. return false;
  938. auto ch = m_reader.peek();
  939. return isdigit(ch) || ch == '-' || ch == '+';
  940. }
  941. bool Parser::matches_delimiter() const
  942. {
  943. return m_reader.matches_any('(', ')', '<', '>', '[', ']', '{', '}', '/', '%');
  944. }
  945. bool Parser::matches_regular_character() const
  946. {
  947. return !matches_delimiter() && !matches_whitespace();
  948. }
  949. bool Parser::consume_eol()
  950. {
  951. if (m_reader.done()) {
  952. return false;
  953. }
  954. if (m_reader.matches("\r\n")) {
  955. consume(2);
  956. return true;
  957. }
  958. auto consumed = consume();
  959. return consumed == 0xd || consumed == 0xa;
  960. }
  961. bool Parser::consume_whitespace()
  962. {
  963. bool consumed = false;
  964. while (matches_whitespace()) {
  965. consumed = true;
  966. consume();
  967. }
  968. return consumed;
  969. }
  970. char Parser::consume()
  971. {
  972. return m_reader.read();
  973. }
  974. void Parser::consume(int amount)
  975. {
  976. for (size_t i = 0; i < static_cast<size_t>(amount); i++)
  977. consume();
  978. }
  979. bool Parser::consume(char ch)
  980. {
  981. return consume() == ch;
  982. }
  983. }
  984. namespace AK {
  985. template<>
  986. struct Formatter<PDF::Parser::LinearizationDictionary> : Formatter<StringView> {
  987. ErrorOr<void> format(FormatBuilder& format_builder, PDF::Parser::LinearizationDictionary const& dict)
  988. {
  989. StringBuilder builder;
  990. builder.append("{\n");
  991. builder.appendff(" length_of_file={}\n", dict.length_of_file);
  992. builder.appendff(" primary_hint_stream_offset={}\n", dict.primary_hint_stream_offset);
  993. builder.appendff(" primary_hint_stream_length={}\n", dict.primary_hint_stream_length);
  994. builder.appendff(" overflow_hint_stream_offset={}\n", dict.overflow_hint_stream_offset);
  995. builder.appendff(" overflow_hint_stream_length={}\n", dict.overflow_hint_stream_length);
  996. builder.appendff(" first_page_object_number={}\n", dict.first_page_object_number);
  997. builder.appendff(" offset_of_first_page_end={}\n", dict.offset_of_first_page_end);
  998. builder.appendff(" number_of_pages={}\n", dict.number_of_pages);
  999. builder.appendff(" offset_of_main_xref_table={}\n", dict.offset_of_main_xref_table);
  1000. builder.appendff(" first_page={}\n", dict.first_page);
  1001. builder.append('}');
  1002. return Formatter<StringView>::format(format_builder, builder.to_string());
  1003. }
  1004. };
  1005. template<>
  1006. struct Formatter<PDF::Parser::PageOffsetHintTable> : Formatter<StringView> {
  1007. ErrorOr<void> format(FormatBuilder& format_builder, PDF::Parser::PageOffsetHintTable const& table)
  1008. {
  1009. StringBuilder builder;
  1010. builder.append("{\n");
  1011. builder.appendff(" least_number_of_objects_in_a_page={}\n", table.least_number_of_objects_in_a_page);
  1012. builder.appendff(" location_of_first_page_object={}\n", table.location_of_first_page_object);
  1013. builder.appendff(" bits_required_for_object_number={}\n", table.bits_required_for_object_number);
  1014. builder.appendff(" least_length_of_a_page={}\n", table.least_length_of_a_page);
  1015. builder.appendff(" bits_required_for_page_length={}\n", table.bits_required_for_page_length);
  1016. builder.appendff(" least_offset_of_any_content_stream={}\n", table.least_offset_of_any_content_stream);
  1017. builder.appendff(" bits_required_for_content_stream_offsets={}\n", table.bits_required_for_content_stream_offsets);
  1018. builder.appendff(" least_content_stream_length={}\n", table.least_content_stream_length);
  1019. builder.appendff(" bits_required_for_content_stream_length={}\n", table.bits_required_for_content_stream_length);
  1020. builder.appendff(" bits_required_for_number_of_shared_obj_refs={}\n", table.bits_required_for_number_of_shared_obj_refs);
  1021. builder.appendff(" bits_required_for_greatest_shared_obj_identifier={}\n", table.bits_required_for_greatest_shared_obj_identifier);
  1022. builder.appendff(" bits_required_for_fraction_numerator={}\n", table.bits_required_for_fraction_numerator);
  1023. builder.appendff(" shared_object_reference_fraction_denominator={}\n", table.shared_object_reference_fraction_denominator);
  1024. builder.append('}');
  1025. return Formatter<StringView>::format(format_builder, builder.to_string());
  1026. }
  1027. };
  1028. template<>
  1029. struct Formatter<PDF::Parser::PageOffsetHintTableEntry> : Formatter<StringView> {
  1030. ErrorOr<void> format(FormatBuilder& format_builder, PDF::Parser::PageOffsetHintTableEntry const& entry)
  1031. {
  1032. StringBuilder builder;
  1033. builder.append("{\n");
  1034. builder.appendff(" objects_in_page_number={}\n", entry.objects_in_page_number);
  1035. builder.appendff(" page_length_number={}\n", entry.page_length_number);
  1036. builder.appendff(" number_of_shared_objects={}\n", entry.number_of_shared_objects);
  1037. builder.append(" shared_object_identifiers=[");
  1038. for (auto& identifier : entry.shared_object_identifiers)
  1039. builder.appendff(" {}", identifier);
  1040. builder.append(" ]\n");
  1041. builder.append(" shared_object_location_numerators=[");
  1042. for (auto& numerator : entry.shared_object_location_numerators)
  1043. builder.appendff(" {}", numerator);
  1044. builder.append(" ]\n");
  1045. builder.appendff(" page_content_stream_offset_number={}\n", entry.page_content_stream_offset_number);
  1046. builder.appendff(" page_content_stream_length_number={}\n", entry.page_content_stream_length_number);
  1047. builder.append('}');
  1048. return Formatter<StringView>::format(format_builder, builder.to_string());
  1049. }
  1050. };
  1051. }