Parser.cpp 38 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237
  1. /*
  2. * Copyright (c) 2021-2022, Matthew Olsson <mattco@serenityos.org>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <AK/BitStream.h>
  7. #include <AK/MemoryStream.h>
  8. #include <AK/ScopeGuard.h>
  9. #include <LibPDF/CommonNames.h>
  10. #include <LibPDF/Document.h>
  11. #include <LibPDF/Filter.h>
  12. #include <LibPDF/Parser.h>
  13. #include <LibTextCodec/Decoder.h>
  14. #include <ctype.h>
  15. namespace PDF {
  16. template<typename T, typename... Args>
  17. static NonnullRefPtr<T> make_object(Args... args) requires(IsBaseOf<Object, T>)
  18. {
  19. return adopt_ref(*new T(forward<Args>(args)...));
  20. }
  21. Vector<Command> Parser::parse_graphics_commands(ReadonlyBytes bytes)
  22. {
  23. auto parser = adopt_ref(*new Parser(bytes));
  24. return parser->parse_graphics_commands();
  25. }
  26. Parser::Parser(Badge<Document>, ReadonlyBytes bytes)
  27. : m_reader(bytes)
  28. {
  29. }
  30. Parser::Parser(ReadonlyBytes bytes)
  31. : m_reader(bytes)
  32. {
  33. }
  34. void Parser::set_document(WeakPtr<Document> const& document)
  35. {
  36. m_document = document;
  37. }
  38. bool Parser::initialize()
  39. {
  40. if (!parse_header())
  41. return {};
  42. const auto result = initialize_linearization_dict();
  43. if (result == LinearizationResult::Error)
  44. return {};
  45. if (result == LinearizationResult::NotLinearized)
  46. return initialize_non_linearized_xref_table();
  47. bool is_linearized = m_linearization_dictionary.has_value();
  48. if (is_linearized) {
  49. // The file may have been linearized at one point, but could have been updated afterwards,
  50. // which means it is no longer a linearized PDF file.
  51. is_linearized = m_linearization_dictionary.value().length_of_file == m_reader.bytes().size();
  52. if (!is_linearized) {
  53. // FIXME: The file shouldn't be treated as linearized, yet the xref tables are still
  54. // split. This might take some tweaking to ensure correct behavior, which can be
  55. // implemented later.
  56. TODO();
  57. }
  58. }
  59. if (is_linearized)
  60. return initialize_linearized_xref_table();
  61. return initialize_non_linearized_xref_table();
  62. }
  63. Value Parser::parse_object_with_index(u32 index)
  64. {
  65. VERIFY(m_xref_table->has_object(index));
  66. auto byte_offset = m_xref_table->byte_offset_for_object(index);
  67. m_reader.move_to(byte_offset);
  68. auto indirect_value = parse_indirect_value();
  69. VERIFY(indirect_value);
  70. VERIFY(indirect_value->index() == index);
  71. return indirect_value->value();
  72. }
  73. bool Parser::parse_header()
  74. {
  75. // FIXME: Do something with the version?
  76. m_reader.set_reading_forwards();
  77. if (m_reader.remaining() == 0)
  78. return false;
  79. m_reader.move_to(0);
  80. if (m_reader.remaining() < 8 || !m_reader.matches("%PDF-"))
  81. return false;
  82. m_reader.move_by(5);
  83. char major_ver = m_reader.read();
  84. if (major_ver != '1' && major_ver != '2')
  85. return false;
  86. if (m_reader.read() != '.')
  87. return false;
  88. char minor_ver = m_reader.read();
  89. if (minor_ver < '0' || minor_ver > '7')
  90. return false;
  91. consume_eol();
  92. // Parse optional high-byte comment, which signifies a binary file
  93. // FIXME: Do something with this?
  94. auto comment = parse_comment();
  95. if (!comment.is_empty()) {
  96. auto binary = comment.length() >= 4;
  97. if (binary) {
  98. for (size_t i = 0; i < comment.length() && binary; i++)
  99. binary = static_cast<u8>(comment[i]) > 128;
  100. }
  101. }
  102. return true;
  103. }
  104. Parser::LinearizationResult Parser::initialize_linearization_dict()
  105. {
  106. // parse_header() is called immediately before this, so we are at the right location
  107. auto dict_value = m_document->resolve(parse_indirect_value());
  108. if (!dict_value.has<NonnullRefPtr<Object>>())
  109. return LinearizationResult::Error;
  110. auto dict_object = dict_value.get<NonnullRefPtr<Object>>();
  111. if (!dict_object->is_dict())
  112. return LinearizationResult::NotLinearized;
  113. auto dict = object_cast<DictObject>(dict_object);
  114. if (!dict->contains(CommonNames::Linearized))
  115. return LinearizationResult::NotLinearized;
  116. if (!dict->contains(CommonNames::L, CommonNames::H, CommonNames::O, CommonNames::E, CommonNames::N, CommonNames::T))
  117. return LinearizationResult::Error;
  118. auto length_of_file = dict->get_value(CommonNames::L);
  119. auto hint_table = dict->get_value(CommonNames::H);
  120. auto first_page_object_number = dict->get_value(CommonNames::O);
  121. auto offset_of_first_page_end = dict->get_value(CommonNames::E);
  122. auto number_of_pages = dict->get_value(CommonNames::N);
  123. auto offset_of_main_xref_table = dict->get_value(CommonNames::T);
  124. auto first_page = dict->get(CommonNames::P).value_or({});
  125. // Validation
  126. if (!length_of_file.has_u32()
  127. || !hint_table.has<NonnullRefPtr<Object>>()
  128. || !first_page_object_number.has_u32()
  129. || !number_of_pages.has_u16()
  130. || !offset_of_main_xref_table.has_u32()
  131. || (!first_page.has<Empty>() && !first_page.has_u32())) {
  132. return LinearizationResult::Error;
  133. }
  134. auto hint_table_object = hint_table.get<NonnullRefPtr<Object>>();
  135. if (!hint_table_object->is_array())
  136. return LinearizationResult::Error;
  137. auto hint_table_array = object_cast<ArrayObject>(hint_table_object);
  138. auto hint_table_size = hint_table_array->size();
  139. if (hint_table_size != 2 && hint_table_size != 4)
  140. return LinearizationResult::Error;
  141. auto primary_hint_stream_offset = hint_table_array->at(0);
  142. auto primary_hint_stream_length = hint_table_array->at(1);
  143. Value overflow_hint_stream_offset;
  144. Value overflow_hint_stream_length;
  145. if (hint_table_size == 4) {
  146. overflow_hint_stream_offset = hint_table_array->at(2);
  147. overflow_hint_stream_length = hint_table_array->at(3);
  148. }
  149. if (!primary_hint_stream_offset.has_u32()
  150. || !primary_hint_stream_length.has_u32()
  151. || (!overflow_hint_stream_offset.has<Empty>() && !overflow_hint_stream_offset.has_u32())
  152. || (!overflow_hint_stream_length.has<Empty>() && !overflow_hint_stream_length.has_u32())) {
  153. return LinearizationResult::Error;
  154. }
  155. m_linearization_dictionary = LinearizationDictionary {
  156. length_of_file.get_u32(),
  157. primary_hint_stream_offset.get_u32(),
  158. primary_hint_stream_length.get_u32(),
  159. overflow_hint_stream_offset.has<Empty>() ? NumericLimits<u32>::max() : overflow_hint_stream_offset.get_u32(),
  160. overflow_hint_stream_length.has<Empty>() ? NumericLimits<u32>::max() : overflow_hint_stream_length.get_u32(),
  161. first_page_object_number.get_u32(),
  162. offset_of_first_page_end.get_u32(),
  163. number_of_pages.get_u16(),
  164. offset_of_main_xref_table.get_u32(),
  165. first_page.has<Empty>() ? NumericLimits<u32>::max() : first_page.get_u32(),
  166. };
  167. return LinearizationResult::Linearized;
  168. }
  169. bool Parser::initialize_linearized_xref_table()
  170. {
  171. // The linearization parameter dictionary has just been parsed, and the xref table
  172. // comes immediately after it. We are in the correct spot.
  173. if (!m_reader.matches("xref"))
  174. return false;
  175. m_xref_table = parse_xref_table();
  176. if (!m_xref_table)
  177. return false;
  178. m_trailer = parse_file_trailer();
  179. if (!m_trailer)
  180. return false;
  181. // Also parse the main xref table and merge into the first-page xref table. Note
  182. // that we don't use the main xref table offset from the linearization dict because
  183. // for some reason, it specified the offset of the whitespace after the object
  184. // index start and length? So it's much easier to do it this way.
  185. auto main_xref_table_offset = m_trailer->get_value(CommonNames::Prev).to_int();
  186. m_reader.move_to(main_xref_table_offset);
  187. auto main_xref_table = parse_xref_table();
  188. if (!main_xref_table)
  189. return false;
  190. return m_xref_table->merge(move(*main_xref_table));
  191. }
  192. bool Parser::initialize_hint_tables()
  193. {
  194. auto linearization_dict = m_linearization_dictionary.value();
  195. auto primary_offset = linearization_dict.primary_hint_stream_offset;
  196. auto overflow_offset = linearization_dict.overflow_hint_stream_offset;
  197. auto parse_hint_table = [&](size_t offset) -> RefPtr<StreamObject> {
  198. m_reader.move_to(offset);
  199. auto stream_indirect_value = parse_indirect_value();
  200. if (!stream_indirect_value)
  201. return {};
  202. auto stream_value = stream_indirect_value->value();
  203. if (!stream_value.has<NonnullRefPtr<Object>>())
  204. return {};
  205. auto stream_object = stream_value.get<NonnullRefPtr<Object>>();
  206. if (!stream_object->is_stream())
  207. return {};
  208. return object_cast<StreamObject>(stream_object);
  209. };
  210. auto primary_hint_stream = parse_hint_table(primary_offset);
  211. if (!primary_hint_stream)
  212. return false;
  213. RefPtr<StreamObject> overflow_hint_stream;
  214. if (overflow_offset != NumericLimits<u32>::max())
  215. overflow_hint_stream = parse_hint_table(overflow_offset);
  216. ByteBuffer possible_merged_stream_buffer;
  217. ReadonlyBytes hint_stream_bytes;
  218. if (overflow_hint_stream) {
  219. auto primary_size = primary_hint_stream->bytes().size();
  220. auto overflow_size = overflow_hint_stream->bytes().size();
  221. auto total_size = primary_size + overflow_size;
  222. auto buffer_result = ByteBuffer::create_uninitialized(total_size);
  223. if (buffer_result.is_error())
  224. return false;
  225. possible_merged_stream_buffer = buffer_result.release_value();
  226. auto ok = !possible_merged_stream_buffer.try_append(primary_hint_stream->bytes()).is_error();
  227. ok = ok && !possible_merged_stream_buffer.try_append(overflow_hint_stream->bytes()).is_error();
  228. if (!ok)
  229. return false;
  230. hint_stream_bytes = possible_merged_stream_buffer.bytes();
  231. } else {
  232. hint_stream_bytes = primary_hint_stream->bytes();
  233. }
  234. auto hint_table = parse_page_offset_hint_table(hint_stream_bytes);
  235. if (!hint_table.has_value())
  236. return false;
  237. dbgln("hint table: {}", hint_table.value());
  238. auto hint_table_entries = parse_all_page_offset_hint_table_entries(hint_table.value(), hint_stream_bytes);
  239. if (!hint_table_entries.has_value())
  240. return false;
  241. auto entries = hint_table_entries.value();
  242. dbgln("hint table entries size: {}", entries.size());
  243. for (auto& entry : entries)
  244. dbgln("{}", entry);
  245. return true;
  246. }
  247. bool Parser::initialize_non_linearized_xref_table()
  248. {
  249. m_reader.move_to(m_reader.bytes().size() - 1);
  250. if (!navigate_to_before_eof_marker())
  251. return false;
  252. if (!navigate_to_after_startxref())
  253. return false;
  254. if (m_reader.done())
  255. return false;
  256. m_reader.set_reading_forwards();
  257. auto xref_offset_value = parse_number();
  258. if (!xref_offset_value.has<int>())
  259. return false;
  260. auto xref_offset = xref_offset_value.get<int>();
  261. m_reader.move_to(xref_offset);
  262. m_xref_table = parse_xref_table();
  263. if (!m_xref_table)
  264. return false;
  265. m_trailer = parse_file_trailer();
  266. return m_trailer;
  267. }
  268. RefPtr<XRefTable> Parser::parse_xref_table()
  269. {
  270. if (!m_reader.matches("xref"))
  271. return {};
  272. m_reader.move_by(4);
  273. if (!consume_eol())
  274. return {};
  275. auto table = adopt_ref(*new XRefTable());
  276. while (true) {
  277. if (m_reader.matches("trailer"))
  278. return table;
  279. Vector<XRefEntry> entries;
  280. auto starting_index_value = parse_number();
  281. auto starting_index = starting_index_value.get<int>();
  282. auto object_count_value = parse_number();
  283. auto object_count = object_count_value.get<int>();
  284. for (int i = 0; i < object_count; i++) {
  285. auto offset_string = String(m_reader.bytes().slice(m_reader.offset(), 10));
  286. m_reader.move_by(10);
  287. if (!consume(' '))
  288. return {};
  289. auto generation_string = String(m_reader.bytes().slice(m_reader.offset(), 5));
  290. m_reader.move_by(5);
  291. if (!consume(' '))
  292. return {};
  293. auto letter = m_reader.read();
  294. if (letter != 'n' && letter != 'f')
  295. return {};
  296. // The line ending sequence can be one of the following:
  297. // SP CR, SP LF, or CR LF
  298. if (m_reader.matches(' ')) {
  299. consume();
  300. auto ch = consume();
  301. if (ch != '\r' && ch != '\n')
  302. return {};
  303. } else {
  304. if (!m_reader.matches("\r\n"))
  305. return {};
  306. m_reader.move_by(2);
  307. }
  308. auto offset = strtol(offset_string.characters(), nullptr, 10);
  309. auto generation = strtol(generation_string.characters(), nullptr, 10);
  310. entries.append({ offset, static_cast<u16>(generation), letter == 'n' });
  311. }
  312. table->add_section({ starting_index, object_count, entries });
  313. }
  314. }
  315. RefPtr<DictObject> Parser::parse_file_trailer()
  316. {
  317. if (!m_reader.matches("trailer"))
  318. return {};
  319. m_reader.move_by(7);
  320. consume_whitespace();
  321. auto dict = parse_dict();
  322. if (!dict)
  323. return {};
  324. if (!m_reader.matches("startxref"))
  325. return {};
  326. m_reader.move_by(9);
  327. consume_whitespace();
  328. m_reader.move_until([&](auto) { return matches_eol(); });
  329. VERIFY(consume_eol());
  330. if (!m_reader.matches("%%EOF"))
  331. return {};
  332. m_reader.move_by(5);
  333. consume_whitespace();
  334. return dict;
  335. }
  336. Optional<Parser::PageOffsetHintTable> Parser::parse_page_offset_hint_table(ReadonlyBytes hint_stream_bytes)
  337. {
  338. if (hint_stream_bytes.size() < sizeof(PageOffsetHintTable))
  339. return {};
  340. size_t offset = 0;
  341. auto read_u32 = [&] {
  342. u32 data = reinterpret_cast<const u32*>(hint_stream_bytes.data() + offset)[0];
  343. offset += 4;
  344. return AK::convert_between_host_and_big_endian(data);
  345. };
  346. auto read_u16 = [&] {
  347. u16 data = reinterpret_cast<const u16*>(hint_stream_bytes.data() + offset)[0];
  348. offset += 2;
  349. return AK::convert_between_host_and_big_endian(data);
  350. };
  351. PageOffsetHintTable hint_table {
  352. read_u32(),
  353. read_u32(),
  354. read_u16(),
  355. read_u32(),
  356. read_u16(),
  357. read_u32(),
  358. read_u16(),
  359. read_u32(),
  360. read_u16(),
  361. read_u16(),
  362. read_u16(),
  363. read_u16(),
  364. read_u16(),
  365. };
  366. // Verify that all of the bits_required_for_xyz fields are <= 32, since all of the numeric
  367. // fields in PageOffsetHintTableEntry are u32
  368. VERIFY(hint_table.bits_required_for_object_number <= 32);
  369. VERIFY(hint_table.bits_required_for_page_length <= 32);
  370. VERIFY(hint_table.bits_required_for_content_stream_offsets <= 32);
  371. VERIFY(hint_table.bits_required_for_content_stream_length <= 32);
  372. VERIFY(hint_table.bits_required_for_number_of_shared_obj_refs <= 32);
  373. VERIFY(hint_table.bits_required_for_greatest_shared_obj_identifier <= 32);
  374. VERIFY(hint_table.bits_required_for_fraction_numerator <= 32);
  375. return hint_table;
  376. }
  377. Optional<Vector<Parser::PageOffsetHintTableEntry>> Parser::parse_all_page_offset_hint_table_entries(PageOffsetHintTable const& hint_table, ReadonlyBytes hint_stream_bytes)
  378. {
  379. InputMemoryStream input_stream(hint_stream_bytes);
  380. input_stream.seek(sizeof(PageOffsetHintTable));
  381. if (input_stream.has_any_error())
  382. return {};
  383. InputBitStream bit_stream(input_stream);
  384. auto number_of_pages = m_linearization_dictionary.value().number_of_pages;
  385. Vector<PageOffsetHintTableEntry> entries;
  386. for (size_t i = 0; i < number_of_pages; i++)
  387. entries.append(PageOffsetHintTableEntry {});
  388. auto bits_required_for_object_number = hint_table.bits_required_for_object_number;
  389. auto bits_required_for_page_length = hint_table.bits_required_for_page_length;
  390. auto bits_required_for_content_stream_offsets = hint_table.bits_required_for_content_stream_offsets;
  391. auto bits_required_for_content_stream_length = hint_table.bits_required_for_content_stream_length;
  392. auto bits_required_for_number_of_shared_obj_refs = hint_table.bits_required_for_number_of_shared_obj_refs;
  393. auto bits_required_for_greatest_shared_obj_identifier = hint_table.bits_required_for_greatest_shared_obj_identifier;
  394. auto bits_required_for_fraction_numerator = hint_table.bits_required_for_fraction_numerator;
  395. auto parse_int_entry = [&](u32 PageOffsetHintTableEntry::*field, u32 bit_size) {
  396. if (bit_size <= 0)
  397. return;
  398. for (int i = 0; i < number_of_pages; i++) {
  399. auto& entry = entries[i];
  400. entry.*field = bit_stream.read_bits(bit_size);
  401. }
  402. };
  403. auto parse_vector_entry = [&](Vector<u32> PageOffsetHintTableEntry::*field, u32 bit_size) {
  404. if (bit_size <= 0)
  405. return;
  406. for (int page = 1; page < number_of_pages; page++) {
  407. auto number_of_shared_objects = entries[page].number_of_shared_objects;
  408. Vector<u32> items;
  409. items.ensure_capacity(number_of_shared_objects);
  410. for (size_t i = 0; i < number_of_shared_objects; i++)
  411. items.unchecked_append(bit_stream.read_bits(bit_size));
  412. entries[page].*field = move(items);
  413. }
  414. };
  415. parse_int_entry(&PageOffsetHintTableEntry::objects_in_page_number, bits_required_for_object_number);
  416. parse_int_entry(&PageOffsetHintTableEntry::page_length_number, bits_required_for_page_length);
  417. parse_int_entry(&PageOffsetHintTableEntry::number_of_shared_objects, bits_required_for_number_of_shared_obj_refs);
  418. parse_vector_entry(&PageOffsetHintTableEntry::shared_object_identifiers, bits_required_for_greatest_shared_obj_identifier);
  419. parse_vector_entry(&PageOffsetHintTableEntry::shared_object_location_numerators, bits_required_for_fraction_numerator);
  420. parse_int_entry(&PageOffsetHintTableEntry::page_content_stream_offset_number, bits_required_for_content_stream_offsets);
  421. parse_int_entry(&PageOffsetHintTableEntry::page_content_stream_length_number, bits_required_for_content_stream_length);
  422. return entries;
  423. }
  424. bool Parser::navigate_to_before_eof_marker()
  425. {
  426. m_reader.set_reading_backwards();
  427. while (!m_reader.done()) {
  428. m_reader.move_until([&](auto) { return matches_eol(); });
  429. if (m_reader.done())
  430. return false;
  431. consume_eol();
  432. if (!m_reader.matches("%%EOF"))
  433. continue;
  434. m_reader.move_by(5);
  435. if (!matches_eol())
  436. continue;
  437. consume_eol();
  438. return true;
  439. }
  440. return false;
  441. }
  442. bool Parser::navigate_to_after_startxref()
  443. {
  444. m_reader.set_reading_backwards();
  445. while (!m_reader.done()) {
  446. m_reader.move_until([&](auto) { return matches_eol(); });
  447. auto offset = m_reader.offset() + 1;
  448. consume_eol();
  449. if (!m_reader.matches("startxref"))
  450. continue;
  451. m_reader.move_by(9);
  452. if (!matches_eol())
  453. continue;
  454. m_reader.move_to(offset);
  455. return true;
  456. }
  457. return false;
  458. }
  459. String Parser::parse_comment()
  460. {
  461. if (!m_reader.matches('%'))
  462. return {};
  463. consume();
  464. auto comment_start_offset = m_reader.offset();
  465. m_reader.move_until([&](auto) {
  466. return matches_eol();
  467. });
  468. String str = StringView(m_reader.bytes().slice(comment_start_offset, m_reader.offset() - comment_start_offset));
  469. consume_eol();
  470. consume_whitespace();
  471. return str;
  472. }
  473. Value Parser::parse_value()
  474. {
  475. parse_comment();
  476. if (m_reader.matches("null")) {
  477. m_reader.move_by(4);
  478. consume_whitespace();
  479. return Value(nullptr);
  480. }
  481. if (m_reader.matches("true")) {
  482. m_reader.move_by(4);
  483. consume_whitespace();
  484. return Value(true);
  485. }
  486. if (m_reader.matches("false")) {
  487. m_reader.move_by(5);
  488. consume_whitespace();
  489. return Value(false);
  490. }
  491. if (matches_number())
  492. return parse_possible_indirect_value_or_ref();
  493. if (m_reader.matches('/'))
  494. return parse_name();
  495. if (m_reader.matches("<<")) {
  496. auto dict = parse_dict();
  497. if (!dict)
  498. return {};
  499. if (m_reader.matches("stream"))
  500. return parse_stream(dict.release_nonnull());
  501. return dict;
  502. }
  503. if (m_reader.matches_any('(', '<'))
  504. return parse_string();
  505. if (m_reader.matches('['))
  506. return parse_array();
  507. dbgln("tried to parse value, but found char {} ({}) at offset {}", m_reader.peek(), static_cast<u8>(m_reader.peek()), m_reader.offset());
  508. VERIFY_NOT_REACHED();
  509. }
  510. Value Parser::parse_possible_indirect_value_or_ref()
  511. {
  512. auto first_number = parse_number();
  513. if (!first_number.has<int>() || !matches_number())
  514. return first_number;
  515. m_reader.save();
  516. auto second_number = parse_number();
  517. if (!second_number.has<int>()) {
  518. m_reader.load();
  519. return first_number;
  520. }
  521. if (m_reader.matches('R')) {
  522. m_reader.discard();
  523. consume();
  524. consume_whitespace();
  525. return Value(Reference(first_number.get<int>(), second_number.get<int>()));
  526. }
  527. if (m_reader.matches("obj")) {
  528. m_reader.discard();
  529. return parse_indirect_value(first_number.get<int>(), second_number.get<int>());
  530. }
  531. m_reader.load();
  532. return first_number;
  533. }
  534. RefPtr<IndirectValue> Parser::parse_indirect_value(int index, int generation)
  535. {
  536. if (!m_reader.matches("obj"))
  537. return {};
  538. m_reader.move_by(3);
  539. if (matches_eol())
  540. consume_eol();
  541. auto value = parse_value();
  542. if (!m_reader.matches("endobj"))
  543. return {};
  544. consume(6);
  545. consume_whitespace();
  546. return make_object<IndirectValue>(index, generation, value);
  547. }
  548. RefPtr<IndirectValue> Parser::parse_indirect_value()
  549. {
  550. auto first_number = parse_number();
  551. if (!first_number.has<int>())
  552. return {};
  553. auto second_number = parse_number();
  554. if (!second_number.has<int>())
  555. return {};
  556. return parse_indirect_value(first_number.get<int>(), second_number.get<int>());
  557. }
  558. Value Parser::parse_number()
  559. {
  560. size_t start_offset = m_reader.offset();
  561. bool is_float = false;
  562. if (m_reader.matches('+') || m_reader.matches('-'))
  563. consume();
  564. while (!m_reader.done()) {
  565. if (m_reader.matches('.')) {
  566. if (is_float)
  567. break;
  568. is_float = true;
  569. consume();
  570. } else if (isdigit(m_reader.peek())) {
  571. consume();
  572. } else {
  573. break;
  574. }
  575. }
  576. consume_whitespace();
  577. auto string = String(m_reader.bytes().slice(start_offset, m_reader.offset() - start_offset));
  578. float f = strtof(string.characters(), nullptr);
  579. if (is_float)
  580. return Value(f);
  581. VERIFY(floorf(f) == f);
  582. return Value(static_cast<int>(f));
  583. }
  584. RefPtr<NameObject> Parser::parse_name()
  585. {
  586. if (!consume('/'))
  587. return {};
  588. StringBuilder builder;
  589. while (true) {
  590. if (!matches_regular_character())
  591. break;
  592. if (m_reader.matches('#')) {
  593. int hex_value = 0;
  594. for (int i = 0; i < 2; i++) {
  595. auto ch = consume();
  596. if (!isxdigit(ch))
  597. return {};
  598. hex_value *= 16;
  599. if (ch <= '9') {
  600. hex_value += ch - '0';
  601. } else {
  602. hex_value += ch - 'A' + 10;
  603. }
  604. }
  605. builder.append(static_cast<char>(hex_value));
  606. continue;
  607. }
  608. builder.append(consume());
  609. }
  610. consume_whitespace();
  611. return make_object<NameObject>(builder.to_string());
  612. }
  613. RefPtr<StringObject> Parser::parse_string()
  614. {
  615. ScopeGuard guard([&] { consume_whitespace(); });
  616. String string;
  617. bool is_binary_string;
  618. if (m_reader.matches('(')) {
  619. string = parse_literal_string();
  620. is_binary_string = false;
  621. } else {
  622. string = parse_hex_string();
  623. is_binary_string = true;
  624. }
  625. if (string.is_null())
  626. return {};
  627. if (string.bytes().starts_with(Array<u8, 2> { 0xfe, 0xff })) {
  628. // The string is encoded in UTF16-BE
  629. string = TextCodec::decoder_for("utf-16be")->to_utf8(string.substring(2));
  630. } else if (string.bytes().starts_with(Array<u8, 3> { 239, 187, 191 })) {
  631. // The string is encoded in UTF-8. This is the default anyways, but if these bytes
  632. // are explicitly included, we have to trim them
  633. string = string.substring(3);
  634. }
  635. return make_object<StringObject>(string, is_binary_string);
  636. }
  637. String Parser::parse_literal_string()
  638. {
  639. if (!consume('('))
  640. return {};
  641. StringBuilder builder;
  642. auto opened_parens = 0;
  643. while (true) {
  644. if (m_reader.matches('(')) {
  645. opened_parens++;
  646. builder.append(consume());
  647. } else if (m_reader.matches(')')) {
  648. consume();
  649. if (opened_parens == 0)
  650. break;
  651. opened_parens--;
  652. builder.append(')');
  653. } else if (m_reader.matches('\\')) {
  654. consume();
  655. if (matches_eol()) {
  656. consume_eol();
  657. continue;
  658. }
  659. if (m_reader.done())
  660. return {};
  661. auto ch = consume();
  662. switch (ch) {
  663. case 'n':
  664. builder.append('\n');
  665. break;
  666. case 'r':
  667. builder.append('\r');
  668. break;
  669. case 't':
  670. builder.append('\t');
  671. break;
  672. case 'b':
  673. builder.append('\b');
  674. break;
  675. case 'f':
  676. builder.append('\f');
  677. break;
  678. case '(':
  679. builder.append('(');
  680. break;
  681. case ')':
  682. builder.append(')');
  683. break;
  684. case '\\':
  685. builder.append('\\');
  686. break;
  687. default: {
  688. if (ch >= '0' && ch <= '7') {
  689. int octal_value = ch - '0';
  690. for (int i = 0; i < 2; i++) {
  691. auto octal_ch = consume();
  692. if (octal_ch < '0' || octal_ch > '7')
  693. break;
  694. octal_value = octal_value * 8 + (octal_ch - '0');
  695. }
  696. builder.append(static_cast<char>(octal_value));
  697. } else {
  698. builder.append(ch);
  699. }
  700. }
  701. }
  702. } else if (matches_eol()) {
  703. consume_eol();
  704. builder.append('\n');
  705. } else {
  706. builder.append(consume());
  707. }
  708. }
  709. if (opened_parens != 0)
  710. return {};
  711. return builder.to_string();
  712. }
  713. String Parser::parse_hex_string()
  714. {
  715. if (!consume('<'))
  716. return {};
  717. StringBuilder builder;
  718. while (true) {
  719. if (m_reader.matches('>')) {
  720. consume();
  721. return builder.to_string();
  722. } else {
  723. int hex_value = 0;
  724. for (int i = 0; i < 2; i++) {
  725. auto ch = consume();
  726. if (ch == '>') {
  727. // The hex string contains an odd number of characters, and the last character
  728. // is assumed to be '0'
  729. consume();
  730. hex_value *= 16;
  731. builder.append(static_cast<char>(hex_value));
  732. return builder.to_string();
  733. }
  734. if (!isxdigit(ch))
  735. return {};
  736. hex_value *= 16;
  737. if (ch <= '9') {
  738. hex_value += ch - '0';
  739. } else {
  740. hex_value += ch - 'A' + 10;
  741. }
  742. }
  743. builder.append(static_cast<char>(hex_value));
  744. }
  745. }
  746. }
  747. RefPtr<ArrayObject> Parser::parse_array()
  748. {
  749. if (!consume('['))
  750. return {};
  751. consume_whitespace();
  752. Vector<Value> values;
  753. while (!m_reader.matches(']')) {
  754. auto value = parse_value();
  755. if (value.has<Empty>())
  756. return {};
  757. values.append(value);
  758. }
  759. if (!consume(']'))
  760. return {};
  761. consume_whitespace();
  762. return make_object<ArrayObject>(values);
  763. }
  764. RefPtr<DictObject> Parser::parse_dict()
  765. {
  766. if (!consume('<') || !consume('<'))
  767. return {};
  768. consume_whitespace();
  769. HashMap<FlyString, Value> map;
  770. while (true) {
  771. if (m_reader.matches(">>"))
  772. break;
  773. auto name = parse_name();
  774. if (!name)
  775. return {};
  776. auto value = parse_value();
  777. if (value.has<Empty>())
  778. return {};
  779. map.set(name->name(), value);
  780. }
  781. if (!consume('>') || !consume('>'))
  782. return {};
  783. consume_whitespace();
  784. return make_object<DictObject>(map);
  785. }
  786. RefPtr<DictObject> Parser::conditionally_parse_page_tree_node(u32 object_index, bool& ok)
  787. {
  788. ok = true;
  789. VERIFY(m_xref_table->has_object(object_index));
  790. auto byte_offset = m_xref_table->byte_offset_for_object(object_index);
  791. m_reader.move_to(byte_offset);
  792. parse_number();
  793. parse_number();
  794. if (!m_reader.matches("obj")) {
  795. ok = false;
  796. return {};
  797. }
  798. m_reader.move_by(3);
  799. consume_whitespace();
  800. if (!consume('<') || !consume('<'))
  801. return {};
  802. consume_whitespace();
  803. HashMap<FlyString, Value> map;
  804. while (true) {
  805. if (m_reader.matches(">>"))
  806. break;
  807. auto name = parse_name();
  808. if (!name) {
  809. ok = false;
  810. return {};
  811. }
  812. auto name_string = name->name();
  813. if (!name_string.is_one_of(CommonNames::Type, CommonNames::Parent, CommonNames::Kids, CommonNames::Count)) {
  814. // This is a page, not a page tree node
  815. return {};
  816. }
  817. auto value = parse_value();
  818. if (value.has<Empty>()) {
  819. ok = false;
  820. return {};
  821. }
  822. if (name_string == CommonNames::Type) {
  823. if (!value.has<NonnullRefPtr<Object>>())
  824. return {};
  825. auto type_object = value.get<NonnullRefPtr<Object>>();
  826. if (!type_object->is_name())
  827. return {};
  828. auto type_name = object_cast<NameObject>(type_object);
  829. if (type_name->name() != CommonNames::Pages)
  830. return {};
  831. }
  832. map.set(name->name(), value);
  833. }
  834. if (!consume('>') || !consume('>'))
  835. return {};
  836. consume_whitespace();
  837. return make_object<DictObject>(map);
  838. }
  839. RefPtr<StreamObject> Parser::parse_stream(NonnullRefPtr<DictObject> dict)
  840. {
  841. if (!m_reader.matches("stream"))
  842. return {};
  843. m_reader.move_by(6);
  844. if (!consume_eol())
  845. return {};
  846. ReadonlyBytes bytes;
  847. auto maybe_length = dict->get(CommonNames::Length);
  848. if (maybe_length.has_value() && (!maybe_length->has<Reference>() || m_xref_table)) {
  849. // The PDF writer has kindly provided us with the direct length of the stream
  850. m_reader.save();
  851. auto length = m_document->resolve_to<int>(maybe_length.value());
  852. m_reader.load();
  853. bytes = m_reader.bytes().slice(m_reader.offset(), length);
  854. m_reader.move_by(length);
  855. consume_whitespace();
  856. } else {
  857. // We have to look for the endstream keyword
  858. auto stream_start = m_reader.offset();
  859. while (true) {
  860. m_reader.move_until([&](auto) { return matches_eol(); });
  861. auto potential_stream_end = m_reader.offset();
  862. consume_eol();
  863. if (!m_reader.matches("endstream"))
  864. continue;
  865. bytes = m_reader.bytes().slice(stream_start, potential_stream_end - stream_start);
  866. break;
  867. }
  868. }
  869. m_reader.move_by(9);
  870. consume_whitespace();
  871. if (dict->contains(CommonNames::Filter)) {
  872. auto filter_type = dict->get_name(m_document, CommonNames::Filter)->name();
  873. auto maybe_bytes = Filter::decode(bytes, filter_type);
  874. if (maybe_bytes.is_error()) {
  875. warnln("Failed to decode filter: {}", maybe_bytes.error().string_literal());
  876. return {};
  877. }
  878. return make_object<EncodedStreamObject>(dict, move(maybe_bytes.value()));
  879. }
  880. return make_object<PlainTextStreamObject>(dict, bytes);
  881. }
  882. Vector<Command> Parser::parse_graphics_commands()
  883. {
  884. Vector<Command> commands;
  885. Vector<Value> command_args;
  886. constexpr static auto is_command_char = [](char ch) {
  887. return isalpha(ch) || ch == '*' || ch == '\'';
  888. };
  889. while (!m_reader.done()) {
  890. auto ch = m_reader.peek();
  891. if (is_command_char(ch)) {
  892. auto command_start = m_reader.offset();
  893. while (is_command_char(ch)) {
  894. consume();
  895. if (m_reader.done())
  896. break;
  897. ch = m_reader.peek();
  898. }
  899. auto command_string = StringView(m_reader.bytes().slice(command_start, m_reader.offset() - command_start));
  900. auto command_type = Command::command_type_from_symbol(command_string);
  901. commands.append(Command(command_type, move(command_args)));
  902. command_args = Vector<Value>();
  903. consume_whitespace();
  904. continue;
  905. }
  906. command_args.append(parse_value());
  907. }
  908. return commands;
  909. }
  910. bool Parser::matches_eol() const
  911. {
  912. return m_reader.matches_any(0xa, 0xd);
  913. }
  914. bool Parser::matches_whitespace() const
  915. {
  916. return matches_eol() || m_reader.matches_any(0, 0x9, 0xc, ' ');
  917. }
  918. bool Parser::matches_number() const
  919. {
  920. if (m_reader.done())
  921. return false;
  922. auto ch = m_reader.peek();
  923. return isdigit(ch) || ch == '-' || ch == '+';
  924. }
  925. bool Parser::matches_delimiter() const
  926. {
  927. return m_reader.matches_any('(', ')', '<', '>', '[', ']', '{', '}', '/', '%');
  928. }
  929. bool Parser::matches_regular_character() const
  930. {
  931. return !matches_delimiter() && !matches_whitespace();
  932. }
  933. bool Parser::consume_eol()
  934. {
  935. if (m_reader.done()) {
  936. return false;
  937. }
  938. if (m_reader.matches("\r\n")) {
  939. consume(2);
  940. return true;
  941. }
  942. auto consumed = consume();
  943. return consumed == 0xd || consumed == 0xa;
  944. }
  945. bool Parser::consume_whitespace()
  946. {
  947. bool consumed = false;
  948. while (matches_whitespace()) {
  949. consumed = true;
  950. consume();
  951. }
  952. return consumed;
  953. }
  954. char Parser::consume()
  955. {
  956. return m_reader.read();
  957. }
  958. void Parser::consume(int amount)
  959. {
  960. for (size_t i = 0; i < static_cast<size_t>(amount); i++)
  961. consume();
  962. }
  963. bool Parser::consume(char ch)
  964. {
  965. return consume() == ch;
  966. }
  967. }
  968. namespace AK {
  969. template<>
  970. struct Formatter<PDF::Parser::LinearizationDictionary> : Formatter<StringView> {
  971. ErrorOr<void> format(FormatBuilder& format_builder, PDF::Parser::LinearizationDictionary const& dict)
  972. {
  973. StringBuilder builder;
  974. builder.append("{\n");
  975. builder.appendff(" length_of_file={}\n", dict.length_of_file);
  976. builder.appendff(" primary_hint_stream_offset={}\n", dict.primary_hint_stream_offset);
  977. builder.appendff(" primary_hint_stream_length={}\n", dict.primary_hint_stream_length);
  978. builder.appendff(" overflow_hint_stream_offset={}\n", dict.overflow_hint_stream_offset);
  979. builder.appendff(" overflow_hint_stream_length={}\n", dict.overflow_hint_stream_length);
  980. builder.appendff(" first_page_object_number={}\n", dict.first_page_object_number);
  981. builder.appendff(" offset_of_first_page_end={}\n", dict.offset_of_first_page_end);
  982. builder.appendff(" number_of_pages={}\n", dict.number_of_pages);
  983. builder.appendff(" offset_of_main_xref_table={}\n", dict.offset_of_main_xref_table);
  984. builder.appendff(" first_page={}\n", dict.first_page);
  985. builder.append('}');
  986. return Formatter<StringView>::format(format_builder, builder.to_string());
  987. }
  988. };
  989. template<>
  990. struct Formatter<PDF::Parser::PageOffsetHintTable> : Formatter<StringView> {
  991. ErrorOr<void> format(FormatBuilder& format_builder, PDF::Parser::PageOffsetHintTable const& table)
  992. {
  993. StringBuilder builder;
  994. builder.append("{\n");
  995. builder.appendff(" least_number_of_objects_in_a_page={}\n", table.least_number_of_objects_in_a_page);
  996. builder.appendff(" location_of_first_page_object={}\n", table.location_of_first_page_object);
  997. builder.appendff(" bits_required_for_object_number={}\n", table.bits_required_for_object_number);
  998. builder.appendff(" least_length_of_a_page={}\n", table.least_length_of_a_page);
  999. builder.appendff(" bits_required_for_page_length={}\n", table.bits_required_for_page_length);
  1000. builder.appendff(" least_offset_of_any_content_stream={}\n", table.least_offset_of_any_content_stream);
  1001. builder.appendff(" bits_required_for_content_stream_offsets={}\n", table.bits_required_for_content_stream_offsets);
  1002. builder.appendff(" least_content_stream_length={}\n", table.least_content_stream_length);
  1003. builder.appendff(" bits_required_for_content_stream_length={}\n", table.bits_required_for_content_stream_length);
  1004. builder.appendff(" bits_required_for_number_of_shared_obj_refs={}\n", table.bits_required_for_number_of_shared_obj_refs);
  1005. builder.appendff(" bits_required_for_greatest_shared_obj_identifier={}\n", table.bits_required_for_greatest_shared_obj_identifier);
  1006. builder.appendff(" bits_required_for_fraction_numerator={}\n", table.bits_required_for_fraction_numerator);
  1007. builder.appendff(" shared_object_reference_fraction_denominator={}\n", table.shared_object_reference_fraction_denominator);
  1008. builder.append('}');
  1009. return Formatter<StringView>::format(format_builder, builder.to_string());
  1010. }
  1011. };
  1012. template<>
  1013. struct Formatter<PDF::Parser::PageOffsetHintTableEntry> : Formatter<StringView> {
  1014. ErrorOr<void> format(FormatBuilder& format_builder, PDF::Parser::PageOffsetHintTableEntry const& entry)
  1015. {
  1016. StringBuilder builder;
  1017. builder.append("{\n");
  1018. builder.appendff(" objects_in_page_number={}\n", entry.objects_in_page_number);
  1019. builder.appendff(" page_length_number={}\n", entry.page_length_number);
  1020. builder.appendff(" number_of_shared_objects={}\n", entry.number_of_shared_objects);
  1021. builder.append(" shared_object_identifiers=[");
  1022. for (auto& identifier : entry.shared_object_identifiers)
  1023. builder.appendff(" {}", identifier);
  1024. builder.append(" ]\n");
  1025. builder.append(" shared_object_location_numerators=[");
  1026. for (auto& numerator : entry.shared_object_location_numerators)
  1027. builder.appendff(" {}", numerator);
  1028. builder.append(" ]\n");
  1029. builder.appendff(" page_content_stream_offset_number={}\n", entry.page_content_stream_offset_number);
  1030. builder.appendff(" page_content_stream_length_number={}\n", entry.page_content_stream_length_number);
  1031. builder.append('}');
  1032. return Formatter<StringView>::format(format_builder, builder.to_string());
  1033. }
  1034. };
  1035. }