Parser.cpp 39 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189
  1. /*
  2. * Copyright (c) 2021-2022, Matthew Olsson <mattco@serenityos.org>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <AK/BitStream.h>
  7. #include <AK/MemoryStream.h>
  8. #include <AK/ScopeGuard.h>
  9. #include <LibPDF/CommonNames.h>
  10. #include <LibPDF/Document.h>
  11. #include <LibPDF/Filter.h>
  12. #include <LibPDF/Parser.h>
  13. #include <LibTextCodec/Decoder.h>
  14. #include <ctype.h>
  15. namespace PDF {
  16. template<typename T, typename... Args>
  17. static NonnullRefPtr<T> make_object(Args... args) requires(IsBaseOf<Object, T>)
  18. {
  19. return adopt_ref(*new T(forward<Args>(args)...));
  20. }
  21. PDFErrorOr<Vector<Command>> Parser::parse_graphics_commands(ReadonlyBytes bytes)
  22. {
  23. auto parser = adopt_ref(*new Parser(bytes));
  24. return parser->parse_graphics_commands();
  25. }
  26. Parser::Parser(Badge<Document>, ReadonlyBytes bytes)
  27. : m_reader(bytes)
  28. {
  29. }
  30. Parser::Parser(ReadonlyBytes bytes)
  31. : m_reader(bytes)
  32. {
  33. }
  34. void Parser::set_document(WeakPtr<Document> const& document)
  35. {
  36. m_document = document;
  37. }
  38. PDFErrorOr<void> Parser::initialize()
  39. {
  40. TRY(parse_header());
  41. const auto linearization_result = TRY(initialize_linearization_dict());
  42. if (linearization_result == LinearizationResult::NotLinearized)
  43. return initialize_non_linearized_xref_table();
  44. bool is_linearized = m_linearization_dictionary.has_value();
  45. if (is_linearized) {
  46. // The file may have been linearized at one point, but could have been updated afterwards,
  47. // which means it is no longer a linearized PDF file.
  48. is_linearized = m_linearization_dictionary.value().length_of_file == m_reader.bytes().size();
  49. if (!is_linearized) {
  50. // FIXME: The file shouldn't be treated as linearized, yet the xref tables are still
  51. // split. This might take some tweaking to ensure correct behavior, which can be
  52. // implemented later.
  53. TODO();
  54. }
  55. }
  56. if (is_linearized)
  57. return initialize_linearized_xref_table();
  58. return initialize_non_linearized_xref_table();
  59. }
  60. PDFErrorOr<Value> Parser::parse_object_with_index(u32 index)
  61. {
  62. VERIFY(m_xref_table->has_object(index));
  63. auto byte_offset = m_xref_table->byte_offset_for_object(index);
  64. m_reader.move_to(byte_offset);
  65. auto indirect_value = TRY(parse_indirect_value());
  66. VERIFY(indirect_value->index() == index);
  67. return indirect_value->value();
  68. }
  69. PDFErrorOr<void> Parser::parse_header()
  70. {
  71. // FIXME: Do something with the version?
  72. m_reader.set_reading_forwards();
  73. if (m_reader.remaining() == 0)
  74. return error("Empty PDF document");
  75. m_reader.move_to(0);
  76. if (m_reader.remaining() < 8 || !m_reader.matches("%PDF-"))
  77. return error("Not a PDF document");
  78. m_reader.move_by(5);
  79. char major_ver = m_reader.read();
  80. if (major_ver != '1' && major_ver != '2')
  81. return error(String::formatted("Unknown major version \"{}\"", major_ver));
  82. if (m_reader.read() != '.')
  83. return error("Malformed PDF version");
  84. char minor_ver = m_reader.read();
  85. if (minor_ver < '0' || minor_ver > '7')
  86. return error(String::formatted("Unknown minor version \"{}\"", minor_ver));
  87. consume_eol();
  88. // Parse optional high-byte comment, which signifies a binary file
  89. // FIXME: Do something with this?
  90. auto comment = parse_comment();
  91. if (!comment.is_empty()) {
  92. auto binary = comment.length() >= 4;
  93. if (binary) {
  94. for (size_t i = 0; i < comment.length() && binary; i++)
  95. binary = static_cast<u8>(comment[i]) > 128;
  96. }
  97. }
  98. return {};
  99. }
  100. PDFErrorOr<Parser::LinearizationResult> Parser::initialize_linearization_dict()
  101. {
  102. // parse_header() is called immediately before this, so we are at the right location
  103. auto indirect_value = Value(*TRY(parse_indirect_value()));
  104. auto dict_value = TRY(m_document->resolve(indirect_value));
  105. if (!dict_value.has<NonnullRefPtr<Object>>())
  106. return error("Expected linearization object to be a dictionary");
  107. auto dict_object = dict_value.get<NonnullRefPtr<Object>>();
  108. if (!dict_object->is<DictObject>())
  109. return LinearizationResult::NotLinearized;
  110. auto dict = dict_object->cast<DictObject>();
  111. if (!dict->contains(CommonNames::Linearized))
  112. return LinearizationResult::NotLinearized;
  113. if (!dict->contains(CommonNames::L, CommonNames::H, CommonNames::O, CommonNames::E, CommonNames::N, CommonNames::T))
  114. return error("Malformed linearization dictionary");
  115. auto length_of_file = dict->get_value(CommonNames::L);
  116. auto hint_table = dict->get_value(CommonNames::H);
  117. auto first_page_object_number = dict->get_value(CommonNames::O);
  118. auto offset_of_first_page_end = dict->get_value(CommonNames::E);
  119. auto number_of_pages = dict->get_value(CommonNames::N);
  120. auto offset_of_main_xref_table = dict->get_value(CommonNames::T);
  121. auto first_page = dict->get(CommonNames::P).value_or({});
  122. // Validation
  123. if (!length_of_file.has_u32()
  124. || !hint_table.has<NonnullRefPtr<Object>>()
  125. || !first_page_object_number.has_u32()
  126. || !number_of_pages.has_u16()
  127. || !offset_of_main_xref_table.has_u32()
  128. || (!first_page.has<Empty>() && !first_page.has_u32())) {
  129. return error("Malformed linearization dictionary parameters");
  130. }
  131. auto hint_table_array = hint_table.get<NonnullRefPtr<Object>>()->cast<ArrayObject>();
  132. auto hint_table_size = hint_table_array->size();
  133. if (hint_table_size != 2 && hint_table_size != 4)
  134. return error("Expected hint table to be of length 2 or 4");
  135. auto primary_hint_stream_offset = hint_table_array->at(0);
  136. auto primary_hint_stream_length = hint_table_array->at(1);
  137. Value overflow_hint_stream_offset;
  138. Value overflow_hint_stream_length;
  139. if (hint_table_size == 4) {
  140. overflow_hint_stream_offset = hint_table_array->at(2);
  141. overflow_hint_stream_length = hint_table_array->at(3);
  142. }
  143. if (!primary_hint_stream_offset.has_u32()
  144. || !primary_hint_stream_length.has_u32()
  145. || (!overflow_hint_stream_offset.has<Empty>() && !overflow_hint_stream_offset.has_u32())
  146. || (!overflow_hint_stream_length.has<Empty>() && !overflow_hint_stream_length.has_u32())) {
  147. return error("Malformed hint stream");
  148. }
  149. m_linearization_dictionary = LinearizationDictionary {
  150. length_of_file.get_u32(),
  151. primary_hint_stream_offset.get_u32(),
  152. primary_hint_stream_length.get_u32(),
  153. overflow_hint_stream_offset.has<Empty>() ? NumericLimits<u32>::max() : overflow_hint_stream_offset.get_u32(),
  154. overflow_hint_stream_length.has<Empty>() ? NumericLimits<u32>::max() : overflow_hint_stream_length.get_u32(),
  155. first_page_object_number.get_u32(),
  156. offset_of_first_page_end.get_u32(),
  157. number_of_pages.get_u16(),
  158. offset_of_main_xref_table.get_u32(),
  159. first_page.has<Empty>() ? NumericLimits<u32>::max() : first_page.get_u32(),
  160. };
  161. return LinearizationResult::Linearized;
  162. }
  163. PDFErrorOr<void> Parser::initialize_linearized_xref_table()
  164. {
  165. // The linearization parameter dictionary has just been parsed, and the xref table
  166. // comes immediately after it. We are in the correct spot.
  167. m_xref_table = TRY(parse_xref_table());
  168. m_trailer = TRY(parse_file_trailer());
  169. // Also parse the main xref table and merge into the first-page xref table. Note
  170. // that we don't use the main xref table offset from the linearization dict because
  171. // for some reason, it specified the offset of the whitespace after the object
  172. // index start and length? So it's much easier to do it this way.
  173. auto main_xref_table_offset = m_trailer->get_value(CommonNames::Prev).to_int();
  174. m_reader.move_to(main_xref_table_offset);
  175. auto main_xref_table = TRY(parse_xref_table());
  176. TRY(m_xref_table->merge(move(*main_xref_table)));
  177. return {};
  178. }
  179. PDFErrorOr<void> Parser::initialize_hint_tables()
  180. {
  181. auto linearization_dict = m_linearization_dictionary.value();
  182. auto primary_offset = linearization_dict.primary_hint_stream_offset;
  183. auto overflow_offset = linearization_dict.overflow_hint_stream_offset;
  184. auto parse_hint_table = [&](size_t offset) -> RefPtr<StreamObject> {
  185. m_reader.move_to(offset);
  186. auto stream_indirect_value = parse_indirect_value();
  187. if (stream_indirect_value.is_error())
  188. return {};
  189. auto stream_value = stream_indirect_value.value()->value();
  190. if (!stream_value.has<NonnullRefPtr<Object>>())
  191. return {};
  192. auto stream_object = stream_value.get<NonnullRefPtr<Object>>();
  193. if (!stream_object->is<StreamObject>())
  194. return {};
  195. return stream_object->cast<StreamObject>();
  196. };
  197. auto primary_hint_stream = parse_hint_table(primary_offset);
  198. if (!primary_hint_stream)
  199. return error("Invalid primary hint stream");
  200. RefPtr<StreamObject> overflow_hint_stream;
  201. if (overflow_offset != NumericLimits<u32>::max())
  202. overflow_hint_stream = parse_hint_table(overflow_offset);
  203. ByteBuffer possible_merged_stream_buffer;
  204. ReadonlyBytes hint_stream_bytes;
  205. if (overflow_hint_stream) {
  206. auto primary_size = primary_hint_stream->bytes().size();
  207. auto overflow_size = overflow_hint_stream->bytes().size();
  208. auto total_size = primary_size + overflow_size;
  209. auto buffer_result = ByteBuffer::create_uninitialized(total_size);
  210. if (buffer_result.is_error())
  211. return Error { Error::Type::Internal, "Failed to allocate hint stream buffer" };
  212. possible_merged_stream_buffer = buffer_result.release_value();
  213. MUST(possible_merged_stream_buffer.try_append(primary_hint_stream->bytes()));
  214. MUST(possible_merged_stream_buffer.try_append(overflow_hint_stream->bytes()));
  215. hint_stream_bytes = possible_merged_stream_buffer.bytes();
  216. } else {
  217. hint_stream_bytes = primary_hint_stream->bytes();
  218. }
  219. auto hint_table = TRY(parse_page_offset_hint_table(hint_stream_bytes));
  220. auto hint_table_entries = parse_all_page_offset_hint_table_entries(hint_table, hint_stream_bytes);
  221. // FIXME: Do something with the hint tables
  222. return {};
  223. }
  224. PDFErrorOr<void> Parser::initialize_non_linearized_xref_table()
  225. {
  226. m_reader.move_to(m_reader.bytes().size() - 1);
  227. if (!navigate_to_before_eof_marker())
  228. return error("No EOF marker");
  229. if (!navigate_to_after_startxref())
  230. return error("No xref");
  231. m_reader.set_reading_forwards();
  232. auto xref_offset_value = parse_number();
  233. if (xref_offset_value.is_error() || !xref_offset_value.value().has<int>())
  234. return error("Invalid xref offset");
  235. auto xref_offset = xref_offset_value.value().get<int>();
  236. m_reader.move_to(xref_offset);
  237. m_xref_table = TRY(parse_xref_table());
  238. m_trailer = TRY(parse_file_trailer());
  239. return {};
  240. }
  241. PDFErrorOr<NonnullRefPtr<XRefTable>> Parser::parse_xref_table()
  242. {
  243. if (!m_reader.matches("xref"))
  244. return error("Expected \"xref\"");
  245. m_reader.move_by(4);
  246. if (!consume_eol())
  247. return error("Expected newline after \"xref\"");
  248. auto table = adopt_ref(*new XRefTable());
  249. while (true) {
  250. if (m_reader.matches("trailer"))
  251. return table;
  252. Vector<XRefEntry> entries;
  253. auto starting_index_value = TRY(parse_number());
  254. auto starting_index = starting_index_value.get<int>();
  255. auto object_count_value = TRY(parse_number());
  256. auto object_count = object_count_value.get<int>();
  257. for (int i = 0; i < object_count; i++) {
  258. auto offset_string = String(m_reader.bytes().slice(m_reader.offset(), 10));
  259. m_reader.move_by(10);
  260. if (!consume(' '))
  261. return error("Malformed xref entry");
  262. auto generation_string = String(m_reader.bytes().slice(m_reader.offset(), 5));
  263. m_reader.move_by(5);
  264. if (!consume(' '))
  265. return error("Malformed xref entry");
  266. auto letter = m_reader.read();
  267. if (letter != 'n' && letter != 'f')
  268. return error("Malformed xref entry");
  269. // The line ending sequence can be one of the following:
  270. // SP CR, SP LF, or CR LF
  271. if (m_reader.matches(' ')) {
  272. consume();
  273. auto ch = consume();
  274. if (ch != '\r' && ch != '\n')
  275. return error("Malformed xref entry");
  276. } else {
  277. if (!m_reader.matches("\r\n"))
  278. return error("Malformed xref entry");
  279. m_reader.move_by(2);
  280. }
  281. auto offset = strtol(offset_string.characters(), nullptr, 10);
  282. auto generation = strtol(generation_string.characters(), nullptr, 10);
  283. entries.append({ offset, static_cast<u16>(generation), letter == 'n' });
  284. }
  285. table->add_section({ starting_index, object_count, entries });
  286. }
  287. }
  288. PDFErrorOr<NonnullRefPtr<DictObject>> Parser::parse_file_trailer()
  289. {
  290. if (!m_reader.matches("trailer"))
  291. return error("Expected \"trailer\" keyword");
  292. m_reader.move_by(7);
  293. consume_whitespace();
  294. auto dict = TRY(parse_dict());
  295. if (!m_reader.matches("startxref"))
  296. return error("Expected \"startxref\"");
  297. m_reader.move_by(9);
  298. consume_whitespace();
  299. m_reader.move_until([&](auto) { return matches_eol(); });
  300. VERIFY(consume_eol());
  301. if (!m_reader.matches("%%EOF"))
  302. return error("Expected \"%%EOF\"");
  303. m_reader.move_by(5);
  304. consume_whitespace();
  305. return dict;
  306. }
  307. PDFErrorOr<Parser::PageOffsetHintTable> Parser::parse_page_offset_hint_table(ReadonlyBytes hint_stream_bytes)
  308. {
  309. if (hint_stream_bytes.size() < sizeof(PageOffsetHintTable))
  310. return error("Hint stream is too small");
  311. size_t offset = 0;
  312. auto read_u32 = [&] {
  313. u32 data = reinterpret_cast<const u32*>(hint_stream_bytes.data() + offset)[0];
  314. offset += 4;
  315. return AK::convert_between_host_and_big_endian(data);
  316. };
  317. auto read_u16 = [&] {
  318. u16 data = reinterpret_cast<const u16*>(hint_stream_bytes.data() + offset)[0];
  319. offset += 2;
  320. return AK::convert_between_host_and_big_endian(data);
  321. };
  322. PageOffsetHintTable hint_table {
  323. read_u32(),
  324. read_u32(),
  325. read_u16(),
  326. read_u32(),
  327. read_u16(),
  328. read_u32(),
  329. read_u16(),
  330. read_u32(),
  331. read_u16(),
  332. read_u16(),
  333. read_u16(),
  334. read_u16(),
  335. read_u16(),
  336. };
  337. // Verify that all of the bits_required_for_xyz fields are <= 32, since all of the numeric
  338. // fields in PageOffsetHintTableEntry are u32
  339. VERIFY(hint_table.bits_required_for_object_number <= 32);
  340. VERIFY(hint_table.bits_required_for_page_length <= 32);
  341. VERIFY(hint_table.bits_required_for_content_stream_offsets <= 32);
  342. VERIFY(hint_table.bits_required_for_content_stream_length <= 32);
  343. VERIFY(hint_table.bits_required_for_number_of_shared_obj_refs <= 32);
  344. VERIFY(hint_table.bits_required_for_greatest_shared_obj_identifier <= 32);
  345. VERIFY(hint_table.bits_required_for_fraction_numerator <= 32);
  346. return hint_table;
  347. }
  348. Vector<Parser::PageOffsetHintTableEntry> Parser::parse_all_page_offset_hint_table_entries(PageOffsetHintTable const& hint_table, ReadonlyBytes hint_stream_bytes)
  349. {
  350. InputMemoryStream input_stream(hint_stream_bytes);
  351. input_stream.seek(sizeof(PageOffsetHintTable));
  352. InputBitStream bit_stream(input_stream);
  353. auto number_of_pages = m_linearization_dictionary.value().number_of_pages;
  354. Vector<PageOffsetHintTableEntry> entries;
  355. for (size_t i = 0; i < number_of_pages; i++)
  356. entries.append(PageOffsetHintTableEntry {});
  357. auto bits_required_for_object_number = hint_table.bits_required_for_object_number;
  358. auto bits_required_for_page_length = hint_table.bits_required_for_page_length;
  359. auto bits_required_for_content_stream_offsets = hint_table.bits_required_for_content_stream_offsets;
  360. auto bits_required_for_content_stream_length = hint_table.bits_required_for_content_stream_length;
  361. auto bits_required_for_number_of_shared_obj_refs = hint_table.bits_required_for_number_of_shared_obj_refs;
  362. auto bits_required_for_greatest_shared_obj_identifier = hint_table.bits_required_for_greatest_shared_obj_identifier;
  363. auto bits_required_for_fraction_numerator = hint_table.bits_required_for_fraction_numerator;
  364. auto parse_int_entry = [&](u32 PageOffsetHintTableEntry::*field, u32 bit_size) {
  365. if (bit_size <= 0)
  366. return;
  367. for (int i = 0; i < number_of_pages; i++) {
  368. auto& entry = entries[i];
  369. entry.*field = bit_stream.read_bits(bit_size);
  370. }
  371. };
  372. auto parse_vector_entry = [&](Vector<u32> PageOffsetHintTableEntry::*field, u32 bit_size) {
  373. if (bit_size <= 0)
  374. return;
  375. for (int page = 1; page < number_of_pages; page++) {
  376. auto number_of_shared_objects = entries[page].number_of_shared_objects;
  377. Vector<u32> items;
  378. items.ensure_capacity(number_of_shared_objects);
  379. for (size_t i = 0; i < number_of_shared_objects; i++)
  380. items.unchecked_append(bit_stream.read_bits(bit_size));
  381. entries[page].*field = move(items);
  382. }
  383. };
  384. parse_int_entry(&PageOffsetHintTableEntry::objects_in_page_number, bits_required_for_object_number);
  385. parse_int_entry(&PageOffsetHintTableEntry::page_length_number, bits_required_for_page_length);
  386. parse_int_entry(&PageOffsetHintTableEntry::number_of_shared_objects, bits_required_for_number_of_shared_obj_refs);
  387. parse_vector_entry(&PageOffsetHintTableEntry::shared_object_identifiers, bits_required_for_greatest_shared_obj_identifier);
  388. parse_vector_entry(&PageOffsetHintTableEntry::shared_object_location_numerators, bits_required_for_fraction_numerator);
  389. parse_int_entry(&PageOffsetHintTableEntry::page_content_stream_offset_number, bits_required_for_content_stream_offsets);
  390. parse_int_entry(&PageOffsetHintTableEntry::page_content_stream_length_number, bits_required_for_content_stream_length);
  391. return entries;
  392. }
  393. bool Parser::navigate_to_before_eof_marker()
  394. {
  395. m_reader.set_reading_backwards();
  396. while (!m_reader.done()) {
  397. m_reader.move_until([&](auto) { return matches_eol(); });
  398. if (m_reader.done())
  399. return false;
  400. consume_eol();
  401. if (!m_reader.matches("%%EOF"))
  402. continue;
  403. m_reader.move_by(5);
  404. if (!matches_eol())
  405. continue;
  406. consume_eol();
  407. return true;
  408. }
  409. return false;
  410. }
  411. bool Parser::navigate_to_after_startxref()
  412. {
  413. m_reader.set_reading_backwards();
  414. while (!m_reader.done()) {
  415. m_reader.move_until([&](auto) { return matches_eol(); });
  416. auto offset = m_reader.offset() + 1;
  417. consume_eol();
  418. if (!m_reader.matches("startxref"))
  419. continue;
  420. m_reader.move_by(9);
  421. if (!matches_eol())
  422. continue;
  423. m_reader.move_to(offset);
  424. return true;
  425. }
  426. return false;
  427. }
  428. String Parser::parse_comment()
  429. {
  430. if (!m_reader.matches('%'))
  431. return {};
  432. consume();
  433. auto comment_start_offset = m_reader.offset();
  434. m_reader.move_until([&](auto) {
  435. return matches_eol();
  436. });
  437. String str = StringView(m_reader.bytes().slice(comment_start_offset, m_reader.offset() - comment_start_offset));
  438. consume_eol();
  439. consume_whitespace();
  440. return str;
  441. }
  442. PDFErrorOr<Value> Parser::parse_value()
  443. {
  444. parse_comment();
  445. if (m_reader.matches("null")) {
  446. m_reader.move_by(4);
  447. consume_whitespace();
  448. return Value(nullptr);
  449. }
  450. if (m_reader.matches("true")) {
  451. m_reader.move_by(4);
  452. consume_whitespace();
  453. return Value(true);
  454. }
  455. if (m_reader.matches("false")) {
  456. m_reader.move_by(5);
  457. consume_whitespace();
  458. return Value(false);
  459. }
  460. if (matches_number())
  461. return parse_possible_indirect_value_or_ref();
  462. if (m_reader.matches('/'))
  463. return MUST(parse_name());
  464. if (m_reader.matches("<<")) {
  465. auto dict = TRY(parse_dict());
  466. if (m_reader.matches("stream"))
  467. return TRY(parse_stream(dict));
  468. return dict;
  469. }
  470. if (m_reader.matches_any('(', '<'))
  471. return parse_string();
  472. if (m_reader.matches('['))
  473. return TRY(parse_array());
  474. return error(String::formatted("Unexpected char \"{}\"", m_reader.peek()));
  475. }
  476. PDFErrorOr<Value> Parser::parse_possible_indirect_value_or_ref()
  477. {
  478. auto first_number = TRY(parse_number());
  479. if (!matches_number())
  480. return first_number;
  481. m_reader.save();
  482. auto second_number = parse_number();
  483. if (second_number.is_error()) {
  484. m_reader.load();
  485. return first_number;
  486. }
  487. if (m_reader.matches('R')) {
  488. m_reader.discard();
  489. consume();
  490. consume_whitespace();
  491. return Value(Reference(first_number.get<int>(), second_number.value().get<int>()));
  492. }
  493. if (m_reader.matches("obj")) {
  494. m_reader.discard();
  495. return TRY(parse_indirect_value(first_number.get<int>(), second_number.value().get<int>()));
  496. }
  497. m_reader.load();
  498. return first_number;
  499. }
  500. PDFErrorOr<NonnullRefPtr<IndirectValue>> Parser::parse_indirect_value(int index, int generation)
  501. {
  502. if (!m_reader.matches("obj"))
  503. return error("Expected \"obj\" at beginning of indirect value");
  504. m_reader.move_by(3);
  505. if (matches_eol())
  506. consume_eol();
  507. auto value = TRY(parse_value());
  508. if (!m_reader.matches("endobj"))
  509. return error("Expected \"endobj\" at end of indirect value");
  510. consume(6);
  511. consume_whitespace();
  512. return make_object<IndirectValue>(index, generation, value);
  513. }
  514. PDFErrorOr<NonnullRefPtr<IndirectValue>> Parser::parse_indirect_value()
  515. {
  516. auto first_number = TRY(parse_number());
  517. auto second_number = TRY(parse_number());
  518. return parse_indirect_value(first_number.get<int>(), second_number.get<int>());
  519. }
  520. PDFErrorOr<Value> Parser::parse_number()
  521. {
  522. size_t start_offset = m_reader.offset();
  523. bool is_float = false;
  524. bool consumed_digit = false;
  525. if (m_reader.matches('+') || m_reader.matches('-'))
  526. consume();
  527. while (!m_reader.done()) {
  528. if (m_reader.matches('.')) {
  529. if (is_float)
  530. break;
  531. is_float = true;
  532. consume();
  533. } else if (isdigit(m_reader.peek())) {
  534. consume();
  535. consumed_digit = true;
  536. } else {
  537. break;
  538. }
  539. }
  540. if (!consumed_digit)
  541. return error("Invalid number");
  542. consume_whitespace();
  543. auto string = String(m_reader.bytes().slice(start_offset, m_reader.offset() - start_offset));
  544. float f = strtof(string.characters(), nullptr);
  545. if (is_float)
  546. return Value(f);
  547. VERIFY(floorf(f) == f);
  548. return Value(static_cast<int>(f));
  549. }
  550. PDFErrorOr<NonnullRefPtr<NameObject>> Parser::parse_name()
  551. {
  552. if (!consume('/'))
  553. return error("Expected Name object to start with \"/\"");
  554. StringBuilder builder;
  555. while (true) {
  556. if (!matches_regular_character())
  557. break;
  558. if (m_reader.matches('#')) {
  559. int hex_value = 0;
  560. for (int i = 0; i < 2; i++) {
  561. auto ch = consume();
  562. VERIFY(isxdigit(ch));
  563. hex_value *= 16;
  564. if (ch <= '9') {
  565. hex_value += ch - '0';
  566. } else {
  567. hex_value += ch - 'A' + 10;
  568. }
  569. }
  570. builder.append(static_cast<char>(hex_value));
  571. continue;
  572. }
  573. builder.append(consume());
  574. }
  575. consume_whitespace();
  576. return make_object<NameObject>(builder.to_string());
  577. }
  578. NonnullRefPtr<StringObject> Parser::parse_string()
  579. {
  580. ScopeGuard guard([&] { consume_whitespace(); });
  581. String string;
  582. bool is_binary_string;
  583. if (m_reader.matches('(')) {
  584. string = parse_literal_string();
  585. is_binary_string = false;
  586. } else {
  587. string = parse_hex_string();
  588. is_binary_string = true;
  589. }
  590. VERIFY(!string.is_null());
  591. if (string.bytes().starts_with(Array<u8, 2> { 0xfe, 0xff })) {
  592. // The string is encoded in UTF16-BE
  593. string = TextCodec::decoder_for("utf-16be")->to_utf8(string);
  594. } else if (string.bytes().starts_with(Array<u8, 3> { 239, 187, 191 })) {
  595. // The string is encoded in UTF-8. This is the default anyways, but if these bytes
  596. // are explicitly included, we have to trim them
  597. string = string.substring(3);
  598. }
  599. return make_object<StringObject>(string, is_binary_string);
  600. }
  601. String Parser::parse_literal_string()
  602. {
  603. VERIFY(consume('('));
  604. StringBuilder builder;
  605. auto opened_parens = 0;
  606. while (true) {
  607. if (m_reader.matches('(')) {
  608. opened_parens++;
  609. builder.append(consume());
  610. } else if (m_reader.matches(')')) {
  611. consume();
  612. if (opened_parens == 0)
  613. break;
  614. opened_parens--;
  615. builder.append(')');
  616. } else if (m_reader.matches('\\')) {
  617. consume();
  618. if (matches_eol()) {
  619. consume_eol();
  620. continue;
  621. }
  622. if (m_reader.done())
  623. return {};
  624. auto ch = consume();
  625. switch (ch) {
  626. case 'n':
  627. builder.append('\n');
  628. break;
  629. case 'r':
  630. builder.append('\r');
  631. break;
  632. case 't':
  633. builder.append('\t');
  634. break;
  635. case 'b':
  636. builder.append('\b');
  637. break;
  638. case 'f':
  639. builder.append('\f');
  640. break;
  641. case '(':
  642. builder.append('(');
  643. break;
  644. case ')':
  645. builder.append(')');
  646. break;
  647. case '\\':
  648. builder.append('\\');
  649. break;
  650. default: {
  651. if (ch >= '0' && ch <= '7') {
  652. int octal_value = ch - '0';
  653. for (int i = 0; i < 2; i++) {
  654. auto octal_ch = consume();
  655. if (octal_ch < '0' || octal_ch > '7')
  656. break;
  657. octal_value = octal_value * 8 + (octal_ch - '0');
  658. }
  659. builder.append(static_cast<char>(octal_value));
  660. } else {
  661. builder.append(ch);
  662. }
  663. }
  664. }
  665. } else if (matches_eol()) {
  666. consume_eol();
  667. builder.append('\n');
  668. } else {
  669. builder.append(consume());
  670. }
  671. }
  672. return builder.to_string();
  673. }
  674. String Parser::parse_hex_string()
  675. {
  676. VERIFY(consume('<'));
  677. StringBuilder builder;
  678. while (true) {
  679. if (m_reader.matches('>')) {
  680. consume();
  681. return builder.to_string();
  682. } else {
  683. int hex_value = 0;
  684. for (int i = 0; i < 2; i++) {
  685. auto ch = consume();
  686. if (ch == '>') {
  687. // The hex string contains an odd number of characters, and the last character
  688. // is assumed to be '0'
  689. consume();
  690. hex_value *= 16;
  691. builder.append(static_cast<char>(hex_value));
  692. return builder.to_string();
  693. }
  694. VERIFY(isxdigit(ch));
  695. hex_value *= 16;
  696. if (ch <= '9') {
  697. hex_value += ch - '0';
  698. } else {
  699. hex_value += ch - 'A' + 10;
  700. }
  701. }
  702. builder.append(static_cast<char>(hex_value));
  703. }
  704. }
  705. }
  706. PDFErrorOr<NonnullRefPtr<ArrayObject>> Parser::parse_array()
  707. {
  708. if (!consume('['))
  709. return error("Expected array to start with \"[\"");
  710. consume_whitespace();
  711. Vector<Value> values;
  712. while (!m_reader.matches(']'))
  713. values.append(TRY(parse_value()));
  714. VERIFY(consume(']'));
  715. consume_whitespace();
  716. return make_object<ArrayObject>(values);
  717. }
  718. PDFErrorOr<NonnullRefPtr<DictObject>> Parser::parse_dict()
  719. {
  720. if (!consume('<') || !consume('<'))
  721. return error("Expected dict to start with \"<<\"");
  722. consume_whitespace();
  723. HashMap<FlyString, Value> map;
  724. while (!m_reader.done()) {
  725. if (m_reader.matches(">>"))
  726. break;
  727. auto name = TRY(parse_name())->name();
  728. auto value = TRY(parse_value());
  729. map.set(name, value);
  730. }
  731. if (!consume('>') || !consume('>'))
  732. return error("Expected dict to end with \">>\"");
  733. consume_whitespace();
  734. return make_object<DictObject>(map);
  735. }
  736. PDFErrorOr<RefPtr<DictObject>> Parser::conditionally_parse_page_tree_node(u32 object_index)
  737. {
  738. VERIFY(m_xref_table->has_object(object_index));
  739. auto byte_offset = m_xref_table->byte_offset_for_object(object_index);
  740. m_reader.move_to(byte_offset);
  741. TRY(parse_number());
  742. TRY(parse_number());
  743. if (!m_reader.matches("obj"))
  744. return error(String::formatted("Invalid page tree offset {}", object_index));
  745. m_reader.move_by(3);
  746. consume_whitespace();
  747. VERIFY(consume('<') && consume('<'));
  748. consume_whitespace();
  749. HashMap<FlyString, Value> map;
  750. while (true) {
  751. if (m_reader.matches(">>"))
  752. break;
  753. auto name = TRY(parse_name());
  754. auto name_string = name->name();
  755. if (!name_string.is_one_of(CommonNames::Type, CommonNames::Parent, CommonNames::Kids, CommonNames::Count)) {
  756. // This is a page, not a page tree node
  757. return RefPtr<DictObject> {};
  758. }
  759. auto value = TRY(parse_value());
  760. if (name_string == CommonNames::Type) {
  761. if (!value.has<NonnullRefPtr<Object>>())
  762. return RefPtr<DictObject> {};
  763. auto type_object = value.get<NonnullRefPtr<Object>>();
  764. if (!type_object->is<NameObject>())
  765. return RefPtr<DictObject> {};
  766. auto type_name = type_object->cast<NameObject>();
  767. if (type_name->name() != CommonNames::Pages)
  768. return RefPtr<DictObject> {};
  769. }
  770. map.set(name->name(), value);
  771. }
  772. VERIFY(consume('>') && consume('>'));
  773. consume_whitespace();
  774. return make_object<DictObject>(map);
  775. }
  776. PDFErrorOr<NonnullRefPtr<StreamObject>> Parser::parse_stream(NonnullRefPtr<DictObject> dict)
  777. {
  778. if (!m_reader.matches("stream"))
  779. return error("Expected stream to start with \"stream\"");
  780. m_reader.move_by(6);
  781. if (!consume_eol())
  782. return error("Expected \"stream\" to be followed by a newline");
  783. ReadonlyBytes bytes;
  784. auto maybe_length = dict->get(CommonNames::Length);
  785. if (maybe_length.has_value() && (!maybe_length->has<Reference>() || m_xref_table)) {
  786. // The PDF writer has kindly provided us with the direct length of the stream
  787. m_reader.save();
  788. auto length = TRY(m_document->resolve_to<int>(maybe_length.value()));
  789. m_reader.load();
  790. bytes = m_reader.bytes().slice(m_reader.offset(), length);
  791. m_reader.move_by(length);
  792. consume_whitespace();
  793. } else {
  794. // We have to look for the endstream keyword
  795. auto stream_start = m_reader.offset();
  796. while (true) {
  797. m_reader.move_until([&](auto) { return matches_eol(); });
  798. auto potential_stream_end = m_reader.offset();
  799. consume_eol();
  800. if (!m_reader.matches("endstream"))
  801. continue;
  802. bytes = m_reader.bytes().slice(stream_start, potential_stream_end - stream_start);
  803. break;
  804. }
  805. }
  806. m_reader.move_by(9);
  807. consume_whitespace();
  808. if (dict->contains(CommonNames::Filter)) {
  809. auto filter_type = MUST(dict->get_name(m_document, CommonNames::Filter))->name();
  810. auto maybe_bytes = Filter::decode(bytes, filter_type);
  811. if (maybe_bytes.is_error()) {
  812. warnln("Failed to decode filter: {}", maybe_bytes.error().string_literal());
  813. return error(String::formatted("Failed to decode filter {}", maybe_bytes.error().string_literal()));
  814. }
  815. return make_object<EncodedStreamObject>(dict, move(maybe_bytes.value()));
  816. }
  817. return make_object<PlainTextStreamObject>(dict, bytes);
  818. }
  819. PDFErrorOr<Vector<Command>> Parser::parse_graphics_commands()
  820. {
  821. Vector<Command> commands;
  822. Vector<Value> command_args;
  823. constexpr static auto is_command_char = [](char ch) {
  824. return isalpha(ch) || ch == '*' || ch == '\'';
  825. };
  826. while (!m_reader.done()) {
  827. auto ch = m_reader.peek();
  828. if (is_command_char(ch)) {
  829. auto command_start = m_reader.offset();
  830. while (is_command_char(ch)) {
  831. consume();
  832. if (m_reader.done())
  833. break;
  834. ch = m_reader.peek();
  835. }
  836. auto command_string = StringView(m_reader.bytes().slice(command_start, m_reader.offset() - command_start));
  837. auto command_type = Command::command_type_from_symbol(command_string);
  838. commands.append(Command(command_type, move(command_args)));
  839. command_args = Vector<Value>();
  840. consume_whitespace();
  841. continue;
  842. }
  843. command_args.append(TRY(parse_value()));
  844. }
  845. return commands;
  846. }
  847. bool Parser::matches_eol() const
  848. {
  849. return m_reader.matches_any(0xa, 0xd);
  850. }
  851. bool Parser::matches_whitespace() const
  852. {
  853. return matches_eol() || m_reader.matches_any(0, 0x9, 0xc, ' ');
  854. }
  855. bool Parser::matches_number() const
  856. {
  857. if (m_reader.done())
  858. return false;
  859. auto ch = m_reader.peek();
  860. return isdigit(ch) || ch == '-' || ch == '+';
  861. }
  862. bool Parser::matches_delimiter() const
  863. {
  864. return m_reader.matches_any('(', ')', '<', '>', '[', ']', '{', '}', '/', '%');
  865. }
  866. bool Parser::matches_regular_character() const
  867. {
  868. return !matches_delimiter() && !matches_whitespace();
  869. }
  870. bool Parser::consume_eol()
  871. {
  872. if (m_reader.done()) {
  873. return false;
  874. }
  875. if (m_reader.matches("\r\n")) {
  876. consume(2);
  877. return true;
  878. }
  879. auto consumed = consume();
  880. return consumed == 0xd || consumed == 0xa;
  881. }
  882. bool Parser::consume_whitespace()
  883. {
  884. bool consumed = false;
  885. while (matches_whitespace()) {
  886. consumed = true;
  887. consume();
  888. }
  889. return consumed;
  890. }
  891. char Parser::consume()
  892. {
  893. return m_reader.read();
  894. }
  895. void Parser::consume(int amount)
  896. {
  897. for (size_t i = 0; i < static_cast<size_t>(amount); i++)
  898. consume();
  899. }
  900. bool Parser::consume(char ch)
  901. {
  902. return consume() == ch;
  903. }
  904. Error Parser::error(
  905. String const& message
  906. #ifdef PDF_DEBUG
  907. ,
  908. SourceLocation loc
  909. #endif
  910. ) const
  911. {
  912. #ifdef PDF_DEBUG
  913. dbgln("\033[31m{} Parser error at offset {}: {}\033[0m", loc, m_reader.offset(), message);
  914. #endif
  915. return Error { Error::Type::Parse, message };
  916. }
  917. }
  918. namespace AK {
  919. template<>
  920. struct Formatter<PDF::Parser::LinearizationDictionary> : Formatter<StringView> {
  921. ErrorOr<void> format(FormatBuilder& format_builder, PDF::Parser::LinearizationDictionary const& dict)
  922. {
  923. StringBuilder builder;
  924. builder.append("{\n");
  925. builder.appendff(" length_of_file={}\n", dict.length_of_file);
  926. builder.appendff(" primary_hint_stream_offset={}\n", dict.primary_hint_stream_offset);
  927. builder.appendff(" primary_hint_stream_length={}\n", dict.primary_hint_stream_length);
  928. builder.appendff(" overflow_hint_stream_offset={}\n", dict.overflow_hint_stream_offset);
  929. builder.appendff(" overflow_hint_stream_length={}\n", dict.overflow_hint_stream_length);
  930. builder.appendff(" first_page_object_number={}\n", dict.first_page_object_number);
  931. builder.appendff(" offset_of_first_page_end={}\n", dict.offset_of_first_page_end);
  932. builder.appendff(" number_of_pages={}\n", dict.number_of_pages);
  933. builder.appendff(" offset_of_main_xref_table={}\n", dict.offset_of_main_xref_table);
  934. builder.appendff(" first_page={}\n", dict.first_page);
  935. builder.append('}');
  936. return Formatter<StringView>::format(format_builder, builder.to_string());
  937. }
  938. };
  939. template<>
  940. struct Formatter<PDF::Parser::PageOffsetHintTable> : Formatter<StringView> {
  941. ErrorOr<void> format(FormatBuilder& format_builder, PDF::Parser::PageOffsetHintTable const& table)
  942. {
  943. StringBuilder builder;
  944. builder.append("{\n");
  945. builder.appendff(" least_number_of_objects_in_a_page={}\n", table.least_number_of_objects_in_a_page);
  946. builder.appendff(" location_of_first_page_object={}\n", table.location_of_first_page_object);
  947. builder.appendff(" bits_required_for_object_number={}\n", table.bits_required_for_object_number);
  948. builder.appendff(" least_length_of_a_page={}\n", table.least_length_of_a_page);
  949. builder.appendff(" bits_required_for_page_length={}\n", table.bits_required_for_page_length);
  950. builder.appendff(" least_offset_of_any_content_stream={}\n", table.least_offset_of_any_content_stream);
  951. builder.appendff(" bits_required_for_content_stream_offsets={}\n", table.bits_required_for_content_stream_offsets);
  952. builder.appendff(" least_content_stream_length={}\n", table.least_content_stream_length);
  953. builder.appendff(" bits_required_for_content_stream_length={}\n", table.bits_required_for_content_stream_length);
  954. builder.appendff(" bits_required_for_number_of_shared_obj_refs={}\n", table.bits_required_for_number_of_shared_obj_refs);
  955. builder.appendff(" bits_required_for_greatest_shared_obj_identifier={}\n", table.bits_required_for_greatest_shared_obj_identifier);
  956. builder.appendff(" bits_required_for_fraction_numerator={}\n", table.bits_required_for_fraction_numerator);
  957. builder.appendff(" shared_object_reference_fraction_denominator={}\n", table.shared_object_reference_fraction_denominator);
  958. builder.append('}');
  959. return Formatter<StringView>::format(format_builder, builder.to_string());
  960. }
  961. };
  962. template<>
  963. struct Formatter<PDF::Parser::PageOffsetHintTableEntry> : Formatter<StringView> {
  964. ErrorOr<void> format(FormatBuilder& format_builder, PDF::Parser::PageOffsetHintTableEntry const& entry)
  965. {
  966. StringBuilder builder;
  967. builder.append("{\n");
  968. builder.appendff(" objects_in_page_number={}\n", entry.objects_in_page_number);
  969. builder.appendff(" page_length_number={}\n", entry.page_length_number);
  970. builder.appendff(" number_of_shared_objects={}\n", entry.number_of_shared_objects);
  971. builder.append(" shared_object_identifiers=[");
  972. for (auto& identifier : entry.shared_object_identifiers)
  973. builder.appendff(" {}", identifier);
  974. builder.append(" ]\n");
  975. builder.append(" shared_object_location_numerators=[");
  976. for (auto& numerator : entry.shared_object_location_numerators)
  977. builder.appendff(" {}", numerator);
  978. builder.append(" ]\n");
  979. builder.appendff(" page_content_stream_offset_number={}\n", entry.page_content_stream_offset_number);
  980. builder.appendff(" page_content_stream_length_number={}\n", entry.page_content_stream_length_number);
  981. builder.append('}');
  982. return Formatter<StringView>::format(format_builder, builder.to_string());
  983. }
  984. };
  985. }