Parser.cpp 39 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196
  1. /*
  2. * Copyright (c) 2021-2022, Matthew Olsson <mattco@serenityos.org>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <AK/BitStream.h>
  7. #include <AK/MemoryStream.h>
  8. #include <AK/ScopeGuard.h>
  9. #include <LibPDF/CommonNames.h>
  10. #include <LibPDF/Document.h>
  11. #include <LibPDF/Filter.h>
  12. #include <LibPDF/Parser.h>
  13. #include <LibTextCodec/Decoder.h>
  14. #include <ctype.h>
  15. namespace PDF {
  16. template<typename T, typename... Args>
  17. static NonnullRefPtr<T> make_object(Args... args) requires(IsBaseOf<Object, T>)
  18. {
  19. return adopt_ref(*new T(forward<Args>(args)...));
  20. }
  21. PDFErrorOr<Vector<Command>> Parser::parse_graphics_commands(ReadonlyBytes bytes)
  22. {
  23. auto parser = adopt_ref(*new Parser(bytes));
  24. return parser->parse_graphics_commands();
  25. }
  26. Parser::Parser(Badge<Document>, ReadonlyBytes bytes)
  27. : m_reader(bytes)
  28. {
  29. }
  30. Parser::Parser(ReadonlyBytes bytes)
  31. : m_reader(bytes)
  32. {
  33. }
  34. void Parser::set_document(WeakPtr<Document> const& document)
  35. {
  36. m_document = document;
  37. }
  38. PDFErrorOr<void> Parser::initialize()
  39. {
  40. TRY(parse_header());
  41. const auto linearization_result = TRY(initialize_linearization_dict());
  42. if (linearization_result == LinearizationResult::NotLinearized)
  43. return initialize_non_linearized_xref_table();
  44. bool is_linearized = m_linearization_dictionary.has_value();
  45. if (is_linearized) {
  46. // The file may have been linearized at one point, but could have been updated afterwards,
  47. // which means it is no longer a linearized PDF file.
  48. is_linearized = m_linearization_dictionary.value().length_of_file == m_reader.bytes().size();
  49. if (!is_linearized) {
  50. // FIXME: The file shouldn't be treated as linearized, yet the xref tables are still
  51. // split. This might take some tweaking to ensure correct behavior, which can be
  52. // implemented later.
  53. TODO();
  54. }
  55. }
  56. if (is_linearized)
  57. return initialize_linearized_xref_table();
  58. return initialize_non_linearized_xref_table();
  59. }
  60. PDFErrorOr<Value> Parser::parse_object_with_index(u32 index)
  61. {
  62. VERIFY(m_xref_table->has_object(index));
  63. auto byte_offset = m_xref_table->byte_offset_for_object(index);
  64. m_reader.move_to(byte_offset);
  65. auto indirect_value = TRY(parse_indirect_value());
  66. VERIFY(indirect_value->index() == index);
  67. return indirect_value->value();
  68. }
  69. PDFErrorOr<void> Parser::parse_header()
  70. {
  71. // FIXME: Do something with the version?
  72. m_reader.set_reading_forwards();
  73. if (m_reader.remaining() == 0)
  74. return error("Empty PDF document");
  75. m_reader.move_to(0);
  76. if (m_reader.remaining() < 8 || !m_reader.matches("%PDF-"))
  77. return error("Not a PDF document");
  78. m_reader.move_by(5);
  79. char major_ver = m_reader.read();
  80. if (major_ver != '1' && major_ver != '2')
  81. return error(String::formatted("Unknown major version \"{}\"", major_ver));
  82. if (m_reader.read() != '.')
  83. return error("Malformed PDF version");
  84. char minor_ver = m_reader.read();
  85. if (minor_ver < '0' || minor_ver > '7')
  86. return error(String::formatted("Unknown minor version \"{}\"", minor_ver));
  87. consume_eol();
  88. // Parse optional high-byte comment, which signifies a binary file
  89. // FIXME: Do something with this?
  90. auto comment = parse_comment();
  91. if (!comment.is_empty()) {
  92. auto binary = comment.length() >= 4;
  93. if (binary) {
  94. for (size_t i = 0; i < comment.length() && binary; i++)
  95. binary = static_cast<u8>(comment[i]) > 128;
  96. }
  97. }
  98. return {};
  99. }
  100. PDFErrorOr<Parser::LinearizationResult> Parser::initialize_linearization_dict()
  101. {
  102. // parse_header() is called immediately before this, so we are at the right location
  103. auto indirect_value = Value(*TRY(parse_indirect_value()));
  104. auto dict_value = TRY(m_document->resolve(indirect_value));
  105. if (!dict_value.has<NonnullRefPtr<Object>>())
  106. return error("Expected linearization object to be a dictionary");
  107. auto dict_object = dict_value.get<NonnullRefPtr<Object>>();
  108. if (!dict_object->is<DictObject>())
  109. return LinearizationResult::NotLinearized;
  110. auto dict = dict_object->cast<DictObject>();
  111. if (!dict->contains(CommonNames::Linearized))
  112. return LinearizationResult::NotLinearized;
  113. if (!dict->contains(CommonNames::L, CommonNames::H, CommonNames::O, CommonNames::E, CommonNames::N, CommonNames::T))
  114. return error("Malformed linearization dictionary");
  115. auto length_of_file = dict->get_value(CommonNames::L);
  116. auto hint_table = dict->get_value(CommonNames::H);
  117. auto first_page_object_number = dict->get_value(CommonNames::O);
  118. auto offset_of_first_page_end = dict->get_value(CommonNames::E);
  119. auto number_of_pages = dict->get_value(CommonNames::N);
  120. auto offset_of_main_xref_table = dict->get_value(CommonNames::T);
  121. auto first_page = dict->get(CommonNames::P).value_or({});
  122. // Validation
  123. if (!length_of_file.has_u32()
  124. || !hint_table.has<NonnullRefPtr<Object>>()
  125. || !first_page_object_number.has_u32()
  126. || !number_of_pages.has_u16()
  127. || !offset_of_main_xref_table.has_u32()
  128. || (!first_page.has<Empty>() && !first_page.has_u32())) {
  129. return error("Malformed linearization dictionary parameters");
  130. }
  131. auto hint_table_array = hint_table.get<NonnullRefPtr<Object>>()->cast<ArrayObject>();
  132. auto hint_table_size = hint_table_array->size();
  133. if (hint_table_size != 2 && hint_table_size != 4)
  134. return error("Expected hint table to be of length 2 or 4");
  135. auto primary_hint_stream_offset = hint_table_array->at(0);
  136. auto primary_hint_stream_length = hint_table_array->at(1);
  137. Value overflow_hint_stream_offset;
  138. Value overflow_hint_stream_length;
  139. if (hint_table_size == 4) {
  140. overflow_hint_stream_offset = hint_table_array->at(2);
  141. overflow_hint_stream_length = hint_table_array->at(3);
  142. }
  143. if (!primary_hint_stream_offset.has_u32()
  144. || !primary_hint_stream_length.has_u32()
  145. || (!overflow_hint_stream_offset.has<Empty>() && !overflow_hint_stream_offset.has_u32())
  146. || (!overflow_hint_stream_length.has<Empty>() && !overflow_hint_stream_length.has_u32())) {
  147. return error("Malformed hint stream");
  148. }
  149. m_linearization_dictionary = LinearizationDictionary {
  150. length_of_file.get_u32(),
  151. primary_hint_stream_offset.get_u32(),
  152. primary_hint_stream_length.get_u32(),
  153. overflow_hint_stream_offset.has<Empty>() ? NumericLimits<u32>::max() : overflow_hint_stream_offset.get_u32(),
  154. overflow_hint_stream_length.has<Empty>() ? NumericLimits<u32>::max() : overflow_hint_stream_length.get_u32(),
  155. first_page_object_number.get_u32(),
  156. offset_of_first_page_end.get_u32(),
  157. number_of_pages.get_u16(),
  158. offset_of_main_xref_table.get_u32(),
  159. first_page.has<Empty>() ? NumericLimits<u32>::max() : first_page.get_u32(),
  160. };
  161. return LinearizationResult::Linearized;
  162. }
  163. PDFErrorOr<void> Parser::initialize_linearized_xref_table()
  164. {
  165. // The linearization parameter dictionary has just been parsed, and the xref table
  166. // comes immediately after it. We are in the correct spot.
  167. m_xref_table = TRY(parse_xref_table());
  168. m_trailer = TRY(parse_file_trailer());
  169. // Also parse the main xref table and merge into the first-page xref table. Note
  170. // that we don't use the main xref table offset from the linearization dict because
  171. // for some reason, it specified the offset of the whitespace after the object
  172. // index start and length? So it's much easier to do it this way.
  173. auto main_xref_table_offset = m_trailer->get_value(CommonNames::Prev).to_int();
  174. m_reader.move_to(main_xref_table_offset);
  175. auto main_xref_table = TRY(parse_xref_table());
  176. TRY(m_xref_table->merge(move(*main_xref_table)));
  177. return {};
  178. }
  179. PDFErrorOr<void> Parser::initialize_hint_tables()
  180. {
  181. auto linearization_dict = m_linearization_dictionary.value();
  182. auto primary_offset = linearization_dict.primary_hint_stream_offset;
  183. auto overflow_offset = linearization_dict.overflow_hint_stream_offset;
  184. auto parse_hint_table = [&](size_t offset) -> RefPtr<StreamObject> {
  185. m_reader.move_to(offset);
  186. auto stream_indirect_value = parse_indirect_value();
  187. if (stream_indirect_value.is_error())
  188. return {};
  189. auto stream_value = stream_indirect_value.value()->value();
  190. if (!stream_value.has<NonnullRefPtr<Object>>())
  191. return {};
  192. auto stream_object = stream_value.get<NonnullRefPtr<Object>>();
  193. if (!stream_object->is<StreamObject>())
  194. return {};
  195. return stream_object->cast<StreamObject>();
  196. };
  197. auto primary_hint_stream = parse_hint_table(primary_offset);
  198. if (!primary_hint_stream)
  199. return error("Invalid primary hint stream");
  200. RefPtr<StreamObject> overflow_hint_stream;
  201. if (overflow_offset != NumericLimits<u32>::max())
  202. overflow_hint_stream = parse_hint_table(overflow_offset);
  203. ByteBuffer possible_merged_stream_buffer;
  204. ReadonlyBytes hint_stream_bytes;
  205. if (overflow_hint_stream) {
  206. auto primary_size = primary_hint_stream->bytes().size();
  207. auto overflow_size = overflow_hint_stream->bytes().size();
  208. auto total_size = primary_size + overflow_size;
  209. auto buffer_result = ByteBuffer::create_uninitialized(total_size);
  210. if (buffer_result.is_error())
  211. return Error { Error::Type::Internal, "Failed to allocate hint stream buffer" };
  212. possible_merged_stream_buffer = buffer_result.release_value();
  213. MUST(possible_merged_stream_buffer.try_append(primary_hint_stream->bytes()));
  214. MUST(possible_merged_stream_buffer.try_append(overflow_hint_stream->bytes()));
  215. hint_stream_bytes = possible_merged_stream_buffer.bytes();
  216. } else {
  217. hint_stream_bytes = primary_hint_stream->bytes();
  218. }
  219. auto hint_table = TRY(parse_page_offset_hint_table(hint_stream_bytes));
  220. auto hint_table_entries = parse_all_page_offset_hint_table_entries(hint_table, hint_stream_bytes);
  221. // FIXME: Do something with the hint tables
  222. return {};
  223. }
  224. PDFErrorOr<void> Parser::initialize_non_linearized_xref_table()
  225. {
  226. m_reader.move_to(m_reader.bytes().size() - 1);
  227. if (!navigate_to_before_eof_marker())
  228. return error("No EOF marker");
  229. if (!navigate_to_after_startxref())
  230. return error("No xref");
  231. m_reader.set_reading_forwards();
  232. auto xref_offset_value = parse_number();
  233. if (xref_offset_value.is_error() || !xref_offset_value.value().has<int>())
  234. return error("Invalid xref offset");
  235. auto xref_offset = xref_offset_value.value().get<int>();
  236. m_reader.move_to(xref_offset);
  237. m_xref_table = TRY(parse_xref_table());
  238. m_trailer = TRY(parse_file_trailer());
  239. return {};
  240. }
  241. PDFErrorOr<NonnullRefPtr<XRefTable>> Parser::parse_xref_table()
  242. {
  243. if (!m_reader.matches("xref"))
  244. return error("Expected \"xref\"");
  245. m_reader.move_by(4);
  246. if (!consume_eol())
  247. return error("Expected newline after \"xref\"");
  248. auto table = adopt_ref(*new XRefTable());
  249. do {
  250. if (m_reader.matches("trailer"))
  251. return table;
  252. Vector<XRefEntry> entries;
  253. auto starting_index_value = TRY(parse_number());
  254. auto starting_index = starting_index_value.get<int>();
  255. auto object_count_value = TRY(parse_number());
  256. auto object_count = object_count_value.get<int>();
  257. for (int i = 0; i < object_count; i++) {
  258. auto offset_string = String(m_reader.bytes().slice(m_reader.offset(), 10));
  259. m_reader.move_by(10);
  260. if (!consume(' '))
  261. return error("Malformed xref entry");
  262. auto generation_string = String(m_reader.bytes().slice(m_reader.offset(), 5));
  263. m_reader.move_by(5);
  264. if (!consume(' '))
  265. return error("Malformed xref entry");
  266. auto letter = m_reader.read();
  267. if (letter != 'n' && letter != 'f')
  268. return error("Malformed xref entry");
  269. // The line ending sequence can be one of the following:
  270. // SP CR, SP LF, or CR LF
  271. if (m_reader.matches(' ')) {
  272. consume();
  273. auto ch = consume();
  274. if (ch != '\r' && ch != '\n')
  275. return error("Malformed xref entry");
  276. } else {
  277. if (!m_reader.matches("\r\n"))
  278. return error("Malformed xref entry");
  279. m_reader.move_by(2);
  280. }
  281. auto offset = strtol(offset_string.characters(), nullptr, 10);
  282. auto generation = strtol(generation_string.characters(), nullptr, 10);
  283. entries.append({ offset, static_cast<u16>(generation), letter == 'n' });
  284. }
  285. table->add_section({ starting_index, object_count, entries });
  286. } while (matches_number());
  287. return table;
  288. }
  289. PDFErrorOr<NonnullRefPtr<DictObject>> Parser::parse_file_trailer()
  290. {
  291. while (matches_eol())
  292. consume_eol();
  293. if (!m_reader.matches("trailer"))
  294. return error("Expected \"trailer\" keyword");
  295. m_reader.move_by(7);
  296. consume_whitespace();
  297. auto dict = TRY(parse_dict());
  298. if (!m_reader.matches("startxref"))
  299. return error("Expected \"startxref\"");
  300. m_reader.move_by(9);
  301. consume_whitespace();
  302. m_reader.move_until([&](auto) { return matches_eol(); });
  303. VERIFY(consume_eol());
  304. if (!m_reader.matches("%%EOF"))
  305. return error("Expected \"%%EOF\"");
  306. m_reader.move_by(5);
  307. consume_whitespace();
  308. return dict;
  309. }
  310. PDFErrorOr<Parser::PageOffsetHintTable> Parser::parse_page_offset_hint_table(ReadonlyBytes hint_stream_bytes)
  311. {
  312. if (hint_stream_bytes.size() < sizeof(PageOffsetHintTable))
  313. return error("Hint stream is too small");
  314. size_t offset = 0;
  315. auto read_u32 = [&] {
  316. u32 data = reinterpret_cast<const u32*>(hint_stream_bytes.data() + offset)[0];
  317. offset += 4;
  318. return AK::convert_between_host_and_big_endian(data);
  319. };
  320. auto read_u16 = [&] {
  321. u16 data = reinterpret_cast<const u16*>(hint_stream_bytes.data() + offset)[0];
  322. offset += 2;
  323. return AK::convert_between_host_and_big_endian(data);
  324. };
  325. PageOffsetHintTable hint_table {
  326. read_u32(),
  327. read_u32(),
  328. read_u16(),
  329. read_u32(),
  330. read_u16(),
  331. read_u32(),
  332. read_u16(),
  333. read_u32(),
  334. read_u16(),
  335. read_u16(),
  336. read_u16(),
  337. read_u16(),
  338. read_u16(),
  339. };
  340. // Verify that all of the bits_required_for_xyz fields are <= 32, since all of the numeric
  341. // fields in PageOffsetHintTableEntry are u32
  342. VERIFY(hint_table.bits_required_for_object_number <= 32);
  343. VERIFY(hint_table.bits_required_for_page_length <= 32);
  344. VERIFY(hint_table.bits_required_for_content_stream_offsets <= 32);
  345. VERIFY(hint_table.bits_required_for_content_stream_length <= 32);
  346. VERIFY(hint_table.bits_required_for_number_of_shared_obj_refs <= 32);
  347. VERIFY(hint_table.bits_required_for_greatest_shared_obj_identifier <= 32);
  348. VERIFY(hint_table.bits_required_for_fraction_numerator <= 32);
  349. return hint_table;
  350. }
  351. Vector<Parser::PageOffsetHintTableEntry> Parser::parse_all_page_offset_hint_table_entries(PageOffsetHintTable const& hint_table, ReadonlyBytes hint_stream_bytes)
  352. {
  353. InputMemoryStream input_stream(hint_stream_bytes);
  354. input_stream.seek(sizeof(PageOffsetHintTable));
  355. InputBitStream bit_stream(input_stream);
  356. auto number_of_pages = m_linearization_dictionary.value().number_of_pages;
  357. Vector<PageOffsetHintTableEntry> entries;
  358. for (size_t i = 0; i < number_of_pages; i++)
  359. entries.append(PageOffsetHintTableEntry {});
  360. auto bits_required_for_object_number = hint_table.bits_required_for_object_number;
  361. auto bits_required_for_page_length = hint_table.bits_required_for_page_length;
  362. auto bits_required_for_content_stream_offsets = hint_table.bits_required_for_content_stream_offsets;
  363. auto bits_required_for_content_stream_length = hint_table.bits_required_for_content_stream_length;
  364. auto bits_required_for_number_of_shared_obj_refs = hint_table.bits_required_for_number_of_shared_obj_refs;
  365. auto bits_required_for_greatest_shared_obj_identifier = hint_table.bits_required_for_greatest_shared_obj_identifier;
  366. auto bits_required_for_fraction_numerator = hint_table.bits_required_for_fraction_numerator;
  367. auto parse_int_entry = [&](u32 PageOffsetHintTableEntry::*field, u32 bit_size) {
  368. if (bit_size <= 0)
  369. return;
  370. for (int i = 0; i < number_of_pages; i++) {
  371. auto& entry = entries[i];
  372. entry.*field = bit_stream.read_bits(bit_size);
  373. }
  374. };
  375. auto parse_vector_entry = [&](Vector<u32> PageOffsetHintTableEntry::*field, u32 bit_size) {
  376. if (bit_size <= 0)
  377. return;
  378. for (int page = 1; page < number_of_pages; page++) {
  379. auto number_of_shared_objects = entries[page].number_of_shared_objects;
  380. Vector<u32> items;
  381. items.ensure_capacity(number_of_shared_objects);
  382. for (size_t i = 0; i < number_of_shared_objects; i++)
  383. items.unchecked_append(bit_stream.read_bits(bit_size));
  384. entries[page].*field = move(items);
  385. }
  386. };
  387. parse_int_entry(&PageOffsetHintTableEntry::objects_in_page_number, bits_required_for_object_number);
  388. parse_int_entry(&PageOffsetHintTableEntry::page_length_number, bits_required_for_page_length);
  389. parse_int_entry(&PageOffsetHintTableEntry::number_of_shared_objects, bits_required_for_number_of_shared_obj_refs);
  390. parse_vector_entry(&PageOffsetHintTableEntry::shared_object_identifiers, bits_required_for_greatest_shared_obj_identifier);
  391. parse_vector_entry(&PageOffsetHintTableEntry::shared_object_location_numerators, bits_required_for_fraction_numerator);
  392. parse_int_entry(&PageOffsetHintTableEntry::page_content_stream_offset_number, bits_required_for_content_stream_offsets);
  393. parse_int_entry(&PageOffsetHintTableEntry::page_content_stream_length_number, bits_required_for_content_stream_length);
  394. return entries;
  395. }
  396. bool Parser::navigate_to_before_eof_marker()
  397. {
  398. m_reader.set_reading_backwards();
  399. while (!m_reader.done()) {
  400. m_reader.move_until([&](auto) { return matches_eol(); });
  401. if (m_reader.done())
  402. return false;
  403. consume_eol();
  404. if (!m_reader.matches("%%EOF"))
  405. continue;
  406. m_reader.move_by(5);
  407. if (!matches_eol())
  408. continue;
  409. consume_eol();
  410. return true;
  411. }
  412. return false;
  413. }
  414. bool Parser::navigate_to_after_startxref()
  415. {
  416. m_reader.set_reading_backwards();
  417. while (!m_reader.done()) {
  418. m_reader.move_until([&](auto) { return matches_eol(); });
  419. auto offset = m_reader.offset() + 1;
  420. consume_eol();
  421. if (!m_reader.matches("startxref"))
  422. continue;
  423. m_reader.move_by(9);
  424. if (!matches_eol())
  425. continue;
  426. m_reader.move_to(offset);
  427. return true;
  428. }
  429. return false;
  430. }
  431. String Parser::parse_comment()
  432. {
  433. if (!m_reader.matches('%'))
  434. return {};
  435. consume();
  436. auto comment_start_offset = m_reader.offset();
  437. m_reader.move_until([&](auto) {
  438. return matches_eol();
  439. });
  440. String str = StringView(m_reader.bytes().slice(comment_start_offset, m_reader.offset() - comment_start_offset));
  441. consume_eol();
  442. consume_whitespace();
  443. return str;
  444. }
  445. PDFErrorOr<Value> Parser::parse_value()
  446. {
  447. parse_comment();
  448. if (m_reader.matches("null")) {
  449. m_reader.move_by(4);
  450. consume_whitespace();
  451. return Value(nullptr);
  452. }
  453. if (m_reader.matches("true")) {
  454. m_reader.move_by(4);
  455. consume_whitespace();
  456. return Value(true);
  457. }
  458. if (m_reader.matches("false")) {
  459. m_reader.move_by(5);
  460. consume_whitespace();
  461. return Value(false);
  462. }
  463. if (matches_number())
  464. return parse_possible_indirect_value_or_ref();
  465. if (m_reader.matches('/'))
  466. return MUST(parse_name());
  467. if (m_reader.matches("<<")) {
  468. auto dict = TRY(parse_dict());
  469. if (m_reader.matches("stream"))
  470. return TRY(parse_stream(dict));
  471. return dict;
  472. }
  473. if (m_reader.matches_any('(', '<'))
  474. return parse_string();
  475. if (m_reader.matches('['))
  476. return TRY(parse_array());
  477. return error(String::formatted("Unexpected char \"{}\"", m_reader.peek()));
  478. }
  479. PDFErrorOr<Value> Parser::parse_possible_indirect_value_or_ref()
  480. {
  481. auto first_number = TRY(parse_number());
  482. if (!matches_number())
  483. return first_number;
  484. m_reader.save();
  485. auto second_number = parse_number();
  486. if (second_number.is_error()) {
  487. m_reader.load();
  488. return first_number;
  489. }
  490. if (m_reader.matches('R')) {
  491. m_reader.discard();
  492. consume();
  493. consume_whitespace();
  494. return Value(Reference(first_number.get<int>(), second_number.value().get<int>()));
  495. }
  496. if (m_reader.matches("obj")) {
  497. m_reader.discard();
  498. return TRY(parse_indirect_value(first_number.get<int>(), second_number.value().get<int>()));
  499. }
  500. m_reader.load();
  501. return first_number;
  502. }
  503. PDFErrorOr<NonnullRefPtr<IndirectValue>> Parser::parse_indirect_value(int index, int generation)
  504. {
  505. if (!m_reader.matches("obj"))
  506. return error("Expected \"obj\" at beginning of indirect value");
  507. m_reader.move_by(3);
  508. if (matches_eol())
  509. consume_eol();
  510. auto value = TRY(parse_value());
  511. if (!m_reader.matches("endobj"))
  512. return error("Expected \"endobj\" at end of indirect value");
  513. consume(6);
  514. consume_whitespace();
  515. return make_object<IndirectValue>(index, generation, value);
  516. }
  517. PDFErrorOr<NonnullRefPtr<IndirectValue>> Parser::parse_indirect_value()
  518. {
  519. auto first_number = TRY(parse_number());
  520. auto second_number = TRY(parse_number());
  521. return parse_indirect_value(first_number.get<int>(), second_number.get<int>());
  522. }
  523. PDFErrorOr<Value> Parser::parse_number()
  524. {
  525. size_t start_offset = m_reader.offset();
  526. bool is_float = false;
  527. bool consumed_digit = false;
  528. if (m_reader.matches('+') || m_reader.matches('-'))
  529. consume();
  530. while (!m_reader.done()) {
  531. if (m_reader.matches('.')) {
  532. if (is_float)
  533. break;
  534. is_float = true;
  535. consume();
  536. } else if (isdigit(m_reader.peek())) {
  537. consume();
  538. consumed_digit = true;
  539. } else {
  540. break;
  541. }
  542. }
  543. if (!consumed_digit)
  544. return error("Invalid number");
  545. consume_whitespace();
  546. auto string = String(m_reader.bytes().slice(start_offset, m_reader.offset() - start_offset));
  547. float f = strtof(string.characters(), nullptr);
  548. if (is_float)
  549. return Value(f);
  550. VERIFY(floorf(f) == f);
  551. return Value(static_cast<int>(f));
  552. }
  553. PDFErrorOr<NonnullRefPtr<NameObject>> Parser::parse_name()
  554. {
  555. if (!consume('/'))
  556. return error("Expected Name object to start with \"/\"");
  557. StringBuilder builder;
  558. while (true) {
  559. if (!matches_regular_character())
  560. break;
  561. if (m_reader.matches('#')) {
  562. int hex_value = 0;
  563. for (int i = 0; i < 2; i++) {
  564. auto ch = consume();
  565. VERIFY(isxdigit(ch));
  566. hex_value *= 16;
  567. if (ch <= '9') {
  568. hex_value += ch - '0';
  569. } else {
  570. hex_value += ch - 'A' + 10;
  571. }
  572. }
  573. builder.append(static_cast<char>(hex_value));
  574. continue;
  575. }
  576. builder.append(consume());
  577. }
  578. consume_whitespace();
  579. return make_object<NameObject>(builder.to_string());
  580. }
  581. NonnullRefPtr<StringObject> Parser::parse_string()
  582. {
  583. ScopeGuard guard([&] { consume_whitespace(); });
  584. String string;
  585. bool is_binary_string;
  586. if (m_reader.matches('(')) {
  587. string = parse_literal_string();
  588. is_binary_string = false;
  589. } else {
  590. string = parse_hex_string();
  591. is_binary_string = true;
  592. }
  593. VERIFY(!string.is_null());
  594. if (string.bytes().starts_with(Array<u8, 2> { 0xfe, 0xff })) {
  595. // The string is encoded in UTF16-BE
  596. string = TextCodec::decoder_for("utf-16be")->to_utf8(string);
  597. } else if (string.bytes().starts_with(Array<u8, 3> { 239, 187, 191 })) {
  598. // The string is encoded in UTF-8. This is the default anyways, but if these bytes
  599. // are explicitly included, we have to trim them
  600. string = string.substring(3);
  601. }
  602. return make_object<StringObject>(string, is_binary_string);
  603. }
  604. String Parser::parse_literal_string()
  605. {
  606. VERIFY(consume('('));
  607. StringBuilder builder;
  608. auto opened_parens = 0;
  609. while (true) {
  610. if (m_reader.matches('(')) {
  611. opened_parens++;
  612. builder.append(consume());
  613. } else if (m_reader.matches(')')) {
  614. consume();
  615. if (opened_parens == 0)
  616. break;
  617. opened_parens--;
  618. builder.append(')');
  619. } else if (m_reader.matches('\\')) {
  620. consume();
  621. if (matches_eol()) {
  622. consume_eol();
  623. continue;
  624. }
  625. if (m_reader.done())
  626. return {};
  627. auto ch = consume();
  628. switch (ch) {
  629. case 'n':
  630. builder.append('\n');
  631. break;
  632. case 'r':
  633. builder.append('\r');
  634. break;
  635. case 't':
  636. builder.append('\t');
  637. break;
  638. case 'b':
  639. builder.append('\b');
  640. break;
  641. case 'f':
  642. builder.append('\f');
  643. break;
  644. case '(':
  645. builder.append('(');
  646. break;
  647. case ')':
  648. builder.append(')');
  649. break;
  650. case '\\':
  651. builder.append('\\');
  652. break;
  653. default: {
  654. if (ch >= '0' && ch <= '7') {
  655. int octal_value = ch - '0';
  656. for (int i = 0; i < 2; i++) {
  657. auto octal_ch = consume();
  658. if (octal_ch < '0' || octal_ch > '7')
  659. break;
  660. octal_value = octal_value * 8 + (octal_ch - '0');
  661. }
  662. builder.append(static_cast<char>(octal_value));
  663. } else {
  664. builder.append(ch);
  665. }
  666. }
  667. }
  668. } else if (matches_eol()) {
  669. consume_eol();
  670. builder.append('\n');
  671. } else {
  672. builder.append(consume());
  673. }
  674. }
  675. return builder.to_string();
  676. }
  677. String Parser::parse_hex_string()
  678. {
  679. VERIFY(consume('<'));
  680. StringBuilder builder;
  681. while (true) {
  682. if (m_reader.matches('>')) {
  683. consume();
  684. return builder.to_string();
  685. } else {
  686. int hex_value = 0;
  687. for (int i = 0; i < 2; i++) {
  688. auto ch = consume();
  689. if (ch == '>') {
  690. // The hex string contains an odd number of characters, and the last character
  691. // is assumed to be '0'
  692. consume();
  693. hex_value *= 16;
  694. builder.append(static_cast<char>(hex_value));
  695. return builder.to_string();
  696. }
  697. VERIFY(isxdigit(ch));
  698. hex_value *= 16;
  699. if (ch <= '9') {
  700. hex_value += ch - '0';
  701. } else if (ch >= 'A' && ch <= 'F') {
  702. hex_value += ch - 'A' + 10;
  703. } else {
  704. hex_value += ch - 'a' + 10;
  705. }
  706. }
  707. builder.append(static_cast<char>(hex_value));
  708. }
  709. }
  710. }
  711. PDFErrorOr<NonnullRefPtr<ArrayObject>> Parser::parse_array()
  712. {
  713. if (!consume('['))
  714. return error("Expected array to start with \"[\"");
  715. consume_whitespace();
  716. Vector<Value> values;
  717. while (!m_reader.matches(']'))
  718. values.append(TRY(parse_value()));
  719. VERIFY(consume(']'));
  720. consume_whitespace();
  721. return make_object<ArrayObject>(values);
  722. }
  723. PDFErrorOr<NonnullRefPtr<DictObject>> Parser::parse_dict()
  724. {
  725. if (!consume('<') || !consume('<'))
  726. return error("Expected dict to start with \"<<\"");
  727. consume_whitespace();
  728. HashMap<FlyString, Value> map;
  729. while (!m_reader.done()) {
  730. if (m_reader.matches(">>"))
  731. break;
  732. auto name = TRY(parse_name())->name();
  733. auto value = TRY(parse_value());
  734. map.set(name, value);
  735. }
  736. if (!consume('>') || !consume('>'))
  737. return error("Expected dict to end with \">>\"");
  738. consume_whitespace();
  739. return make_object<DictObject>(map);
  740. }
  741. PDFErrorOr<RefPtr<DictObject>> Parser::conditionally_parse_page_tree_node(u32 object_index)
  742. {
  743. VERIFY(m_xref_table->has_object(object_index));
  744. auto byte_offset = m_xref_table->byte_offset_for_object(object_index);
  745. m_reader.move_to(byte_offset);
  746. TRY(parse_number());
  747. TRY(parse_number());
  748. if (!m_reader.matches("obj"))
  749. return error(String::formatted("Invalid page tree offset {}", object_index));
  750. m_reader.move_by(3);
  751. consume_whitespace();
  752. VERIFY(consume('<') && consume('<'));
  753. consume_whitespace();
  754. HashMap<FlyString, Value> map;
  755. while (true) {
  756. if (m_reader.matches(">>"))
  757. break;
  758. auto name = TRY(parse_name());
  759. auto name_string = name->name();
  760. if (!name_string.is_one_of(CommonNames::Type, CommonNames::Parent, CommonNames::Kids, CommonNames::Count)) {
  761. // This is a page, not a page tree node
  762. return RefPtr<DictObject> {};
  763. }
  764. auto value = TRY(parse_value());
  765. if (name_string == CommonNames::Type) {
  766. if (!value.has<NonnullRefPtr<Object>>())
  767. return RefPtr<DictObject> {};
  768. auto type_object = value.get<NonnullRefPtr<Object>>();
  769. if (!type_object->is<NameObject>())
  770. return RefPtr<DictObject> {};
  771. auto type_name = type_object->cast<NameObject>();
  772. if (type_name->name() != CommonNames::Pages)
  773. return RefPtr<DictObject> {};
  774. }
  775. map.set(name->name(), value);
  776. }
  777. VERIFY(consume('>') && consume('>'));
  778. consume_whitespace();
  779. return make_object<DictObject>(map);
  780. }
  781. PDFErrorOr<NonnullRefPtr<StreamObject>> Parser::parse_stream(NonnullRefPtr<DictObject> dict)
  782. {
  783. if (!m_reader.matches("stream"))
  784. return error("Expected stream to start with \"stream\"");
  785. m_reader.move_by(6);
  786. if (!consume_eol())
  787. return error("Expected \"stream\" to be followed by a newline");
  788. ReadonlyBytes bytes;
  789. auto maybe_length = dict->get(CommonNames::Length);
  790. if (maybe_length.has_value() && (!maybe_length->has<Reference>() || m_xref_table)) {
  791. // The PDF writer has kindly provided us with the direct length of the stream
  792. m_reader.save();
  793. auto length = TRY(m_document->resolve_to<int>(maybe_length.value()));
  794. m_reader.load();
  795. bytes = m_reader.bytes().slice(m_reader.offset(), length);
  796. m_reader.move_by(length);
  797. consume_whitespace();
  798. } else {
  799. // We have to look for the endstream keyword
  800. auto stream_start = m_reader.offset();
  801. while (true) {
  802. m_reader.move_until([&](auto) { return matches_eol(); });
  803. auto potential_stream_end = m_reader.offset();
  804. consume_eol();
  805. if (!m_reader.matches("endstream"))
  806. continue;
  807. bytes = m_reader.bytes().slice(stream_start, potential_stream_end - stream_start);
  808. break;
  809. }
  810. }
  811. m_reader.move_by(9);
  812. consume_whitespace();
  813. if (dict->contains(CommonNames::Filter)) {
  814. auto filter_type = MUST(dict->get_name(m_document, CommonNames::Filter))->name();
  815. auto maybe_bytes = Filter::decode(bytes, filter_type);
  816. if (maybe_bytes.is_error()) {
  817. warnln("Failed to decode filter: {}", maybe_bytes.error().string_literal());
  818. return error(String::formatted("Failed to decode filter {}", maybe_bytes.error().string_literal()));
  819. }
  820. return make_object<StreamObject>(dict, maybe_bytes.value());
  821. }
  822. return make_object<StreamObject>(dict, MUST(ByteBuffer::copy(bytes)));
  823. }
  824. PDFErrorOr<Vector<Command>> Parser::parse_graphics_commands()
  825. {
  826. Vector<Command> commands;
  827. Vector<Value> command_args;
  828. constexpr static auto is_command_char = [](char ch) {
  829. return isalpha(ch) || ch == '*' || ch == '\'';
  830. };
  831. while (!m_reader.done()) {
  832. auto ch = m_reader.peek();
  833. if (is_command_char(ch)) {
  834. auto command_start = m_reader.offset();
  835. while (is_command_char(ch)) {
  836. consume();
  837. if (m_reader.done())
  838. break;
  839. ch = m_reader.peek();
  840. }
  841. auto command_string = StringView(m_reader.bytes().slice(command_start, m_reader.offset() - command_start));
  842. auto command_type = Command::command_type_from_symbol(command_string);
  843. commands.append(Command(command_type, move(command_args)));
  844. command_args = Vector<Value>();
  845. consume_whitespace();
  846. continue;
  847. }
  848. command_args.append(TRY(parse_value()));
  849. }
  850. return commands;
  851. }
  852. bool Parser::matches_eol() const
  853. {
  854. return m_reader.matches_any(0xa, 0xd);
  855. }
  856. bool Parser::matches_whitespace() const
  857. {
  858. return matches_eol() || m_reader.matches_any(0, 0x9, 0xc, ' ');
  859. }
  860. bool Parser::matches_number() const
  861. {
  862. if (m_reader.done())
  863. return false;
  864. auto ch = m_reader.peek();
  865. return isdigit(ch) || ch == '-' || ch == '+';
  866. }
  867. bool Parser::matches_delimiter() const
  868. {
  869. return m_reader.matches_any('(', ')', '<', '>', '[', ']', '{', '}', '/', '%');
  870. }
  871. bool Parser::matches_regular_character() const
  872. {
  873. return !matches_delimiter() && !matches_whitespace();
  874. }
  875. bool Parser::consume_eol()
  876. {
  877. if (m_reader.done()) {
  878. return false;
  879. }
  880. if (m_reader.matches("\r\n")) {
  881. consume(2);
  882. return true;
  883. }
  884. auto consumed = consume();
  885. return consumed == 0xd || consumed == 0xa;
  886. }
  887. bool Parser::consume_whitespace()
  888. {
  889. bool consumed = false;
  890. while (matches_whitespace()) {
  891. consumed = true;
  892. consume();
  893. }
  894. return consumed;
  895. }
  896. char Parser::consume()
  897. {
  898. return m_reader.read();
  899. }
  900. void Parser::consume(int amount)
  901. {
  902. for (size_t i = 0; i < static_cast<size_t>(amount); i++)
  903. consume();
  904. }
  905. bool Parser::consume(char ch)
  906. {
  907. return consume() == ch;
  908. }
  909. Error Parser::error(
  910. String const& message
  911. #ifdef PDF_DEBUG
  912. ,
  913. SourceLocation loc
  914. #endif
  915. ) const
  916. {
  917. #ifdef PDF_DEBUG
  918. dbgln("\033[31m{} Parser error at offset {}: {}\033[0m", loc, m_reader.offset(), message);
  919. #endif
  920. return Error { Error::Type::Parse, message };
  921. }
  922. }
  923. namespace AK {
  924. template<>
  925. struct Formatter<PDF::Parser::LinearizationDictionary> : Formatter<StringView> {
  926. ErrorOr<void> format(FormatBuilder& format_builder, PDF::Parser::LinearizationDictionary const& dict)
  927. {
  928. StringBuilder builder;
  929. builder.append("{\n");
  930. builder.appendff(" length_of_file={}\n", dict.length_of_file);
  931. builder.appendff(" primary_hint_stream_offset={}\n", dict.primary_hint_stream_offset);
  932. builder.appendff(" primary_hint_stream_length={}\n", dict.primary_hint_stream_length);
  933. builder.appendff(" overflow_hint_stream_offset={}\n", dict.overflow_hint_stream_offset);
  934. builder.appendff(" overflow_hint_stream_length={}\n", dict.overflow_hint_stream_length);
  935. builder.appendff(" first_page_object_number={}\n", dict.first_page_object_number);
  936. builder.appendff(" offset_of_first_page_end={}\n", dict.offset_of_first_page_end);
  937. builder.appendff(" number_of_pages={}\n", dict.number_of_pages);
  938. builder.appendff(" offset_of_main_xref_table={}\n", dict.offset_of_main_xref_table);
  939. builder.appendff(" first_page={}\n", dict.first_page);
  940. builder.append('}');
  941. return Formatter<StringView>::format(format_builder, builder.to_string());
  942. }
  943. };
  944. template<>
  945. struct Formatter<PDF::Parser::PageOffsetHintTable> : Formatter<StringView> {
  946. ErrorOr<void> format(FormatBuilder& format_builder, PDF::Parser::PageOffsetHintTable const& table)
  947. {
  948. StringBuilder builder;
  949. builder.append("{\n");
  950. builder.appendff(" least_number_of_objects_in_a_page={}\n", table.least_number_of_objects_in_a_page);
  951. builder.appendff(" location_of_first_page_object={}\n", table.location_of_first_page_object);
  952. builder.appendff(" bits_required_for_object_number={}\n", table.bits_required_for_object_number);
  953. builder.appendff(" least_length_of_a_page={}\n", table.least_length_of_a_page);
  954. builder.appendff(" bits_required_for_page_length={}\n", table.bits_required_for_page_length);
  955. builder.appendff(" least_offset_of_any_content_stream={}\n", table.least_offset_of_any_content_stream);
  956. builder.appendff(" bits_required_for_content_stream_offsets={}\n", table.bits_required_for_content_stream_offsets);
  957. builder.appendff(" least_content_stream_length={}\n", table.least_content_stream_length);
  958. builder.appendff(" bits_required_for_content_stream_length={}\n", table.bits_required_for_content_stream_length);
  959. builder.appendff(" bits_required_for_number_of_shared_obj_refs={}\n", table.bits_required_for_number_of_shared_obj_refs);
  960. builder.appendff(" bits_required_for_greatest_shared_obj_identifier={}\n", table.bits_required_for_greatest_shared_obj_identifier);
  961. builder.appendff(" bits_required_for_fraction_numerator={}\n", table.bits_required_for_fraction_numerator);
  962. builder.appendff(" shared_object_reference_fraction_denominator={}\n", table.shared_object_reference_fraction_denominator);
  963. builder.append('}');
  964. return Formatter<StringView>::format(format_builder, builder.to_string());
  965. }
  966. };
  967. template<>
  968. struct Formatter<PDF::Parser::PageOffsetHintTableEntry> : Formatter<StringView> {
  969. ErrorOr<void> format(FormatBuilder& format_builder, PDF::Parser::PageOffsetHintTableEntry const& entry)
  970. {
  971. StringBuilder builder;
  972. builder.append("{\n");
  973. builder.appendff(" objects_in_page_number={}\n", entry.objects_in_page_number);
  974. builder.appendff(" page_length_number={}\n", entry.page_length_number);
  975. builder.appendff(" number_of_shared_objects={}\n", entry.number_of_shared_objects);
  976. builder.append(" shared_object_identifiers=[");
  977. for (auto& identifier : entry.shared_object_identifiers)
  978. builder.appendff(" {}", identifier);
  979. builder.append(" ]\n");
  980. builder.append(" shared_object_location_numerators=[");
  981. for (auto& numerator : entry.shared_object_location_numerators)
  982. builder.appendff(" {}", numerator);
  983. builder.append(" ]\n");
  984. builder.appendff(" page_content_stream_offset_number={}\n", entry.page_content_stream_offset_number);
  985. builder.appendff(" page_content_stream_length_number={}\n", entry.page_content_stream_length_number);
  986. builder.append('}');
  987. return Formatter<StringView>::format(format_builder, builder.to_string());
  988. }
  989. };
  990. }