Parser.cpp 39 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253
  1. /*
  2. * Copyright (c) 2021, Matthew Olsson <mattco@serenityos.org>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <AK/BitStream.h>
  7. #include <AK/MemoryStream.h>
  8. #include <AK/ScopeGuard.h>
  9. #include <AK/TypeCasts.h>
  10. #include <LibPDF/CommonNames.h>
  11. #include <LibPDF/Document.h>
  12. #include <LibPDF/Filter.h>
  13. #include <LibPDF/Parser.h>
  14. #include <LibTextCodec/Decoder.h>
  15. #include <ctype.h>
  16. namespace PDF {
  17. template<typename T, typename... Args>
  18. static NonnullRefPtr<T> make_object(Args... args) requires(IsBaseOf<Object, T>)
  19. {
  20. return adopt_ref(*new T(forward<Args>(args)...));
  21. }
  22. Vector<Command> Parser::parse_graphics_commands(ReadonlyBytes const& bytes)
  23. {
  24. auto parser = adopt_ref(*new Parser(bytes));
  25. return parser->parse_graphics_commands();
  26. }
  27. Parser::Parser(Badge<Document>, ReadonlyBytes const& bytes)
  28. : m_reader(bytes)
  29. {
  30. }
  31. Parser::Parser(ReadonlyBytes const& bytes)
  32. : m_reader(bytes)
  33. {
  34. }
  35. void Parser::set_document(RefPtr<Document> const& document)
  36. {
  37. m_document = document;
  38. }
  39. bool Parser::initialize()
  40. {
  41. if (!parse_header())
  42. return {};
  43. const auto result = initialize_linearization_dict();
  44. if (result == LinearizationResult::Error)
  45. return {};
  46. if (result == LinearizationResult::NotLinearized)
  47. return initialize_non_linearized_xref_table();
  48. bool is_linearized = m_linearization_dictionary.has_value();
  49. if (is_linearized) {
  50. // The file may have been linearized at one point, but could have been updated afterwards,
  51. // which means it is no longer a linearized PDF file.
  52. is_linearized = m_linearization_dictionary.value().length_of_file == m_reader.bytes().size();
  53. if (!is_linearized) {
  54. // FIXME: The file shouldn't be treated as linearized, yet the xref tables are still
  55. // split. This might take some tweaking to ensure correct behavior, which can be
  56. // implemented later.
  57. TODO();
  58. }
  59. }
  60. if (is_linearized)
  61. return initialize_linearized_xref_table();
  62. return initialize_non_linearized_xref_table();
  63. }
  64. Value Parser::parse_object_with_index(u32 index)
  65. {
  66. VERIFY(m_xref_table->has_object(index));
  67. auto byte_offset = m_xref_table->byte_offset_for_object(index);
  68. m_reader.move_to(byte_offset);
  69. auto indirect_value = parse_indirect_value();
  70. VERIFY(indirect_value);
  71. VERIFY(indirect_value->index() == index);
  72. return indirect_value->value();
  73. }
  74. bool Parser::parse_header()
  75. {
  76. // FIXME: Do something with the version?
  77. m_reader.set_reading_forwards();
  78. m_reader.move_to(0);
  79. if (m_reader.remaining() < 8 || !m_reader.matches("%PDF-"))
  80. return false;
  81. m_reader.move_by(5);
  82. char major_ver = m_reader.read();
  83. if (major_ver != '1' && major_ver != '2')
  84. return false;
  85. if (m_reader.read() != '.')
  86. return false;
  87. char minor_ver = m_reader.read();
  88. if (minor_ver < '0' || minor_ver > '7')
  89. return false;
  90. consume_eol();
  91. // Parse optional high-byte comment, which signifies a binary file
  92. // FIXME: Do something with this?
  93. auto comment = parse_comment();
  94. if (!comment.is_empty()) {
  95. auto binary = comment.length() >= 4;
  96. if (binary) {
  97. for (size_t i = 0; i < comment.length() && binary; i++)
  98. binary = static_cast<u8>(comment[i]) > 128;
  99. }
  100. }
  101. return true;
  102. }
  103. Parser::LinearizationResult Parser::initialize_linearization_dict()
  104. {
  105. // parse_header() is called immediately before this, so we are at the right location
  106. auto dict_value = m_document->resolve(parse_indirect_value());
  107. if (!dict_value.has<NonnullRefPtr<Object>>())
  108. return LinearizationResult::Error;
  109. auto dict_object = dict_value.get<NonnullRefPtr<Object>>();
  110. if (!dict_object->is_dict())
  111. return LinearizationResult::NotLinearized;
  112. auto dict = object_cast<DictObject>(dict_object);
  113. if (!dict->contains(CommonNames::Linearized))
  114. return LinearizationResult::NotLinearized;
  115. if (!dict->contains(CommonNames::L, CommonNames::H, CommonNames::O, CommonNames::E, CommonNames::N, CommonNames::T))
  116. return LinearizationResult::Error;
  117. auto length_of_file = dict->get_value(CommonNames::L);
  118. auto hint_table = dict->get_value(CommonNames::H);
  119. auto first_page_object_number = dict->get_value(CommonNames::O);
  120. auto offset_of_first_page_end = dict->get_value(CommonNames::E);
  121. auto number_of_pages = dict->get_value(CommonNames::N);
  122. auto offset_of_main_xref_table = dict->get_value(CommonNames::T);
  123. auto first_page = dict->get(CommonNames::P).value_or({});
  124. // Validation
  125. if (!length_of_file.has_u32()
  126. || !hint_table.has<NonnullRefPtr<Object>>()
  127. || !first_page_object_number.has_u32()
  128. || !number_of_pages.has_u16()
  129. || !offset_of_main_xref_table.has_u32()
  130. || (!first_page.has<Empty>() && !first_page.has_u32())) {
  131. return LinearizationResult::Error;
  132. }
  133. auto hint_table_object = hint_table.get<NonnullRefPtr<Object>>();
  134. if (!hint_table_object->is_array())
  135. return LinearizationResult::Error;
  136. auto hint_table_array = object_cast<ArrayObject>(hint_table_object);
  137. auto hint_table_size = hint_table_array->size();
  138. if (hint_table_size != 2 && hint_table_size != 4)
  139. return LinearizationResult::Error;
  140. auto primary_hint_stream_offset = hint_table_array->at(0);
  141. auto primary_hint_stream_length = hint_table_array->at(1);
  142. Value overflow_hint_stream_offset;
  143. Value overflow_hint_stream_length;
  144. if (hint_table_size == 4) {
  145. overflow_hint_stream_offset = hint_table_array->at(2);
  146. overflow_hint_stream_length = hint_table_array->at(3);
  147. }
  148. if (!primary_hint_stream_offset.has_u32()
  149. || !primary_hint_stream_length.has_u32()
  150. || (!overflow_hint_stream_offset.has<Empty>() && !overflow_hint_stream_offset.has_u32())
  151. || (!overflow_hint_stream_length.has<Empty>() && !overflow_hint_stream_length.has_u32())) {
  152. return LinearizationResult::Error;
  153. }
  154. m_linearization_dictionary = LinearizationDictionary {
  155. length_of_file.get_u32(),
  156. primary_hint_stream_offset.get_u32(),
  157. primary_hint_stream_length.get_u32(),
  158. overflow_hint_stream_offset.has<Empty>() ? NumericLimits<u32>::max() : overflow_hint_stream_offset.get_u32(),
  159. overflow_hint_stream_length.has<Empty>() ? NumericLimits<u32>::max() : overflow_hint_stream_length.get_u32(),
  160. first_page_object_number.get_u32(),
  161. offset_of_first_page_end.get_u32(),
  162. number_of_pages.get_u16(),
  163. offset_of_main_xref_table.get_u32(),
  164. first_page.has<Empty>() ? NumericLimits<u32>::max() : first_page.get_u32(),
  165. };
  166. return LinearizationResult::Linearized;
  167. }
  168. bool Parser::initialize_linearized_xref_table()
  169. {
  170. // The linearization parameter dictionary has just been parsed, and the xref table
  171. // comes immediately after it. We are in the correct spot.
  172. if (!m_reader.matches("xref"))
  173. return false;
  174. m_xref_table = parse_xref_table();
  175. if (!m_xref_table)
  176. return false;
  177. m_trailer = parse_file_trailer();
  178. if (!m_trailer)
  179. return false;
  180. // Also parse the main xref table and merge into the first-page xref table. Note
  181. // that we don't use the main xref table offset from the linearization dict because
  182. // for some reason, it specified the offset of the whitespace after the object
  183. // index start and length? So it's much easier to do it this way.
  184. auto main_xref_table_offset = m_trailer->get_value(CommonNames::Prev).to_int();
  185. m_reader.move_to(main_xref_table_offset);
  186. auto main_xref_table = parse_xref_table();
  187. if (!main_xref_table)
  188. return false;
  189. return m_xref_table->merge(move(*main_xref_table));
  190. }
  191. bool Parser::initialize_hint_tables()
  192. {
  193. auto linearization_dict = m_linearization_dictionary.value();
  194. auto primary_offset = linearization_dict.primary_hint_stream_offset;
  195. auto overflow_offset = linearization_dict.overflow_hint_stream_offset;
  196. auto parse_hint_table = [&](size_t offset) -> RefPtr<StreamObject> {
  197. m_reader.move_to(offset);
  198. auto stream_indirect_value = parse_indirect_value();
  199. if (!stream_indirect_value)
  200. return {};
  201. auto stream_value = stream_indirect_value->value();
  202. if (!stream_value.has<NonnullRefPtr<Object>>())
  203. return {};
  204. auto stream_object = stream_value.get<NonnullRefPtr<Object>>();
  205. if (!stream_object->is_stream())
  206. return {};
  207. return object_cast<StreamObject>(stream_object);
  208. };
  209. auto primary_hint_stream = parse_hint_table(primary_offset);
  210. if (!primary_hint_stream)
  211. return false;
  212. RefPtr<StreamObject> overflow_hint_stream;
  213. if (overflow_offset != NumericLimits<u32>::max())
  214. overflow_hint_stream = parse_hint_table(overflow_offset);
  215. ByteBuffer possible_merged_stream_buffer;
  216. ReadonlyBytes hint_stream_bytes;
  217. if (overflow_hint_stream) {
  218. auto primary_size = primary_hint_stream->bytes().size();
  219. auto overflow_size = overflow_hint_stream->bytes().size();
  220. auto total_size = primary_size + overflow_size;
  221. auto buffer_result = ByteBuffer::create_uninitialized(total_size);
  222. if (!buffer_result.has_value())
  223. return false;
  224. possible_merged_stream_buffer = buffer_result.release_value();
  225. auto ok = possible_merged_stream_buffer.try_append(primary_hint_stream->bytes());
  226. ok = ok && possible_merged_stream_buffer.try_append(overflow_hint_stream->bytes());
  227. if (!ok)
  228. return false;
  229. hint_stream_bytes = possible_merged_stream_buffer.bytes();
  230. } else {
  231. hint_stream_bytes = primary_hint_stream->bytes();
  232. }
  233. auto hint_table = parse_page_offset_hint_table(hint_stream_bytes);
  234. if (!hint_table.has_value())
  235. return false;
  236. dbgln("hint table: {}", hint_table.value());
  237. auto hint_table_entries = parse_all_page_offset_hint_table_entries(hint_table.value(), hint_stream_bytes);
  238. if (!hint_table_entries.has_value())
  239. return false;
  240. auto entries = hint_table_entries.value();
  241. dbgln("hint table entries size: {}", entries.size());
  242. for (auto& entry : entries)
  243. dbgln("{}", entry);
  244. return true;
  245. }
  246. bool Parser::initialize_non_linearized_xref_table()
  247. {
  248. m_reader.move_to(m_reader.bytes().size() - 1);
  249. if (!navigate_to_before_eof_marker())
  250. return false;
  251. if (!navigate_to_after_startxref())
  252. return false;
  253. if (m_reader.done())
  254. return false;
  255. m_reader.set_reading_forwards();
  256. auto xref_offset_value = parse_number();
  257. if (!xref_offset_value.has<int>())
  258. return false;
  259. auto xref_offset = xref_offset_value.get<int>();
  260. m_reader.move_to(xref_offset);
  261. m_xref_table = parse_xref_table();
  262. if (!m_xref_table)
  263. return false;
  264. m_trailer = parse_file_trailer();
  265. return m_trailer;
  266. }
  267. RefPtr<XRefTable> Parser::parse_xref_table()
  268. {
  269. if (!m_reader.matches("xref"))
  270. return {};
  271. m_reader.move_by(4);
  272. if (!consume_eol())
  273. return {};
  274. auto table = adopt_ref(*new XRefTable());
  275. while (true) {
  276. if (m_reader.matches("trailer"))
  277. return table;
  278. Vector<XRefEntry> entries;
  279. auto starting_index_value = parse_number();
  280. auto starting_index = starting_index_value.get<int>();
  281. auto object_count_value = parse_number();
  282. auto object_count = object_count_value.get<int>();
  283. for (int i = 0; i < object_count; i++) {
  284. auto offset_string = String(m_reader.bytes().slice(m_reader.offset(), 10));
  285. m_reader.move_by(10);
  286. if (!consume(' '))
  287. return {};
  288. auto generation_string = String(m_reader.bytes().slice(m_reader.offset(), 5));
  289. m_reader.move_by(5);
  290. if (!consume(' '))
  291. return {};
  292. auto letter = m_reader.read();
  293. if (letter != 'n' && letter != 'f')
  294. return {};
  295. // The line ending sequence can be one of the following:
  296. // SP CR, SP LF, or CR LF
  297. if (m_reader.matches(' ')) {
  298. consume();
  299. auto ch = consume();
  300. if (ch != '\r' && ch != '\n')
  301. return {};
  302. } else {
  303. if (!m_reader.matches("\r\n"))
  304. return {};
  305. m_reader.move_by(2);
  306. }
  307. auto offset = strtol(offset_string.characters(), nullptr, 10);
  308. auto generation = strtol(generation_string.characters(), nullptr, 10);
  309. entries.append({ offset, static_cast<u16>(generation), letter == 'n' });
  310. }
  311. table->add_section({ starting_index, object_count, entries });
  312. }
  313. }
  314. RefPtr<DictObject> Parser::parse_file_trailer()
  315. {
  316. if (!m_reader.matches("trailer"))
  317. return {};
  318. m_reader.move_by(7);
  319. consume_whitespace();
  320. auto dict = parse_dict();
  321. if (!dict)
  322. return {};
  323. if (!m_reader.matches("startxref"))
  324. return {};
  325. m_reader.move_by(9);
  326. consume_whitespace();
  327. m_reader.move_until([&](auto) { return matches_eol(); });
  328. VERIFY(consume_eol());
  329. if (!m_reader.matches("%%EOF"))
  330. return {};
  331. m_reader.move_by(5);
  332. consume_whitespace();
  333. return dict;
  334. }
  335. Optional<Parser::PageOffsetHintTable> Parser::parse_page_offset_hint_table(ReadonlyBytes const& hint_stream_bytes)
  336. {
  337. if (hint_stream_bytes.size() < sizeof(PageOffsetHintTable))
  338. return {};
  339. size_t offset = 0;
  340. auto read_u32 = [&] {
  341. u32 data = reinterpret_cast<const u32*>(hint_stream_bytes.data() + offset)[0];
  342. offset += 4;
  343. return AK::convert_between_host_and_big_endian(data);
  344. };
  345. auto read_u16 = [&] {
  346. u16 data = reinterpret_cast<const u16*>(hint_stream_bytes.data() + offset)[0];
  347. offset += 2;
  348. return AK::convert_between_host_and_big_endian(data);
  349. };
  350. PageOffsetHintTable hint_table {
  351. read_u32(),
  352. read_u32(),
  353. read_u16(),
  354. read_u32(),
  355. read_u16(),
  356. read_u32(),
  357. read_u16(),
  358. read_u32(),
  359. read_u16(),
  360. read_u16(),
  361. read_u16(),
  362. read_u16(),
  363. read_u16(),
  364. };
  365. // Verify that all of the bits_required_for_xyz fields are <= 32, since all of the numeric
  366. // fields in PageOffsetHintTableEntry are u32
  367. VERIFY(hint_table.bits_required_for_object_number <= 32);
  368. VERIFY(hint_table.bits_required_for_page_length <= 32);
  369. VERIFY(hint_table.bits_required_for_content_stream_offsets <= 32);
  370. VERIFY(hint_table.bits_required_for_content_stream_length <= 32);
  371. VERIFY(hint_table.bits_required_for_number_of_shared_obj_refs <= 32);
  372. VERIFY(hint_table.bits_required_for_greatest_shared_obj_identifier <= 32);
  373. VERIFY(hint_table.bits_required_for_fraction_numerator <= 32);
  374. return hint_table;
  375. }
  376. Optional<Vector<Parser::PageOffsetHintTableEntry>> Parser::parse_all_page_offset_hint_table_entries(PageOffsetHintTable const& hint_table, ReadonlyBytes const& hint_stream_bytes)
  377. {
  378. InputMemoryStream input_stream(hint_stream_bytes);
  379. input_stream.seek(sizeof(PageOffsetHintTable));
  380. if (input_stream.has_any_error())
  381. return {};
  382. InputBitStream bit_stream(input_stream);
  383. auto number_of_pages = m_linearization_dictionary.value().number_of_pages;
  384. Vector<PageOffsetHintTableEntry> entries;
  385. for (size_t i = 0; i < number_of_pages; i++)
  386. entries.append(PageOffsetHintTableEntry {});
  387. auto bits_required_for_object_number = hint_table.bits_required_for_object_number;
  388. auto bits_required_for_page_length = hint_table.bits_required_for_page_length;
  389. auto bits_required_for_content_stream_offsets = hint_table.bits_required_for_content_stream_offsets;
  390. auto bits_required_for_content_stream_length = hint_table.bits_required_for_content_stream_length;
  391. auto bits_required_for_number_of_shared_obj_refs = hint_table.bits_required_for_number_of_shared_obj_refs;
  392. auto bits_required_for_greatest_shared_obj_identifier = hint_table.bits_required_for_greatest_shared_obj_identifier;
  393. auto bits_required_for_fraction_numerator = hint_table.bits_required_for_fraction_numerator;
  394. auto parse_int_entry = [&](u32 PageOffsetHintTableEntry::*field, u32 bit_size) {
  395. if (bit_size <= 0)
  396. return;
  397. for (int i = 0; i < number_of_pages; i++) {
  398. auto& entry = entries[i];
  399. entry.*field = bit_stream.read_bits(bit_size);
  400. }
  401. };
  402. auto parse_vector_entry = [&](Vector<u32> PageOffsetHintTableEntry::*field, u32 bit_size) {
  403. if (bit_size <= 0)
  404. return;
  405. for (int page = 1; page < number_of_pages; page++) {
  406. auto number_of_shared_objects = entries[page].number_of_shared_objects;
  407. Vector<u32> items;
  408. items.ensure_capacity(number_of_shared_objects);
  409. for (size_t i = 0; i < number_of_shared_objects; i++)
  410. items.unchecked_append(bit_stream.read_bits(bit_size));
  411. entries[page].*field = move(items);
  412. }
  413. };
  414. parse_int_entry(&PageOffsetHintTableEntry::objects_in_page_number, bits_required_for_object_number);
  415. parse_int_entry(&PageOffsetHintTableEntry::page_length_number, bits_required_for_page_length);
  416. parse_int_entry(&PageOffsetHintTableEntry::number_of_shared_objects, bits_required_for_number_of_shared_obj_refs);
  417. parse_vector_entry(&PageOffsetHintTableEntry::shared_object_identifiers, bits_required_for_greatest_shared_obj_identifier);
  418. parse_vector_entry(&PageOffsetHintTableEntry::shared_object_location_numerators, bits_required_for_fraction_numerator);
  419. parse_int_entry(&PageOffsetHintTableEntry::page_content_stream_offset_number, bits_required_for_content_stream_offsets);
  420. parse_int_entry(&PageOffsetHintTableEntry::page_content_stream_length_number, bits_required_for_content_stream_length);
  421. return entries;
  422. }
  423. bool Parser::navigate_to_before_eof_marker()
  424. {
  425. m_reader.set_reading_backwards();
  426. while (!m_reader.done()) {
  427. m_reader.move_until([&](auto) { return matches_eol(); });
  428. if (m_reader.done())
  429. return false;
  430. consume_eol();
  431. if (!m_reader.matches("%%EOF"))
  432. continue;
  433. m_reader.move_by(5);
  434. if (!matches_eol())
  435. continue;
  436. consume_eol();
  437. return true;
  438. }
  439. return false;
  440. }
  441. bool Parser::navigate_to_after_startxref()
  442. {
  443. m_reader.set_reading_backwards();
  444. while (!m_reader.done()) {
  445. m_reader.move_until([&](auto) { return matches_eol(); });
  446. auto offset = m_reader.offset() + 1;
  447. consume_eol();
  448. if (!m_reader.matches("startxref"))
  449. continue;
  450. m_reader.move_by(9);
  451. if (!matches_eol())
  452. continue;
  453. m_reader.move_to(offset);
  454. return true;
  455. }
  456. return false;
  457. }
  458. bool Parser::sloppy_is_linearized()
  459. {
  460. ScopeGuard guard([&] {
  461. m_reader.move_to(0);
  462. m_reader.set_reading_forwards();
  463. });
  464. auto limit = min(1024ul, m_reader.bytes().size() - 1);
  465. m_reader.move_to(limit);
  466. m_reader.set_reading_backwards();
  467. while (!m_reader.done()) {
  468. m_reader.move_until('/');
  469. if (m_reader.matches("/Linearized"))
  470. return true;
  471. m_reader.move_by(1);
  472. }
  473. return false;
  474. }
  475. String Parser::parse_comment()
  476. {
  477. if (!m_reader.matches('%'))
  478. return {};
  479. consume();
  480. auto comment_start_offset = m_reader.offset();
  481. m_reader.move_until([&](auto) {
  482. return matches_eol();
  483. });
  484. String str = StringView(m_reader.bytes().slice(comment_start_offset, m_reader.offset() - comment_start_offset));
  485. consume_eol();
  486. consume_whitespace();
  487. return str;
  488. }
  489. Value Parser::parse_value()
  490. {
  491. parse_comment();
  492. if (m_reader.matches("null")) {
  493. m_reader.move_by(4);
  494. consume_whitespace();
  495. return Value(nullptr);
  496. }
  497. if (m_reader.matches("true")) {
  498. m_reader.move_by(4);
  499. consume_whitespace();
  500. return Value(true);
  501. }
  502. if (m_reader.matches("false")) {
  503. m_reader.move_by(5);
  504. consume_whitespace();
  505. return Value(false);
  506. }
  507. if (matches_number())
  508. return parse_possible_indirect_value_or_ref();
  509. if (m_reader.matches('/'))
  510. return parse_name();
  511. if (m_reader.matches("<<")) {
  512. auto dict = parse_dict();
  513. if (!dict)
  514. return {};
  515. if (m_reader.matches("stream"))
  516. return parse_stream(dict.release_nonnull());
  517. return dict;
  518. }
  519. if (m_reader.matches_any('(', '<'))
  520. return parse_string();
  521. if (m_reader.matches('['))
  522. return parse_array();
  523. dbgln("tried to parse value, but found char {} ({}) at offset {}", m_reader.peek(), static_cast<u8>(m_reader.peek()), m_reader.offset());
  524. VERIFY_NOT_REACHED();
  525. }
  526. Value Parser::parse_possible_indirect_value_or_ref()
  527. {
  528. auto first_number = parse_number();
  529. if (!first_number.has<int>() || !matches_number())
  530. return first_number;
  531. m_reader.save();
  532. auto second_number = parse_number();
  533. if (!second_number.has<int>()) {
  534. m_reader.load();
  535. return first_number;
  536. }
  537. if (m_reader.matches('R')) {
  538. m_reader.discard();
  539. consume();
  540. consume_whitespace();
  541. return Value(Reference(first_number.get<int>(), second_number.get<int>()));
  542. }
  543. if (m_reader.matches("obj")) {
  544. m_reader.discard();
  545. return parse_indirect_value(first_number.get<int>(), second_number.get<int>());
  546. }
  547. m_reader.load();
  548. return first_number;
  549. }
  550. RefPtr<IndirectValue> Parser::parse_indirect_value(int index, int generation)
  551. {
  552. if (!m_reader.matches("obj"))
  553. return {};
  554. m_reader.move_by(3);
  555. if (matches_eol())
  556. consume_eol();
  557. auto value = parse_value();
  558. if (!m_reader.matches("endobj"))
  559. return {};
  560. consume(6);
  561. consume_whitespace();
  562. return make_object<IndirectValue>(index, generation, value);
  563. }
  564. RefPtr<IndirectValue> Parser::parse_indirect_value()
  565. {
  566. auto first_number = parse_number();
  567. if (!first_number.has<int>())
  568. return {};
  569. auto second_number = parse_number();
  570. if (!second_number.has<int>())
  571. return {};
  572. return parse_indirect_value(first_number.get<int>(), second_number.get<int>());
  573. }
  574. Value Parser::parse_number()
  575. {
  576. size_t start_offset = m_reader.offset();
  577. bool is_float = false;
  578. if (m_reader.matches('+') || m_reader.matches('-'))
  579. consume();
  580. while (!m_reader.done()) {
  581. if (m_reader.matches('.')) {
  582. if (is_float)
  583. break;
  584. is_float = true;
  585. consume();
  586. } else if (isdigit(m_reader.peek())) {
  587. consume();
  588. } else {
  589. break;
  590. }
  591. }
  592. consume_whitespace();
  593. auto string = String(m_reader.bytes().slice(start_offset, m_reader.offset() - start_offset));
  594. float f = strtof(string.characters(), nullptr);
  595. if (is_float)
  596. return Value(f);
  597. VERIFY(floorf(f) == f);
  598. return Value(static_cast<int>(f));
  599. }
  600. RefPtr<NameObject> Parser::parse_name()
  601. {
  602. if (!consume('/'))
  603. return {};
  604. StringBuilder builder;
  605. while (true) {
  606. if (!matches_regular_character())
  607. break;
  608. if (m_reader.matches('#')) {
  609. int hex_value = 0;
  610. for (int i = 0; i < 2; i++) {
  611. auto ch = consume();
  612. if (!isxdigit(ch))
  613. return {};
  614. hex_value *= 16;
  615. if (ch <= '9') {
  616. hex_value += ch - '0';
  617. } else {
  618. hex_value += ch - 'A' + 10;
  619. }
  620. }
  621. builder.append(static_cast<char>(hex_value));
  622. continue;
  623. }
  624. builder.append(consume());
  625. }
  626. consume_whitespace();
  627. return make_object<NameObject>(builder.to_string());
  628. }
  629. RefPtr<StringObject> Parser::parse_string()
  630. {
  631. ScopeGuard guard([&] { consume_whitespace(); });
  632. String string;
  633. bool is_binary_string;
  634. if (m_reader.matches('(')) {
  635. string = parse_literal_string();
  636. is_binary_string = false;
  637. } else {
  638. string = parse_hex_string();
  639. is_binary_string = true;
  640. }
  641. if (string.is_null())
  642. return {};
  643. if (string.bytes().starts_with(Array<u8, 2> { 0xfe, 0xff })) {
  644. // The string is encoded in UTF16-BE
  645. string = TextCodec::decoder_for("utf-16be")->to_utf8(string.substring(2));
  646. } else if (string.bytes().starts_with(Array<u8, 3> { 239, 187, 191 })) {
  647. // The string is encoded in UTF-8. This is the default anyways, but if these bytes
  648. // are explicitly included, we have to trim them
  649. string = string.substring(3);
  650. }
  651. return make_object<StringObject>(string, is_binary_string);
  652. }
  653. String Parser::parse_literal_string()
  654. {
  655. if (!consume('('))
  656. return {};
  657. StringBuilder builder;
  658. auto opened_parens = 0;
  659. while (true) {
  660. if (m_reader.matches('(')) {
  661. opened_parens++;
  662. builder.append(consume());
  663. } else if (m_reader.matches(')')) {
  664. consume();
  665. if (opened_parens == 0)
  666. break;
  667. opened_parens--;
  668. builder.append(')');
  669. } else if (m_reader.matches('\\')) {
  670. consume();
  671. if (matches_eol()) {
  672. consume_eol();
  673. continue;
  674. }
  675. if (m_reader.done())
  676. return {};
  677. auto ch = consume();
  678. switch (ch) {
  679. case 'n':
  680. builder.append('\n');
  681. break;
  682. case 'r':
  683. builder.append('\r');
  684. break;
  685. case 't':
  686. builder.append('\t');
  687. break;
  688. case 'b':
  689. builder.append('\b');
  690. break;
  691. case 'f':
  692. builder.append('\f');
  693. break;
  694. case '(':
  695. builder.append('(');
  696. break;
  697. case ')':
  698. builder.append(')');
  699. break;
  700. case '\\':
  701. builder.append('\\');
  702. break;
  703. default: {
  704. if (ch >= '0' && ch <= '7') {
  705. int octal_value = ch - '0';
  706. for (int i = 0; i < 2; i++) {
  707. auto octal_ch = consume();
  708. if (octal_ch < '0' || octal_ch > '7')
  709. break;
  710. octal_value = octal_value * 8 + (octal_ch - '0');
  711. }
  712. builder.append(static_cast<char>(octal_value));
  713. } else {
  714. builder.append(ch);
  715. }
  716. }
  717. }
  718. } else if (matches_eol()) {
  719. consume_eol();
  720. builder.append('\n');
  721. } else {
  722. builder.append(consume());
  723. }
  724. }
  725. if (opened_parens != 0)
  726. return {};
  727. return builder.to_string();
  728. }
  729. String Parser::parse_hex_string()
  730. {
  731. if (!consume('<'))
  732. return {};
  733. StringBuilder builder;
  734. while (true) {
  735. if (m_reader.matches('>')) {
  736. consume();
  737. return builder.to_string();
  738. } else {
  739. int hex_value = 0;
  740. for (int i = 0; i < 2; i++) {
  741. auto ch = consume();
  742. if (ch == '>') {
  743. // The hex string contains an odd number of characters, and the last character
  744. // is assumed to be '0'
  745. consume();
  746. hex_value *= 16;
  747. builder.append(static_cast<char>(hex_value));
  748. return builder.to_string();
  749. }
  750. if (!isxdigit(ch))
  751. return {};
  752. hex_value *= 16;
  753. if (ch <= '9') {
  754. hex_value += ch - '0';
  755. } else {
  756. hex_value += ch - 'A' + 10;
  757. }
  758. }
  759. builder.append(static_cast<char>(hex_value));
  760. }
  761. }
  762. }
  763. RefPtr<ArrayObject> Parser::parse_array()
  764. {
  765. if (!consume('['))
  766. return {};
  767. consume_whitespace();
  768. Vector<Value> values;
  769. while (!m_reader.matches(']')) {
  770. auto value = parse_value();
  771. if (value.has<Empty>())
  772. return {};
  773. values.append(value);
  774. }
  775. if (!consume(']'))
  776. return {};
  777. consume_whitespace();
  778. return make_object<ArrayObject>(values);
  779. }
  780. RefPtr<DictObject> Parser::parse_dict()
  781. {
  782. if (!consume('<') || !consume('<'))
  783. return {};
  784. consume_whitespace();
  785. HashMap<FlyString, Value> map;
  786. while (true) {
  787. if (m_reader.matches(">>"))
  788. break;
  789. auto name = parse_name();
  790. if (!name)
  791. return {};
  792. auto value = parse_value();
  793. if (value.has<Empty>())
  794. return {};
  795. map.set(name->name(), value);
  796. }
  797. if (!consume('>') || !consume('>'))
  798. return {};
  799. consume_whitespace();
  800. return make_object<DictObject>(map);
  801. }
  802. RefPtr<DictObject> Parser::conditionally_parse_page_tree_node(u32 object_index, bool& ok)
  803. {
  804. ok = true;
  805. VERIFY(m_xref_table->has_object(object_index));
  806. auto byte_offset = m_xref_table->byte_offset_for_object(object_index);
  807. m_reader.move_to(byte_offset);
  808. parse_number();
  809. parse_number();
  810. if (!m_reader.matches("obj")) {
  811. ok = false;
  812. return {};
  813. }
  814. m_reader.move_by(3);
  815. consume_whitespace();
  816. if (!consume('<') || !consume('<'))
  817. return {};
  818. consume_whitespace();
  819. HashMap<FlyString, Value> map;
  820. while (true) {
  821. if (m_reader.matches(">>"))
  822. break;
  823. auto name = parse_name();
  824. if (!name) {
  825. ok = false;
  826. return {};
  827. }
  828. auto name_string = name->name();
  829. if (!name_string.is_one_of(CommonNames::Type, CommonNames::Parent, CommonNames::Kids, CommonNames::Count)) {
  830. // This is a page, not a page tree node
  831. return {};
  832. }
  833. auto value = parse_value();
  834. if (value.has<Empty>()) {
  835. ok = false;
  836. return {};
  837. }
  838. if (name_string == CommonNames::Type) {
  839. if (!value.has<NonnullRefPtr<Object>>())
  840. return {};
  841. auto type_object = value.get<NonnullRefPtr<Object>>();
  842. if (!type_object->is_name())
  843. return {};
  844. auto type_name = object_cast<NameObject>(type_object);
  845. if (type_name->name() != CommonNames::Pages)
  846. return {};
  847. }
  848. map.set(name->name(), value);
  849. }
  850. if (!consume('>') || !consume('>'))
  851. return {};
  852. consume_whitespace();
  853. return make_object<DictObject>(map);
  854. }
  855. RefPtr<StreamObject> Parser::parse_stream(NonnullRefPtr<DictObject> dict)
  856. {
  857. if (!m_reader.matches("stream"))
  858. return {};
  859. m_reader.move_by(6);
  860. if (!consume_eol())
  861. return {};
  862. ReadonlyBytes bytes;
  863. auto maybe_length = dict->get(CommonNames::Length);
  864. if (maybe_length.has_value() && (!maybe_length->has<Reference>() || m_xref_table)) {
  865. // The PDF writer has kindly provided us with the direct length of the stream
  866. m_reader.save();
  867. auto length = m_document->resolve_to<int>(maybe_length.value());
  868. m_reader.load();
  869. bytes = m_reader.bytes().slice(m_reader.offset(), length);
  870. m_reader.move_by(length);
  871. consume_whitespace();
  872. } else {
  873. // We have to look for the endstream keyword
  874. auto stream_start = m_reader.offset();
  875. while (true) {
  876. m_reader.move_until([&](auto) { return matches_eol(); });
  877. auto potential_stream_end = m_reader.offset();
  878. consume_eol();
  879. if (!m_reader.matches("endstream"))
  880. continue;
  881. bytes = m_reader.bytes().slice(stream_start, potential_stream_end - stream_start);
  882. break;
  883. }
  884. }
  885. m_reader.move_by(9);
  886. consume_whitespace();
  887. if (dict->contains(CommonNames::Filter)) {
  888. auto filter_type = dict->get_name(m_document, CommonNames::Filter)->name();
  889. auto maybe_bytes = Filter::decode(bytes, filter_type);
  890. if (!maybe_bytes.has_value())
  891. return {};
  892. return make_object<EncodedStreamObject>(dict, move(maybe_bytes.value()));
  893. }
  894. return make_object<PlainTextStreamObject>(dict, bytes);
  895. }
  896. Vector<Command> Parser::parse_graphics_commands()
  897. {
  898. Vector<Command> commands;
  899. Vector<Value> command_args;
  900. constexpr static auto is_command_char = [](char ch) {
  901. return isalpha(ch) || ch == '*' || ch == '\'';
  902. };
  903. while (!m_reader.done()) {
  904. auto ch = m_reader.peek();
  905. if (is_command_char(ch)) {
  906. auto command_start = m_reader.offset();
  907. while (is_command_char(ch)) {
  908. consume();
  909. if (m_reader.done())
  910. break;
  911. ch = m_reader.peek();
  912. }
  913. auto command_string = StringView(m_reader.bytes().slice(command_start, m_reader.offset() - command_start));
  914. auto command_type = Command::command_type_from_symbol(command_string);
  915. commands.append(Command(command_type, move(command_args)));
  916. command_args = Vector<Value>();
  917. consume_whitespace();
  918. continue;
  919. }
  920. command_args.append(parse_value());
  921. }
  922. return commands;
  923. }
  924. bool Parser::matches_eol() const
  925. {
  926. return m_reader.matches_any(0xa, 0xd);
  927. }
  928. bool Parser::matches_whitespace() const
  929. {
  930. return matches_eol() || m_reader.matches_any(0, 0x9, 0xc, ' ');
  931. }
  932. bool Parser::matches_number() const
  933. {
  934. if (m_reader.done())
  935. return false;
  936. auto ch = m_reader.peek();
  937. return isdigit(ch) || ch == '-' || ch == '+';
  938. }
  939. bool Parser::matches_delimiter() const
  940. {
  941. return m_reader.matches_any('(', ')', '<', '>', '[', ']', '{', '}', '/', '%');
  942. }
  943. bool Parser::matches_regular_character() const
  944. {
  945. return !matches_delimiter() && !matches_whitespace();
  946. }
  947. bool Parser::consume_eol()
  948. {
  949. if (m_reader.matches("\r\n")) {
  950. consume(2);
  951. return true;
  952. }
  953. auto consumed = consume();
  954. return consumed == 0xd || consumed == 0xa;
  955. }
  956. bool Parser::consume_whitespace()
  957. {
  958. bool consumed = false;
  959. while (matches_whitespace()) {
  960. consumed = true;
  961. consume();
  962. }
  963. return consumed;
  964. }
  965. char Parser::consume()
  966. {
  967. return m_reader.read();
  968. }
  969. void Parser::consume(int amount)
  970. {
  971. for (size_t i = 0; i < static_cast<size_t>(amount); i++)
  972. consume();
  973. }
  974. bool Parser::consume(char ch)
  975. {
  976. return consume() == ch;
  977. }
  978. }
  979. namespace AK {
  980. template<>
  981. struct Formatter<PDF::Parser::LinearizationDictionary> : Formatter<StringView> {
  982. void format(FormatBuilder& format_builder, PDF::Parser::LinearizationDictionary const& dict)
  983. {
  984. StringBuilder builder;
  985. builder.append("{\n");
  986. builder.appendff(" length_of_file={}\n", dict.length_of_file);
  987. builder.appendff(" primary_hint_stream_offset={}\n", dict.primary_hint_stream_offset);
  988. builder.appendff(" primary_hint_stream_length={}\n", dict.primary_hint_stream_length);
  989. builder.appendff(" overflow_hint_stream_offset={}\n", dict.overflow_hint_stream_offset);
  990. builder.appendff(" overflow_hint_stream_length={}\n", dict.overflow_hint_stream_length);
  991. builder.appendff(" first_page_object_number={}\n", dict.first_page_object_number);
  992. builder.appendff(" offset_of_first_page_end={}\n", dict.offset_of_first_page_end);
  993. builder.appendff(" number_of_pages={}\n", dict.number_of_pages);
  994. builder.appendff(" offset_of_main_xref_table={}\n", dict.offset_of_main_xref_table);
  995. builder.appendff(" first_page={}\n", dict.first_page);
  996. builder.append('}');
  997. Formatter<StringView>::format(format_builder, builder.to_string());
  998. }
  999. };
  1000. template<>
  1001. struct Formatter<PDF::Parser::PageOffsetHintTable> : Formatter<StringView> {
  1002. void format(FormatBuilder& format_builder, PDF::Parser::PageOffsetHintTable const& table)
  1003. {
  1004. StringBuilder builder;
  1005. builder.append("{\n");
  1006. builder.appendff(" least_number_of_objects_in_a_page={}\n", table.least_number_of_objects_in_a_page);
  1007. builder.appendff(" location_of_first_page_object={}\n", table.location_of_first_page_object);
  1008. builder.appendff(" bits_required_for_object_number={}\n", table.bits_required_for_object_number);
  1009. builder.appendff(" least_length_of_a_page={}\n", table.least_length_of_a_page);
  1010. builder.appendff(" bits_required_for_page_length={}\n", table.bits_required_for_page_length);
  1011. builder.appendff(" least_offset_of_any_content_stream={}\n", table.least_offset_of_any_content_stream);
  1012. builder.appendff(" bits_required_for_content_stream_offsets={}\n", table.bits_required_for_content_stream_offsets);
  1013. builder.appendff(" least_content_stream_length={}\n", table.least_content_stream_length);
  1014. builder.appendff(" bits_required_for_content_stream_length={}\n", table.bits_required_for_content_stream_length);
  1015. builder.appendff(" bits_required_for_number_of_shared_obj_refs={}\n", table.bits_required_for_number_of_shared_obj_refs);
  1016. builder.appendff(" bits_required_for_greatest_shared_obj_identifier={}\n", table.bits_required_for_greatest_shared_obj_identifier);
  1017. builder.appendff(" bits_required_for_fraction_numerator={}\n", table.bits_required_for_fraction_numerator);
  1018. builder.appendff(" shared_object_reference_fraction_denominator={}\n", table.shared_object_reference_fraction_denominator);
  1019. builder.append('}');
  1020. Formatter<StringView>::format(format_builder, builder.to_string());
  1021. }
  1022. };
  1023. template<>
  1024. struct Formatter<PDF::Parser::PageOffsetHintTableEntry> : Formatter<StringView> {
  1025. void format(FormatBuilder& format_builder, PDF::Parser::PageOffsetHintTableEntry const& entry)
  1026. {
  1027. StringBuilder builder;
  1028. builder.append("{\n");
  1029. builder.appendff(" objects_in_page_number={}\n", entry.objects_in_page_number);
  1030. builder.appendff(" page_length_number={}\n", entry.page_length_number);
  1031. builder.appendff(" number_of_shared_objects={}\n", entry.number_of_shared_objects);
  1032. builder.append(" shared_object_identifiers=[");
  1033. for (auto& identifier : entry.shared_object_identifiers)
  1034. builder.appendff(" {}", identifier);
  1035. builder.append(" ]\n");
  1036. builder.append(" shared_object_location_numerators=[");
  1037. for (auto& numerator : entry.shared_object_location_numerators)
  1038. builder.appendff(" {}", numerator);
  1039. builder.append(" ]\n");
  1040. builder.appendff(" page_content_stream_offset_number={}\n", entry.page_content_stream_offset_number);
  1041. builder.appendff(" page_content_stream_length_number={}\n", entry.page_content_stream_length_number);
  1042. builder.append('}');
  1043. Formatter<StringView>::format(format_builder, builder.to_string());
  1044. }
  1045. };
  1046. }