// Parser.h
  1. /*
  2. * Copyright (c) 2021, Matthew Olsson <mattco@serenityos.org>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #pragma once
  7. #include <AK/NonnullRefPtrVector.h>
  8. #include <LibPDF/Command.h>
  9. #include <LibPDF/Object.h>
  10. #include <LibPDF/Reader.h>
  11. #include <LibPDF/XRefTable.h>
  12. namespace PDF {
  13. class Document;
  14. class Parser final : public RefCounted<Parser> {
  15. public:
  16. enum class LinearizationResult {
  17. Error,
  18. NotLinearized,
  19. Linearized,
  20. };
  21. static Vector<Command> parse_graphics_commands(ReadonlyBytes);
  22. Parser(Badge<Document>, ReadonlyBytes);
  23. [[nodiscard]] ALWAYS_INLINE RefPtr<DictObject> const& trailer() const { return m_trailer; }
  24. void set_document(RefPtr<Document> const&);
  25. // Parses the header and initializes the xref table and trailer
  26. bool initialize();
  27. Value parse_object_with_index(u32 index);
  28. // Specialized version of parse_dict which aborts early if the dict being parsed
  29. // is not a page object. A null RefPtr return indicates that the dict at this index
  30. // is not a page tree node, whereas ok == false indicates a malformed PDF file and
  31. // should cause an abort of the current operation.
  32. RefPtr<DictObject> conditionally_parse_page_tree_node(u32 object_index, bool& ok);
  33. private:
  34. struct LinearizationDictionary {
  35. u32 length_of_file { 0 };
  36. u32 primary_hint_stream_offset { 0 };
  37. u32 primary_hint_stream_length { 0 };
  38. u32 overflow_hint_stream_offset { 0 };
  39. u32 overflow_hint_stream_length { 0 };
  40. u32 first_page_object_number { 0 };
  41. u32 offset_of_first_page_end { 0 };
  42. u16 number_of_pages { 0 };
  43. u32 offset_of_main_xref_table { 0 };
  44. u32 first_page { 0 }; // The page to initially open (I think, the spec isn't all that clear here)
  45. };
  46. struct PageOffsetHintTable {
  47. u32 least_number_of_objects_in_a_page { 0 };
  48. u32 location_of_first_page_object { 0 };
  49. u16 bits_required_for_object_number { 0 };
  50. u32 least_length_of_a_page { 0 };
  51. u16 bits_required_for_page_length { 0 };
  52. u32 least_offset_of_any_content_stream { 0 };
  53. u16 bits_required_for_content_stream_offsets { 0 };
  54. u32 least_content_stream_length { 0 };
  55. u16 bits_required_for_content_stream_length { 0 };
  56. u16 bits_required_for_number_of_shared_obj_refs { 0 };
  57. u16 bits_required_for_greatest_shared_obj_identifier { 0 };
  58. u16 bits_required_for_fraction_numerator { 0 };
  59. u16 shared_object_reference_fraction_denominator { 0 };
  60. };
  61. struct PageOffsetHintTableEntry {
  62. u32 objects_in_page_number { 0 };
  63. u32 page_length_number { 0 };
  64. u32 number_of_shared_objects { 0 };
  65. Vector<u32> shared_object_identifiers {};
  66. Vector<u32> shared_object_location_numerators {};
  67. u32 page_content_stream_offset_number { 0 };
  68. u32 page_content_stream_length_number { 0 };
  69. };
  70. friend struct AK::Formatter<LinearizationDictionary>;
  71. friend struct AK::Formatter<PageOffsetHintTable>;
  72. friend struct AK::Formatter<PageOffsetHintTableEntry>;
  73. explicit Parser(ReadonlyBytes);
  74. bool parse_header();
  75. LinearizationResult initialize_linearization_dict();
  76. bool initialize_linearized_xref_table();
  77. bool initialize_non_linearized_xref_table();
  78. bool initialize_hint_tables();
  79. Optional<PageOffsetHintTable> parse_page_offset_hint_table(ReadonlyBytes hint_stream_bytes);
  80. Optional<Vector<PageOffsetHintTableEntry>> parse_all_page_offset_hint_table_entries(PageOffsetHintTable const&, ReadonlyBytes hint_stream_bytes);
  81. RefPtr<XRefTable> parse_xref_table();
  82. RefPtr<DictObject> parse_file_trailer();
  83. bool navigate_to_before_eof_marker();
  84. bool navigate_to_after_startxref();
  85. // If the PDF is linearized, the first object will be the linearization
  86. // parameter dictionary, and it will always occur within the first 1024 bytes.
  87. // We do a very sloppy and context-free search for this object. A return value
  88. // of true does not necessarily mean this PDF is linearized, but a return value
  89. // of false does mean this PDF is not linearized.
  90. // FIXME: false doesn't guarantee non-linearization, but we VERIFY the result!
  91. bool sloppy_is_linearized();
  92. String parse_comment();
  93. Value parse_value();
  94. Value parse_possible_indirect_value_or_ref();
  95. RefPtr<IndirectValue> parse_indirect_value(int index, int generation);
  96. RefPtr<IndirectValue> parse_indirect_value();
  97. Value parse_number();
  98. RefPtr<NameObject> parse_name();
  99. RefPtr<StringObject> parse_string();
  100. String parse_literal_string();
  101. String parse_hex_string();
  102. RefPtr<ArrayObject> parse_array();
  103. RefPtr<DictObject> parse_dict();
  104. RefPtr<StreamObject> parse_stream(NonnullRefPtr<DictObject> dict);
  105. Vector<Command> parse_graphics_commands();
  106. bool matches_eol() const;
  107. bool matches_whitespace() const;
  108. bool matches_number() const;
  109. bool matches_delimiter() const;
  110. bool matches_regular_character() const;
  111. bool consume_eol();
  112. bool consume_whitespace();
  113. char consume();
  114. void consume(int amount);
  115. bool consume(char);
  116. Reader m_reader;
  117. RefPtr<Document> m_document;
  118. RefPtr<XRefTable> m_xref_table;
  119. RefPtr<DictObject> m_trailer;
  120. Optional<LinearizationDictionary> m_linearization_dictionary;
  121. };
  122. };