LibPDF: Refine the distinction between the Document and Parser

The Parser should hold information relevant for parsing, whereas the
Document should hold information relevant for displaying pages.
With this in mind, there is no reason for the Document to hold the
xref table and trailer. These objects have been moved to the Parser,
which allows the Parser to expose fewer public methods (which will be
even more evident once linearized PDFs are supported).
This commit is contained in:
Matthew Olsson 2021-05-25 08:55:15 -07:00 committed by Ali Mohammad Pur
parent 69410d7f4e
commit 78bc9d1539
Notes: sideshowbarker 2024-07-18 12:21:26 +09:00
4 changed files with 43 additions and 56 deletions

View file

@ -39,17 +39,10 @@ RefPtr<Document> Document::create(const ReadonlyBytes& bytes)
auto parser = adopt_ref(*new Parser({}, bytes));
auto document = adopt_ref(*new Document(parser));
VERIFY(parser->perform_validation());
auto xref_table_and_trailer_opt = parser->parse_last_xref_table_and_trailer();
if (!xref_table_and_trailer_opt.has_value())
if (!parser->initialize())
return {};
auto [xref_table, trailer] = xref_table_and_trailer_opt.value();
document->m_xref_table = xref_table;
document->m_trailer = trailer;
document->m_catalog = document->m_trailer->get_dict(document, CommonNames::Root);
document->m_catalog = parser->trailer()->get_dict(document, CommonNames::Root);
document->build_page_tree();
document->build_outline();
@ -68,13 +61,9 @@ Value Document::get_or_load_value(u32 index)
if (value)
return value;
VERIFY(m_xref_table.has_object(index));
auto byte_offset = m_xref_table.byte_offset_for_object(index);
auto indirect_value = m_parser->parse_indirect_value_at_offset(byte_offset);
VERIFY(indirect_value->index() == index);
value = indirect_value->value();
m_values.set(index, value);
return value;
auto object = m_parser->parse_object_with_index(index);
m_values.set(index, object);
return object;
}
u32 Document::get_first_page_index() const
@ -179,9 +168,8 @@ bool Document::add_page_tree_node_to_page_tree(NonnullRefPtr<DictObject> page_tr
for (auto& value : *kids_array) {
auto reference_index = value.as_ref_index();
auto byte_offset = m_xref_table.byte_offset_for_object(reference_index);
bool ok;
auto maybe_page_tree_node = m_parser->conditionally_parse_page_tree_node_at_offset(byte_offset, ok);
auto maybe_page_tree_node = m_parser->conditionally_parse_page_tree_node(reference_index, ok);
if (!ok)
return false;
if (maybe_page_tree_node) {

View file

@ -75,8 +75,6 @@ class Document final : public RefCounted<Document> {
public:
static RefPtr<Document> create(const ReadonlyBytes& bytes);
ALWAYS_INLINE const XRefTable& xref_table() const { return m_xref_table; }
ALWAYS_INLINE const DictObject& trailer() const { return *m_trailer; }
ALWAYS_INLINE const RefPtr<OutlineDict>& outline() const { return m_outline; }
[[nodiscard]] Value get_or_load_value(u32 index);
@ -92,12 +90,6 @@ public:
return m_values.get(index).value_or({});
}
ALWAYS_INLINE void set_value(u32 index, const Value& value)
{
m_values.ensure_capacity(index);
m_values.set(index, value);
}
// Strips away the layer of indirection by turning indirect value
// refs into the value they reference, and indirect values into
// the value being wrapped.
@ -139,8 +131,6 @@ private:
NonnullRefPtrVector<OutlineItem> build_outline_item_chain(const Value& first_ref, const Value& last_ref);
NonnullRefPtr<Parser> m_parser;
XRefTable m_xref_table;
RefPtr<DictObject> m_trailer;
RefPtr<DictObject> m_catalog;
Vector<u32> m_page_object_indices;
HashMap<u32, Page> m_pages;

View file

@ -38,43 +38,47 @@ Parser::Parser(const ReadonlyBytes& bytes)
{
}
bool Parser::perform_validation()
bool Parser::initialize()
{
return !sloppy_is_linearized() && parse_header();
}
if (!parse_header())
return {};
Optional<Parser::XRefTableAndTrailer> Parser::parse_last_xref_table_and_trailer()
{
m_reader.move_to(m_reader.bytes().size() - 1);
if (!navigate_to_before_eof_marker())
return {};
return false;
if (!navigate_to_after_startxref())
return {};
return false;
if (m_reader.done())
return {};
return false;
m_reader.set_reading_forwards();
auto xref_offset_value = parse_number();
if (!xref_offset_value.is_int())
return {};
return false;
auto xref_offset = xref_offset_value.as_int();
m_reader.move_to(xref_offset);
auto xref_table = parse_xref_table();
if (!xref_table.has_value())
return {};
return false;
auto trailer = parse_file_trailer();
if (!trailer)
return {};
return false;
return XRefTableAndTrailer { xref_table.value(), trailer.release_nonnull() };
m_xref_table = xref_table.value();
m_trailer = trailer;
return true;
}
RefPtr<IndirectValue> Parser::parse_indirect_value_at_offset(size_t offset)
Value Parser::parse_object_with_index(u32 index)
{
m_reader.set_reading_forwards();
m_reader.move_to(offset);
return parse_indirect_value();
VERIFY(m_xref_table.has_object(index));
auto byte_offset = m_xref_table.byte_offset_for_object(index);
m_reader.move_to(byte_offset);
auto indirect_value = parse_indirect_value();
VERIFY(indirect_value);
VERIFY(indirect_value->index() == index);
return indirect_value->value();
}
bool Parser::parse_header()
@ -647,11 +651,14 @@ RefPtr<DictObject> Parser::parse_dict()
return make_object<DictObject>(map);
}
RefPtr<DictObject> Parser::conditionally_parse_page_tree_node_at_offset(size_t offset, bool& ok)
RefPtr<DictObject> Parser::conditionally_parse_page_tree_node(u32 object_index, bool& ok)
{
ok = true;
m_reader.move_to(offset);
VERIFY(m_xref_table.has_object(object_index));
auto byte_offset = m_xref_table.byte_offset_for_object(object_index);
m_reader.move_to(byte_offset);
parse_number();
parse_number();
if (!m_reader.matches("obj")) {

View file

@ -22,19 +22,19 @@ public:
Parser(Badge<Document>, const ReadonlyBytes&);
void set_document(RefPtr<Document> document) { m_document = document; }
[[nodiscard]] ALWAYS_INLINE const RefPtr<DictObject>& trailer() const { return m_trailer; }
void set_document(const RefPtr<Document>& document) { m_document = document; }
bool perform_validation();
// Parses the header and initializes the xref table and trailer
bool initialize();
struct XRefTableAndTrailer {
XRefTable xref_table;
NonnullRefPtr<DictObject> trailer;
};
Optional<XRefTableAndTrailer> parse_last_xref_table_and_trailer();
Value parse_object_with_index(u32 index);
RefPtr<IndirectValue> parse_indirect_value_at_offset(size_t offset);
RefPtr<DictObject> conditionally_parse_page_tree_node_at_offset(size_t offset, bool& ok);
// Specialized version of parse_dict which aborts early if the dict being parsed
// is not a page object. A null RefPtr return indicates that the dict at this index
// is not a page tree node, whereas ok == false indicates a malformed PDF file and
// should cause an abort of the current operation.
RefPtr<DictObject> conditionally_parse_page_tree_node(u32 object_index, bool& ok);
private:
explicit Parser(const ReadonlyBytes&);
@ -85,6 +85,8 @@ private:
Reader m_reader;
RefPtr<Document> m_document;
XRefTable m_xref_table;
RefPtr<DictObject> m_trailer;
};
}