LibPDF: Parse nested Page Tree structures

We now follow nested page tree nodes to find all of the actual
page dicts. Previously, we assumed that the root-level page tree
node contained all of the pages directly as its children.
This commit is contained in:
Matthew Olsson 2021-05-02 18:53:07 -07:00 committed by Andreas Kling
parent 8c745ad0d9
commit 3aeaceb727
Notes: sideshowbarker 2024-07-18 18:23:45 +09:00
4 changed files with 77 additions and 5 deletions

View file

@ -117,14 +117,33 @@ UnwrappedValueType<T> Document::resolve_to(const Value& value)
// Populates m_page_object_indices by walking the page tree that starts at the
// catalog's "Pages" dictionary. The traversal itself (including the lookup of
// each node's "Kids" array) is done by add_page_tree_node_to_page_tree().
void Document::build_page_tree()
{
    auto page_tree = m_catalog->get_dict(this, "Pages");
    add_page_tree_node_to_page_tree(page_tree);
}
void Document::add_page_tree_node_to_page_tree(NonnullRefPtr<DictObject> page_tree)
{
auto kids_array = page_tree->get_array(this, "Kids");
auto page_count = page_tree->get("Count").value().as_int();
if (static_cast<size_t>(page_count) != kids_array->elements().size()) {
// FIXME: Support recursive PDF page tree structures
VERIFY_NOT_REACHED();
// This page tree contains child page trees, so we recursively add
// these pages to the overall page tree
for (auto& value : *kids_array) {
auto reference = resolve_to<IndirectValueRef>(value);
auto byte_offset = m_xref_table.byte_offset_for_object(reference->index());
auto maybe_page_tree_node = m_parser.conditionally_parse_page_tree_node_at_offset(byte_offset);
if (maybe_page_tree_node) {
add_page_tree_node_to_page_tree(maybe_page_tree_node.release_nonnull());
} else {
m_page_object_indices.append(reference->index());
}
}
return;
}
// We know all of the kids are leaf nodes
for (auto& value : *kids_array) {
auto reference = resolve_to<IndirectValueRef>(value);
m_page_object_indices.append(reference->index());

View file

@ -66,7 +66,14 @@ public:
UnwrappedValueType<T> resolve_to(const Value& value);
private:
// FIXME: Currently, to improve performance, we don't load any pages at Document
// construction; we only walk the page tree structure and populate
// m_page_object_indices. However, we could be even lazier and defer parsing
// of the page tree nodes themselves, since good PDF writers lay out the page
// tree as a balanced tree to improve lookup time. This would reduce the
// initial overhead by not loading every page tree node of, say, a
// 1000+ page PDF file.
void build_page_tree();
// Adds the pages found under `page_tree` to m_page_object_indices, descending
// recursively into any child page tree nodes.
void add_page_tree_node_to_page_tree(NonnullRefPtr<DictObject> page_tree);
Parser m_parser;
XRefTable m_xref_table;

View file

@ -363,14 +363,14 @@ Value Parser::parse_number()
}
}
consume_whitespace();
auto string = String(m_reader.bytes().slice(start_offset, m_reader.offset() - start_offset));
float f = strtof(string.characters(), nullptr);
if (is_float)
return Value(f);
VERIFY(floorf(f) == f);
consume_whitespace();
return Value(static_cast<int>(f));
}
@ -567,6 +567,50 @@ NonnullRefPtr<DictObject> Parser::parse_dict()
return make_object<DictObject>(map);
}
// Parses the indirect object located at `offset` and decides whether it is a
// page tree node (a dictionary whose /Type is /Pages) or a page.
//
// Returns the parsed dictionary when the object is a page tree node, and a
// null RefPtr when it is a page. The reader is left wherever parsing stopped;
// callers must not rely on its position afterwards.
//
// The decision is driven by /Type. Page tree nodes may carry inheritable
// attributes (e.g. Resources, MediaBox, CropBox, Rotate) in addition to
// Type/Parent/Kids/Count, so the presence of another key must not, by itself,
// classify the object as a page. Only when /Type is absent do we fall back to
// the key-based heuristic: such a dictionary is treated as a page tree node
// only if it contains nothing but the structural keys.
RefPtr<DictObject> Parser::conditionally_parse_page_tree_node_at_offset(size_t offset)
{
    m_reader.move_to(offset);

    // Skip the "<index> <generation> obj" header of the indirect object.
    parse_number();
    parse_number();
    VERIFY(m_reader.matches("obj"));
    m_reader.move_by(3);
    consume_whitespace();

    // Opening "<<" of the dictionary.
    consume('<');
    consume('<');
    consume_whitespace();

    HashMap<FlyString, Value> map;
    bool saw_type = false;
    bool saw_non_structural_key = false;

    while (!m_reader.matches(">>")) {
        auto name = parse_name();
        auto name_string = name->name();
        if (!name_string.is_one_of("Type", "Parent", "Kids", "Count"))
            saw_non_structural_key = true;

        auto value = parse_value();

        if (name_string == "Type") {
            // Anything other than the name /Pages means this is not a page
            // tree node; bail out immediately without parsing the rest.
            if (!value.is_object())
                return {};
            auto type_object = value.as_object();
            if (!type_object->is_name())
                return {};
            auto type_name = object_cast<NameObject>(type_object);
            if (type_name->name() != "Pages")
                return {};
            saw_type = true;
        }

        map.set(name_string, value);
    }

    // Without a /Type entry we cannot tell for sure; keep the conservative
    // heuristic and treat a dictionary with extra keys as a page.
    if (!saw_type && saw_non_structural_key)
        return {};

    // Closing ">>" of the dictionary.
    consume('>');
    consume('>');
    consume_whitespace();

    return make_object<DictObject>(map);
}
NonnullRefPtr<StreamObject> Parser::parse_stream(NonnullRefPtr<DictObject> dict)
{
VERIFY(m_reader.matches("stream"));

View file

@ -29,6 +29,8 @@ public:
NonnullRefPtr<IndirectValue> parse_indirect_value_at_offset(size_t offset);
// Parses the object at `offset` as a potential page tree node. Returns its
// dictionary, or a null RefPtr if the object turns out to be a page rather
// than a page tree node.
RefPtr<DictObject> conditionally_parse_page_tree_node_at_offset(size_t offset);
private:
bool parse_header();
XRefTable parse_xref_table();