summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Userland/Libraries/LibPDF/CMakeLists.txt1
-rw-r--r--Userland/Libraries/LibPDF/Document.cpp4
-rw-r--r--Userland/Libraries/LibPDF/Document.h6
-rw-r--r--Userland/Libraries/LibPDF/DocumentParser.cpp631
-rw-r--r--Userland/Libraries/LibPDF/DocumentParser.h96
-rw-r--r--Userland/Libraries/LibPDF/Parser.cpp626
-rw-r--r--Userland/Libraries/LibPDF/Parser.h88
7 files changed, 746 insertions, 706 deletions
diff --git a/Userland/Libraries/LibPDF/CMakeLists.txt b/Userland/Libraries/LibPDF/CMakeLists.txt
index c405f4fb0e..a5a6519012 100644
--- a/Userland/Libraries/LibPDF/CMakeLists.txt
+++ b/Userland/Libraries/LibPDF/CMakeLists.txt
@@ -2,6 +2,7 @@ set(SOURCES
ColorSpace.cpp
CommonNames.cpp
Document.cpp
+ DocumentParser.cpp
Encoding.cpp
Encryption.cpp
Filter.cpp
diff --git a/Userland/Libraries/LibPDF/Document.cpp b/Userland/Libraries/LibPDF/Document.cpp
index ebe95e1541..cf6faa0e10 100644
--- a/Userland/Libraries/LibPDF/Document.cpp
+++ b/Userland/Libraries/LibPDF/Document.cpp
@@ -36,7 +36,7 @@ String OutlineItem::to_string(int indent) const
PDFErrorOr<NonnullRefPtr<Document>> Document::create(ReadonlyBytes bytes)
{
- auto parser = adopt_ref(*new Parser({}, bytes));
+ auto parser = adopt_ref(*new DocumentParser({}, bytes));
auto document = adopt_ref(*new Document(parser));
TRY(parser->initialize());
@@ -57,7 +57,7 @@ PDFErrorOr<NonnullRefPtr<Document>> Document::create(ReadonlyBytes bytes)
return document;
}
-Document::Document(NonnullRefPtr<Parser> const& parser)
+Document::Document(NonnullRefPtr<DocumentParser> const& parser)
: m_parser(parser)
{
m_parser->set_document(this);
diff --git a/Userland/Libraries/LibPDF/Document.h b/Userland/Libraries/LibPDF/Document.h
index fee4b14ccc..4c6d39331f 100644
--- a/Userland/Libraries/LibPDF/Document.h
+++ b/Userland/Libraries/LibPDF/Document.h
@@ -11,10 +11,10 @@
#include <AK/RefCounted.h>
#include <AK/Weakable.h>
#include <LibGfx/Color.h>
+#include <LibPDF/DocumentParser.h>
#include <LibPDF/Encryption.h>
#include <LibPDF/Error.h>
#include <LibPDF/ObjectDerivatives.h>
-#include <LibPDF/Parser.h>
namespace PDF {
@@ -133,7 +133,7 @@ public:
}
private:
- explicit Document(NonnullRefPtr<Parser> const& parser);
+ explicit Document(NonnullRefPtr<DocumentParser> const& parser);
// FIXME: Currently, to improve performance, we don't load any pages at Document
// construction, rather we just load the page structure and populate
@@ -150,7 +150,7 @@ private:
PDFErrorOr<Destination> create_destination_from_parameters(NonnullRefPtr<ArrayObject>);
- NonnullRefPtr<Parser> m_parser;
+ NonnullRefPtr<DocumentParser> m_parser;
RefPtr<DictObject> m_catalog;
RefPtr<DictObject> m_trailer;
Vector<u32> m_page_object_indices;
diff --git a/Userland/Libraries/LibPDF/DocumentParser.cpp b/Userland/Libraries/LibPDF/DocumentParser.cpp
new file mode 100644
index 0000000000..e34302928a
--- /dev/null
+++ b/Userland/Libraries/LibPDF/DocumentParser.cpp
@@ -0,0 +1,631 @@
+/*
+ * Copyright (c) 2021-2022, Matthew Olsson <mattco@serenityos.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include <AK/BitStream.h>
+#include <AK/MemoryStream.h>
+#include <LibPDF/CommonNames.h>
+#include <LibPDF/Document.h>
+#include <LibPDF/DocumentParser.h>
+#include <LibPDF/ObjectDerivatives.h>
+
+namespace PDF {
+
+// Constructs a document-level parser; both arguments are forwarded unchanged to the Parser base class.
+DocumentParser::DocumentParser(Document* document, ReadonlyBytes bytes)
+ : Parser(document, bytes)
+{
+}
+
+// Parses the file header, then loads the xref table and trailer using either the
+// linearized or the non-linearized strategy, depending on what the first object
+// in the file turns out to be.
+PDFErrorOr<void> DocumentParser::initialize()
+{
+    TRY(parse_header());
+
+    auto const linearization_state = TRY(initialize_linearization_dict());
+    if (linearization_state == LinearizationResult::NotLinearized)
+        return initialize_non_linearized_xref_table();
+
+    if (!m_linearization_dictionary.has_value())
+        return initialize_non_linearized_xref_table();
+
+    // The file may have been linearized at one point, but could have been updated afterwards,
+    // which means it is no longer a linearized PDF file. The recorded file length gives that away.
+    auto const recorded_length = m_linearization_dictionary.value().length_of_file;
+    if (recorded_length != m_reader.bytes().size()) {
+        // FIXME: The file shouldn't be treated as linearized, yet the xref tables are still
+        //        split. This might take some tweaking to ensure correct behavior, which can be
+        //        implemented later.
+        TODO();
+        return initialize_non_linearized_xref_table();
+    }
+
+    return initialize_linearized_xref_table();
+}
+
+// Looks up the byte offset of object `index` in the xref table, parses the indirect
+// value stored there and returns its payload. The index must be present in the xref
+// table, and the parsed object must carry that same index.
+PDFErrorOr<Value> DocumentParser::parse_object_with_index(u32 index)
+{
+    VERIFY(m_xref_table->has_object(index));
+
+    m_reader.move_to(m_xref_table->byte_offset_for_object(index));
+
+    auto parsed_object = TRY(parse_indirect_value());
+    VERIFY(parsed_object->index() == index);
+
+    return parsed_object->value();
+}
+
+// Validates the "%PDF-<major>.<minor>" header at the start of the file and consumes the
+// optional comment line that follows it. Returns an error if the file is empty, does not
+// start with the PDF magic, or names a version outside 1.0-1.7 / 2.0-2.7.
+PDFErrorOr<void> DocumentParser::parse_header()
+{
+    // FIXME: Do something with the version?
+    m_reader.set_reading_forwards();
+    if (m_reader.remaining() == 0)
+        return error("Empty PDF document");
+
+    m_reader.move_to(0);
+    // "%PDF-x.y" is eight bytes, so anything shorter cannot hold a full header.
+    if (m_reader.remaining() < 8 || !m_reader.matches("%PDF-"))
+        return error("Not a PDF document");
+
+    m_reader.move_by(5);
+
+    char major_ver = m_reader.read();
+    if (major_ver != '1' && major_ver != '2')
+        return error(String::formatted("Unknown major version \"{}\"", major_ver));
+
+    if (m_reader.read() != '.')
+        return error("Malformed PDF version");
+
+    char minor_ver = m_reader.read();
+    if (minor_ver < '0' || minor_ver > '7')
+        return error(String::formatted("Unknown minor version \"{}\"", minor_ver));
+
+    m_reader.consume_eol();
+
+    // Parse optional high-byte comment, which signifies a binary file
+    // FIXME: Do something with this?
+    auto comment = parse_comment();
+    if (!comment.is_empty()) {
+        auto binary = comment.length() >= 4;
+        if (binary) {
+            // A "binary" character is one whose code is 128 or greater, so 128 itself counts.
+            for (size_t i = 0; i < comment.length() && binary; i++)
+                binary = static_cast<u8>(comment[i]) >= 128;
+        }
+    }
+
+    return {};
+}
+
+// Parses the first indirect object of the file and, if it is a linearization parameter
+// dictionary, stores its contents in m_linearization_dictionary.
+//
+// Returns NotLinearized when the object is not a dictionary or lacks /Linearized,
+// Linearized when the dictionary was read successfully, and an error when the
+// dictionary is present but malformed.
+PDFErrorOr<DocumentParser::LinearizationResult> DocumentParser::initialize_linearization_dict()
+{
+    // parse_header() is called immediately before this, so we are at the right location
+    auto indirect_value = Value(*TRY(parse_indirect_value()));
+    auto dict_value = TRY(m_document->resolve(indirect_value));
+    if (!dict_value.has<NonnullRefPtr<Object>>())
+        return error("Expected linearization object to be a dictionary");
+
+    auto dict_object = dict_value.get<NonnullRefPtr<Object>>();
+    if (!dict_object->is<DictObject>())
+        return LinearizationResult::NotLinearized;
+
+    auto dict = dict_object->cast<DictObject>();
+
+    if (!dict->contains(CommonNames::Linearized))
+        return LinearizationResult::NotLinearized;
+
+    if (!dict->contains(CommonNames::L, CommonNames::H, CommonNames::O, CommonNames::E, CommonNames::N, CommonNames::T))
+        return error("Malformed linearization dictionary");
+
+    auto length_of_file = dict->get_value(CommonNames::L);
+    auto hint_table = dict->get_value(CommonNames::H);
+    auto first_page_object_number = dict->get_value(CommonNames::O);
+    auto offset_of_first_page_end = dict->get_value(CommonNames::E);
+    auto number_of_pages = dict->get_value(CommonNames::N);
+    auto offset_of_main_xref_table = dict->get_value(CommonNames::T);
+    // /P is optional; an Empty value marks its absence.
+    auto first_page = dict->get(CommonNames::P).value_or({});
+
+    // Validation. Every value that is later read with get_u32()/get_u16() has to be
+    // checked here, including /E.
+    if (!length_of_file.has_u32()
+        || !hint_table.has<NonnullRefPtr<Object>>()
+        || !first_page_object_number.has_u32()
+        || !offset_of_first_page_end.has_u32()
+        || !number_of_pages.has_u16()
+        || !offset_of_main_xref_table.has_u32()
+        || (!first_page.has<Empty>() && !first_page.has_u32())) {
+        return error("Malformed linearization dictionary parameters");
+    }
+
+    // /H must be an array before it can be cast to one.
+    auto hint_table_object = hint_table.get<NonnullRefPtr<Object>>();
+    if (!hint_table_object->is<ArrayObject>())
+        return error("Malformed linearization dictionary parameters");
+
+    auto hint_table_array = hint_table_object->cast<ArrayObject>();
+    auto hint_table_size = hint_table_array->size();
+    if (hint_table_size != 2 && hint_table_size != 4)
+        return error("Expected hint table to be of length 2 or 4");
+
+    auto primary_hint_stream_offset = hint_table_array->at(0);
+    auto primary_hint_stream_length = hint_table_array->at(1);
+    Value overflow_hint_stream_offset;
+    Value overflow_hint_stream_length;
+
+    // The overflow hint stream is only described by the four-element form of /H.
+    if (hint_table_size == 4) {
+        overflow_hint_stream_offset = hint_table_array->at(2);
+        overflow_hint_stream_length = hint_table_array->at(3);
+    }
+
+    if (!primary_hint_stream_offset.has_u32()
+        || !primary_hint_stream_length.has_u32()
+        || (!overflow_hint_stream_offset.has<Empty>() && !overflow_hint_stream_offset.has_u32())
+        || (!overflow_hint_stream_length.has<Empty>() && !overflow_hint_stream_length.has_u32())) {
+        return error("Malformed hint stream");
+    }
+
+    // Absent optional values are stored as NumericLimits<u32>::max().
+    m_linearization_dictionary = LinearizationDictionary {
+        length_of_file.get_u32(),
+        primary_hint_stream_offset.get_u32(),
+        primary_hint_stream_length.get_u32(),
+        overflow_hint_stream_offset.has<Empty>() ? NumericLimits<u32>::max() : overflow_hint_stream_offset.get_u32(),
+        overflow_hint_stream_length.has<Empty>() ? NumericLimits<u32>::max() : overflow_hint_stream_length.get_u32(),
+        first_page_object_number.get_u32(),
+        offset_of_first_page_end.get_u32(),
+        number_of_pages.get_u16(),
+        offset_of_main_xref_table.get_u32(),
+        first_page.has<Empty>() ? NumericLimits<u32>::max() : first_page.get_u32(),
+    };
+
+    return LinearizationResult::Linearized;
+}
+
+// Loads the xref table and trailer of a linearized file: first the first-page xref
+// table that directly follows the linearization dictionary, then the main xref table
+// referenced by the trailer's /Prev entry, which is merged into the first one.
+PDFErrorOr<void> DocumentParser::initialize_linearized_xref_table()
+{
+    // The linearization parameter dictionary has just been parsed, and the xref table
+    // comes immediately after it. We are in the correct spot.
+    m_xref_table = TRY(parse_xref_table());
+    m_trailer = TRY(parse_file_trailer());
+
+    // The /Prev key comes from the file and may be missing; report that as a parse
+    // error rather than relying on get_value() to cope with an absent key.
+    if (!m_trailer->contains(CommonNames::Prev))
+        return error("Expected first-page trailer to contain /Prev");
+
+    // Also parse the main xref table and merge into the first-page xref table. Note
+    // that we don't use the main xref table offset from the linearization dict because
+    // for some reason, it specified the offset of the whitespace after the object
+    // index start and length? So it's much easier to do it this way.
+    auto main_xref_table_offset = m_trailer->get_value(CommonNames::Prev).to_int();
+    m_reader.move_to(main_xref_table_offset);
+    auto main_xref_table = TRY(parse_xref_table());
+    TRY(m_xref_table->merge(move(*main_xref_table)));
+    return {};
+}
+
+// Reads the primary (and, if present, overflow) hint stream named by the linearization
+// dictionary and parses the page offset hint table out of their concatenated bytes.
+// The parsed tables are currently discarded. Requires m_linearization_dictionary to be set.
+PDFErrorOr<void> DocumentParser::initialize_hint_tables()
+{
+    auto linearization_dict = m_linearization_dictionary.value();
+    auto primary_offset = linearization_dict.primary_hint_stream_offset;
+    auto overflow_offset = linearization_dict.overflow_hint_stream_offset;
+
+    // Parses the indirect object at `offset`; yields null unless it is a stream object.
+    auto parse_hint_table = [&](size_t offset) -> RefPtr<StreamObject> {
+        m_reader.move_to(offset);
+        auto stream_indirect_value = parse_indirect_value();
+        if (stream_indirect_value.is_error())
+            return {};
+
+        auto stream_value = stream_indirect_value.value()->value();
+        if (!stream_value.has<NonnullRefPtr<Object>>())
+            return {};
+
+        auto stream_object = stream_value.get<NonnullRefPtr<Object>>();
+        if (!stream_object->is<StreamObject>())
+            return {};
+
+        return stream_object->cast<StreamObject>();
+    };
+
+    auto primary_hint_stream = parse_hint_table(primary_offset);
+    if (!primary_hint_stream)
+        return error("Invalid primary hint stream");
+
+    // NumericLimits<u32>::max() marks "no overflow hint stream". A stream that is named
+    // but fails to parse is ignored (best effort).
+    RefPtr<StreamObject> overflow_hint_stream;
+    if (overflow_offset != NumericLimits<u32>::max())
+        overflow_hint_stream = parse_hint_table(overflow_offset);
+
+    // Owns the concatenated bytes when both streams exist; hint_stream_bytes views either
+    // this buffer or the primary stream directly.
+    ByteBuffer possible_merged_stream_buffer;
+    ReadonlyBytes hint_stream_bytes;
+
+    if (overflow_hint_stream) {
+        // Append onto an empty buffer. Pre-sizing the buffer and then appending would leave
+        // a block of uninitialized bytes in front of the actual hint data.
+        if (possible_merged_stream_buffer.try_append(primary_hint_stream->bytes()).is_error()
+            || possible_merged_stream_buffer.try_append(overflow_hint_stream->bytes()).is_error()) {
+            return Error { Error::Type::Internal, "Failed to allocate hint stream buffer" };
+        }
+        hint_stream_bytes = possible_merged_stream_buffer.bytes();
+    } else {
+        hint_stream_bytes = primary_hint_stream->bytes();
+    }
+
+    auto hint_table = TRY(parse_page_offset_hint_table(hint_stream_bytes));
+    auto hint_table_entries = parse_all_page_offset_hint_table_entries(hint_table, hint_stream_bytes);
+
+    // FIXME: Do something with the hint tables
+    return {};
+}
+
+// Locates the xref table of a non-linearized file by scanning backwards from the end
+// for the "%%EOF" marker and the "startxref" keyword, then parses the xref table and
+// trailer found at the offset recorded there.
+PDFErrorOr<void> DocumentParser::initialize_non_linearized_xref_table()
+{
+    m_reader.move_to(m_reader.bytes().size() - 1);
+    if (!navigate_to_before_eof_marker())
+        return error("No EOF marker");
+    if (!navigate_to_after_startxref())
+        return error("No xref");
+
+    m_reader.set_reading_forwards();
+    auto xref_offset_value = parse_number();
+    if (xref_offset_value.is_error() || !xref_offset_value.value().has<int>())
+        return error("Invalid xref offset");
+    auto xref_offset = xref_offset_value.value().get<int>();
+
+    // The offset comes straight from the file; reject values that cannot point inside
+    // it before handing them to the reader.
+    if (xref_offset < 0 || static_cast<size_t>(xref_offset) >= m_reader.bytes().size())
+        return error("Invalid xref offset");
+
+    m_reader.move_to(xref_offset);
+    m_xref_table = TRY(parse_xref_table());
+    m_trailer = TRY(parse_file_trailer());
+    return {};
+}
+
+// Parses a cross-reference table starting at the "xref" keyword under the reader.
+// The table consists of one or more subsections, each introduced by a
+// "<first object index> <entry count>" line and followed by that many fixed-width
+// entries. Parsing stops at the "trailer" keyword, which is left unconsumed.
+// Returns an error on any malformed or truncated input.
+PDFErrorOr<NonnullRefPtr<XRefTable>> DocumentParser::parse_xref_table()
+{
+    if (!m_reader.matches("xref"))
+        return error("Expected \"xref\"");
+    m_reader.move_by(4);
+    if (!m_reader.consume_eol())
+        return error("Expected newline after \"xref\"");
+
+    // 10 offset digits, a space, 5 generation digits, a space, the 'n'/'f' flag
+    // and a two-byte line ending.
+    constexpr size_t xref_entry_size = 20;
+
+    auto table = adopt_ref(*new XRefTable());
+
+    do {
+        if (m_reader.matches("trailer"))
+            return table;
+
+        Vector<XRefEntry> entries;
+
+        auto starting_index_value = TRY(parse_number());
+        auto object_count_value = TRY(parse_number());
+        // Both header numbers come from the file; make sure they really are
+        // non-negative integers before extracting them.
+        if (!starting_index_value.has<int>() || !object_count_value.has<int>())
+            return error("Malformed xref subsection header");
+        auto starting_index = starting_index_value.get<int>();
+        auto object_count = object_count_value.get<int>();
+        if (starting_index < 0 || object_count < 0)
+            return error("Malformed xref subsection header");
+
+        for (int i = 0; i < object_count; i++) {
+            // Guard the fixed-width slices below against a truncated file.
+            if (m_reader.remaining() < xref_entry_size)
+                return error("Malformed xref entry");
+
+            auto offset_string = String(m_reader.bytes().slice(m_reader.offset(), 10));
+            m_reader.move_by(10);
+            if (!m_reader.consume(' '))
+                return error("Malformed xref entry");
+
+            auto generation_string = String(m_reader.bytes().slice(m_reader.offset(), 5));
+            m_reader.move_by(5);
+            if (!m_reader.consume(' '))
+                return error("Malformed xref entry");
+
+            // 'n' marks an in-use entry, 'f' a free one.
+            auto letter = m_reader.read();
+            if (letter != 'n' && letter != 'f')
+                return error("Malformed xref entry");
+
+            // The line ending sequence can be one of the following:
+            // SP CR, SP LF, or CR LF
+            if (m_reader.matches(' ')) {
+                m_reader.consume();
+                auto ch = m_reader.consume();
+                if (ch != '\r' && ch != '\n')
+                    return error("Malformed xref entry");
+            } else {
+                if (!m_reader.matches("\r\n"))
+                    return error("Malformed xref entry");
+                m_reader.move_by(2);
+            }
+
+            auto offset = strtol(offset_string.characters(), nullptr, 10);
+            auto generation = strtol(generation_string.characters(), nullptr, 10);
+
+            entries.append({ offset, static_cast<u16>(generation), letter == 'n' });
+        }
+
+        table->add_section({ starting_index, object_count, entries });
+    } while (m_reader.matches_number());
+
+    return table;
+}
+
+// Parses a file trailer: optional blank lines, the "trailer" keyword, the trailer
+// dictionary, "startxref" with its offset line, and the closing "%%EOF" marker.
+// Returns the trailer dictionary, or an error if any of these pieces is missing.
+PDFErrorOr<NonnullRefPtr<DictObject>> DocumentParser::parse_file_trailer()
+{
+    while (m_reader.matches_eol())
+        m_reader.consume_eol();
+
+    if (!m_reader.matches("trailer"))
+        return error("Expected \"trailer\" keyword");
+    m_reader.move_by(7);
+    m_reader.consume_whitespace();
+    auto dict = TRY(parse_dict());
+
+    if (!m_reader.matches("startxref"))
+        return error("Expected \"startxref\"");
+    m_reader.move_by(9);
+    m_reader.consume_whitespace();
+
+    // Skip over the xref offset itself; only the line structure matters here.
+    m_reader.move_until([&](auto) { return m_reader.matches_eol(); });
+    // A truncated file may end before the line break; that is a parse error,
+    // not an internal invariant violation.
+    if (!m_reader.consume_eol())
+        return error("Expected newline after \"startxref\" offset");
+    if (!m_reader.matches("%%EOF"))
+        return error("Expected \"%%EOF\"");
+
+    m_reader.move_by(5);
+    m_reader.consume_whitespace();
+    return dict;
+}
+
+// Decodes the page offset hint table header from the start of `hint_stream_bytes`.
+// The header is a sequence of big-endian u32/u16 fields, read in the declaration order
+// of PageOffsetHintTable. Returns an error if the stream is too short or if any of the
+// "bits required" fields exceeds 32.
+PDFErrorOr<DocumentParser::PageOffsetHintTable> DocumentParser::parse_page_offset_hint_table(ReadonlyBytes hint_stream_bytes)
+{
+    // The in-memory struct is at least as large as the 36 serialized header bytes read
+    // below, so this check also keeps every read in bounds.
+    if (hint_stream_bytes.size() < sizeof(PageOffsetHintTable))
+        return error("Hint stream is too small");
+
+    size_t offset = 0;
+
+    // The fields are assembled byte by byte: they are not naturally aligned within the
+    // stream (a u32 follows a u16 at offset 10), so they must not be loaded through a
+    // casted pointer.
+    auto read_u32 = [&]() -> u32 {
+        u32 value = 0;
+        for (size_t i = 0; i < 4; i++)
+            value = (value << 8) | hint_stream_bytes[offset + i];
+        offset += 4;
+        return value;
+    };
+
+    auto read_u16 = [&]() -> u16 {
+        auto value = static_cast<u16>((hint_stream_bytes[offset] << 8) | hint_stream_bytes[offset + 1]);
+        offset += 2;
+        return value;
+    };
+
+    // Initializers in a braced list are evaluated left to right, so the reads happen
+    // in field order.
+    PageOffsetHintTable hint_table {
+        read_u32(),
+        read_u32(),
+        read_u16(),
+        read_u32(),
+        read_u16(),
+        read_u32(),
+        read_u16(),
+        read_u32(),
+        read_u16(),
+        read_u16(),
+        read_u16(),
+        read_u16(),
+        read_u16(),
+    };
+
+    // All of the bits_required_for_xyz fields must be <= 32, since all of the numeric
+    // fields in PageOffsetHintTableEntry are u32. The values come from the file, so a
+    // violation is reported as a parse error.
+    if (hint_table.bits_required_for_object_number > 32
+        || hint_table.bits_required_for_page_length > 32
+        || hint_table.bits_required_for_content_stream_offsets > 32
+        || hint_table.bits_required_for_content_stream_length > 32
+        || hint_table.bits_required_for_number_of_shared_obj_refs > 32
+        || hint_table.bits_required_for_greatest_shared_obj_identifier > 32
+        || hint_table.bits_required_for_fraction_numerator > 32) {
+        return error("Hint table bit width is larger than 32");
+    }
+
+    return hint_table;
+}
+
+// Decodes the bit-packed per-page entries that follow the page offset hint table header.
+// One entry is produced per page (page count taken from the linearization dictionary).
+// The data is laid out field by field: all values of one field for every page, then all
+// values of the next field, each using the bit width given in `hint_table`.
+Vector<DocumentParser::PageOffsetHintTableEntry> DocumentParser::parse_all_page_offset_hint_table_entries(PageOffsetHintTable const& hint_table, ReadonlyBytes hint_stream_bytes)
+{
+    // Serialized size of the header: four u32 and one u16 interleaved (4+4+2+4+2+4+2+4)
+    // plus five trailing u16. This differs from sizeof(PageOffsetHintTable), which
+    // includes alignment padding.
+    constexpr size_t serialized_header_size = 36;
+
+    InputMemoryStream input_stream(hint_stream_bytes);
+    input_stream.seek(serialized_header_size);
+
+    InputBitStream bit_stream(input_stream);
+
+    size_t const number_of_pages = m_linearization_dictionary.value().number_of_pages;
+    Vector<PageOffsetHintTableEntry> entries;
+    for (size_t i = 0; i < number_of_pages; i++)
+        entries.append(PageOffsetHintTableEntry {});
+
+    // Reads one `bit_size`-wide value per page into the given scalar field.
+    // A width of zero means the field is not stored at all.
+    auto parse_int_entry = [&](u32 PageOffsetHintTableEntry::*field, u32 bit_size) {
+        if (bit_size == 0)
+            return;
+
+        for (size_t page = 0; page < number_of_pages; page++)
+            entries[page].*field = bit_stream.read_bits(bit_size);
+    };
+
+    // Reads, for every page after the first, number_of_shared_objects values of
+    // `bit_size` bits each into the given vector field.
+    auto parse_vector_entry = [&](Vector<u32> PageOffsetHintTableEntry::*field, u32 bit_size) {
+        if (bit_size == 0)
+            return;
+
+        for (size_t page = 1; page < number_of_pages; page++) {
+            auto number_of_shared_objects = entries[page].number_of_shared_objects;
+            Vector<u32> items;
+            items.ensure_capacity(number_of_shared_objects);
+
+            for (size_t i = 0; i < number_of_shared_objects; i++)
+                items.unchecked_append(bit_stream.read_bits(bit_size));
+
+            entries[page].*field = move(items);
+        }
+    };
+
+    // The call order below is the order of the fields in the stream; do not reorder.
+    parse_int_entry(&PageOffsetHintTableEntry::objects_in_page_number, hint_table.bits_required_for_object_number);
+    parse_int_entry(&PageOffsetHintTableEntry::page_length_number, hint_table.bits_required_for_page_length);
+    parse_int_entry(&PageOffsetHintTableEntry::number_of_shared_objects, hint_table.bits_required_for_number_of_shared_obj_refs);
+    parse_vector_entry(&PageOffsetHintTableEntry::shared_object_identifiers, hint_table.bits_required_for_greatest_shared_obj_identifier);
+    parse_vector_entry(&PageOffsetHintTableEntry::shared_object_location_numerators, hint_table.bits_required_for_fraction_numerator);
+    parse_int_entry(&PageOffsetHintTableEntry::page_content_stream_offset_number, hint_table.bits_required_for_content_stream_offsets);
+    parse_int_entry(&PageOffsetHintTableEntry::page_content_stream_length_number, hint_table.bits_required_for_content_stream_length);
+
+    return entries;
+}
+
+// Switches the reader to backwards mode and walks line break by line break until it
+// finds a "%%EOF" marker that is itself followed by a line break. Returns true with
+// the reader positioned past that line break, or false if the reader runs out first.
+bool DocumentParser::navigate_to_before_eof_marker()
+{
+    m_reader.set_reading_backwards();
+
+    while (!m_reader.done()) {
+        m_reader.move_until([&](auto) { return m_reader.matches_eol(); });
+        if (m_reader.done())
+            return false;
+
+        m_reader.consume_eol();
+        if (m_reader.matches("%%EOF")) {
+            m_reader.move_by(5);
+            if (m_reader.matches_eol()) {
+                m_reader.consume_eol();
+                return true;
+            }
+        }
+        // Not the marker (or not a clean one): keep scanning.
+    }
+
+    return false;
+}
+
+// Switches the reader to backwards mode and walks line break by line break until it
+// finds a "startxref" keyword that is followed by a line break. On success the reader
+// is moved to the offset one past the line break at which that scan step began, and
+// true is returned; false is returned if the reader runs out first.
+bool DocumentParser::navigate_to_after_startxref()
+{
+    m_reader.set_reading_backwards();
+
+    while (!m_reader.done()) {
+        m_reader.move_until([&](auto) { return m_reader.matches_eol(); });
+        // Remember where to resume before the line break is consumed.
+        auto resume_offset = m_reader.offset() + 1;
+
+        m_reader.consume_eol();
+        if (m_reader.matches("startxref")) {
+            m_reader.move_by(9);
+            if (m_reader.matches_eol()) {
+                m_reader.move_to(resume_offset);
+                return true;
+            }
+        }
+    }
+
+    return false;
+}
+
+// Specialized dictionary parser for the object with index `object_index`: returns the
+// parsed dictionary if the object is a page tree node, and a null RefPtr as soon as it
+// becomes clear that it is not (a key other than Type/Parent/Kids/Count appears, or
+// /Type is not the name Pages). Returns an error if the object is malformed.
+// The index must be present in the xref table.
+PDFErrorOr<RefPtr<DictObject>> DocumentParser::conditionally_parse_page_tree_node(u32 object_index)
+{
+    VERIFY(m_xref_table->has_object(object_index));
+    auto byte_offset = m_xref_table->byte_offset_for_object(object_index);
+
+    m_reader.move_to(byte_offset);
+    // Skip the two numbers that precede the "obj" keyword.
+    TRY(parse_number());
+    TRY(parse_number());
+    if (!m_reader.matches("obj"))
+        return error(String::formatted("Invalid page tree offset {}", object_index));
+
+    m_reader.move_by(3);
+    m_reader.consume_whitespace();
+
+    // The object body comes from the file, so a missing "<<" is a parse error rather
+    // than an internal invariant violation.
+    if (!m_reader.consume('<') || !m_reader.consume('<'))
+        return error(String::formatted("Expected dictionary for page tree object {}", object_index));
+
+    m_reader.consume_whitespace();
+    HashMap<FlyString, Value> map;
+
+    while (true) {
+        if (m_reader.matches(">>"))
+            break;
+        auto name = TRY(parse_name());
+        auto name_string = name->name();
+        if (!name_string.is_one_of(CommonNames::Type, CommonNames::Parent, CommonNames::Kids, CommonNames::Count)) {
+            // This is a page, not a page tree node
+            return RefPtr<DictObject> {};
+        }
+
+        auto value = TRY(parse_value());
+        if (name_string == CommonNames::Type) {
+            // Only /Type /Pages identifies a page tree node.
+            if (!value.has<NonnullRefPtr<Object>>())
+                return RefPtr<DictObject> {};
+            auto type_object = value.get<NonnullRefPtr<Object>>();
+            if (!type_object->is<NameObject>())
+                return RefPtr<DictObject> {};
+            auto type_name = type_object->cast<NameObject>();
+            if (type_name->name() != CommonNames::Pages)
+                return RefPtr<DictObject> {};
+        }
+        map.set(name->name(), value);
+    }
+
+    // The loop only exits when ">>" is under the reader, so this cannot fail.
+    VERIFY(m_reader.consume('>') && m_reader.consume('>'));
+    m_reader.consume_whitespace();
+
+    return make_object<DictObject>(map);
+}
+
+}
+
+namespace AK {
+
+// Debug formatter: renders a LinearizationDictionary as a brace-delimited block with
+// one "name=value" line per field.
+template<>
+struct Formatter<PDF::DocumentParser::LinearizationDictionary> : Formatter<StringView> {
+    ErrorOr<void> format(FormatBuilder& format_builder, PDF::DocumentParser::LinearizationDictionary const& dict)
+    {
+        StringBuilder text;
+
+        text.append("{\n"sv);
+        text.appendff(" length_of_file={}\n", dict.length_of_file);
+        text.appendff(" primary_hint_stream_offset={}\n", dict.primary_hint_stream_offset);
+        text.appendff(" primary_hint_stream_length={}\n", dict.primary_hint_stream_length);
+        text.appendff(" overflow_hint_stream_offset={}\n", dict.overflow_hint_stream_offset);
+        text.appendff(" overflow_hint_stream_length={}\n", dict.overflow_hint_stream_length);
+        text.appendff(" first_page_object_number={}\n", dict.first_page_object_number);
+        text.appendff(" offset_of_first_page_end={}\n", dict.offset_of_first_page_end);
+        text.appendff(" number_of_pages={}\n", dict.number_of_pages);
+        text.appendff(" offset_of_main_xref_table={}\n", dict.offset_of_main_xref_table);
+        text.appendff(" first_page={}\n", dict.first_page);
+        text.append('}');
+
+        // Delegate to the StringView formatter for the actual output.
+        return Formatter<StringView>::format(format_builder, text.to_string());
+    }
+};
+
+// Debug formatter: renders a PageOffsetHintTable header as a brace-delimited block with
+// one "name=value" line per field.
+template<>
+struct Formatter<PDF::DocumentParser::PageOffsetHintTable> : Formatter<StringView> {
+    ErrorOr<void> format(FormatBuilder& format_builder, PDF::DocumentParser::PageOffsetHintTable const& table)
+    {
+        StringBuilder text;
+
+        text.append("{\n"sv);
+        text.appendff(" least_number_of_objects_in_a_page={}\n", table.least_number_of_objects_in_a_page);
+        text.appendff(" location_of_first_page_object={}\n", table.location_of_first_page_object);
+        text.appendff(" bits_required_for_object_number={}\n", table.bits_required_for_object_number);
+        text.appendff(" least_length_of_a_page={}\n", table.least_length_of_a_page);
+        text.appendff(" bits_required_for_page_length={}\n", table.bits_required_for_page_length);
+        text.appendff(" least_offset_of_any_content_stream={}\n", table.least_offset_of_any_content_stream);
+        text.appendff(" bits_required_for_content_stream_offsets={}\n", table.bits_required_for_content_stream_offsets);
+        text.appendff(" least_content_stream_length={}\n", table.least_content_stream_length);
+        text.appendff(" bits_required_for_content_stream_length={}\n", table.bits_required_for_content_stream_length);
+        text.appendff(" bits_required_for_number_of_shared_obj_refs={}\n", table.bits_required_for_number_of_shared_obj_refs);
+        text.appendff(" bits_required_for_greatest_shared_obj_identifier={}\n", table.bits_required_for_greatest_shared_obj_identifier);
+        text.appendff(" bits_required_for_fraction_numerator={}\n", table.bits_required_for_fraction_numerator);
+        text.appendff(" shared_object_reference_fraction_denominator={}\n", table.shared_object_reference_fraction_denominator);
+        text.append('}');
+
+        // Delegate to the StringView formatter for the actual output.
+        return Formatter<StringView>::format(format_builder, text.to_string());
+    }
+};
+
+// Debug formatter: renders a PageOffsetHintTableEntry as a brace-delimited block; the
+// two vector fields are printed as space-separated lists in square brackets.
+template<>
+struct Formatter<PDF::DocumentParser::PageOffsetHintTableEntry> : Formatter<StringView> {
+    ErrorOr<void> format(FormatBuilder& format_builder, PDF::DocumentParser::PageOffsetHintTableEntry const& entry)
+    {
+        StringBuilder text;
+
+        text.append("{\n"sv);
+        text.appendff(" objects_in_page_number={}\n", entry.objects_in_page_number);
+        text.appendff(" page_length_number={}\n", entry.page_length_number);
+        text.appendff(" number_of_shared_objects={}\n", entry.number_of_shared_objects);
+
+        text.append(" shared_object_identifiers=["sv);
+        for (auto const& id : entry.shared_object_identifiers)
+            text.appendff(" {}", id);
+        text.append(" ]\n"sv);
+
+        text.append(" shared_object_location_numerators=["sv);
+        for (auto const& value : entry.shared_object_location_numerators)
+            text.appendff(" {}", value);
+        text.append(" ]\n"sv);
+
+        text.appendff(" page_content_stream_offset_number={}\n", entry.page_content_stream_offset_number);
+        text.appendff(" page_content_stream_length_number={}\n", entry.page_content_stream_length_number);
+        text.append('}');
+
+        // Delegate to the StringView formatter for the actual output.
+        return Formatter<StringView>::format(format_builder, text.to_string());
+    }
+};
+
+}
diff --git a/Userland/Libraries/LibPDF/DocumentParser.h b/Userland/Libraries/LibPDF/DocumentParser.h
new file mode 100644
index 0000000000..fef5f40ad4
--- /dev/null
+++ b/Userland/Libraries/LibPDF/DocumentParser.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2021-2022, Matthew Olsson <mattco@serenityos.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#pragma once
+
+#include <LibPDF/Parser.h>
+
+namespace PDF {
+
+// Parser for the document-level structure of a PDF file: the header, the optional
+// linearization dictionary, the cross-reference table(s) and the trailer. Object-level
+// parsing is inherited from Parser.
+class DocumentParser final : public RefCounted<DocumentParser>
+ , public Parser {
+public:
+ DocumentParser(Document*, ReadonlyBytes);
+
+ // Outcome of inspecting the first object of the file for a linearization dictionary.
+ enum class LinearizationResult {
+ NotLinearized,
+ Linearized,
+ };
+
+ // The trailer dictionary; null until initialize() has parsed one.
+ [[nodiscard]] ALWAYS_INLINE RefPtr<DictObject> const& trailer() const { return m_trailer; }
+
+ // Parses the header and initializes the xref table and trailer
+ PDFErrorOr<void> initialize();
+
+ // Parses the indirect object with the given index at the offset recorded in the
+ // xref table and returns its value. The index must exist in the xref table.
+ PDFErrorOr<Value> parse_object_with_index(u32 index);
+
+ // Specialized version of parse_dict which aborts early (returning a null RefPtr)
+ // if the dict being parsed is not a page tree node
+ PDFErrorOr<RefPtr<DictObject>> conditionally_parse_page_tree_node(u32 object_index);
+
+private:
+ // Contents of the linearization parameter dictionary. The fields are filled
+ // positionally, so their order must not change. Optional values that are absent
+ // from the file are stored as NumericLimits<u32>::max().
+ struct LinearizationDictionary {
+ u32 length_of_file { 0 }; // /L
+ u32 primary_hint_stream_offset { 0 }; // /H, element 0
+ u32 primary_hint_stream_length { 0 }; // /H, element 1
+ u32 overflow_hint_stream_offset { 0 }; // /H, element 2 (optional)
+ u32 overflow_hint_stream_length { 0 }; // /H, element 3 (optional)
+ u32 first_page_object_number { 0 }; // /O
+ u32 offset_of_first_page_end { 0 }; // /E
+ u16 number_of_pages { 0 }; // /N
+ u32 offset_of_main_xref_table { 0 }; // /T
+ u32 first_page { 0 }; // The page to initially open (I think, the spec isn't all that clear here)
+ };
+
+ // Header of the page offset hint table. The fields are read from the start of the
+ // hint stream as big-endian integers in exactly this order, so the order must not change.
+ struct PageOffsetHintTable {
+ u32 least_number_of_objects_in_a_page { 0 };
+ u32 location_of_first_page_object { 0 };
+ u16 bits_required_for_object_number { 0 };
+ u32 least_length_of_a_page { 0 };
+ u16 bits_required_for_page_length { 0 };
+ u32 least_offset_of_any_content_stream { 0 };
+ u16 bits_required_for_content_stream_offsets { 0 };
+ u32 least_content_stream_length { 0 };
+ u16 bits_required_for_content_stream_length { 0 };
+ u16 bits_required_for_number_of_shared_obj_refs { 0 };
+ u16 bits_required_for_greatest_shared_obj_identifier { 0 };
+ u16 bits_required_for_fraction_numerator { 0 };
+ u16 shared_object_reference_fraction_denominator { 0 };
+ };
+
+ // Per-page entry of the page offset hint table; one is created for every page.
+ // Each value is read with the bit width given by the matching bits_required_for_*
+ // field of PageOffsetHintTable.
+ struct PageOffsetHintTableEntry {
+ u32 objects_in_page_number { 0 };
+ u32 page_length_number { 0 };
+ u32 number_of_shared_objects { 0 };
+ Vector<u32> shared_object_identifiers {};
+ Vector<u32> shared_object_location_numerators {};
+ u32 page_content_stream_offset_number { 0 };
+ u32 page_content_stream_length_number { 0 };
+ };
+
+ // The formatters need access to the private structs above.
+ friend struct AK::Formatter<LinearizationDictionary>;
+ friend struct AK::Formatter<PageOffsetHintTable>;
+ friend struct AK::Formatter<PageOffsetHintTableEntry>;
+
+ // Steps of initialize(), in the order they are used.
+ PDFErrorOr<void> parse_header();
+ PDFErrorOr<LinearizationResult> initialize_linearization_dict();
+ PDFErrorOr<void> initialize_linearized_xref_table();
+ PDFErrorOr<void> initialize_non_linearized_xref_table();
+ // Hint table parsing; the results are currently discarded by initialize_hint_tables().
+ PDFErrorOr<void> initialize_hint_tables();
+ PDFErrorOr<PageOffsetHintTable> parse_page_offset_hint_table(ReadonlyBytes hint_stream_bytes);
+ Vector<PageOffsetHintTableEntry> parse_all_page_offset_hint_table_entries(PageOffsetHintTable const&, ReadonlyBytes hint_stream_bytes);
+ PDFErrorOr<NonnullRefPtr<XRefTable>> parse_xref_table();
+ PDFErrorOr<NonnullRefPtr<DictObject>> parse_file_trailer();
+
+ // Backwards scans from the end of the file; both return false if the keyword is not found.
+ bool navigate_to_before_eof_marker();
+ bool navigate_to_after_startxref();
+
+ // Set by initialize(); null before that.
+ RefPtr<XRefTable> m_xref_table;
+ RefPtr<DictObject> m_trailer;
+ // Set only when the file starts with a valid linearization dictionary.
+ Optional<LinearizationDictionary> m_linearization_dictionary;
+};
+
+}
diff --git a/Userland/Libraries/LibPDF/Parser.cpp b/Userland/Libraries/LibPDF/Parser.cpp
index 36b018d89c..4df9d59f6e 100644
--- a/Userland/Libraries/LibPDF/Parser.cpp
+++ b/Userland/Libraries/LibPDF/Parser.cpp
@@ -4,8 +4,6 @@
* SPDX-License-Identifier: BSD-2-Clause
*/
-#include <AK/BitStream.h>
-#include <AK/MemoryStream.h>
#include <AK/ScopeGuard.h>
#include <LibPDF/CommonNames.h>
#include <LibPDF/Document.h>
@@ -16,17 +14,11 @@
namespace PDF {
-template<typename T, typename... Args>
-static NonnullRefPtr<T> make_object(Args... args) requires(IsBaseOf<Object, T>)
-{
- return adopt_ref(*new T(forward<Args>(args)...));
-}
-
PDFErrorOr<Vector<Operator>> Parser::parse_operators(Document* document, ReadonlyBytes bytes)
{
- auto parser = adopt_ref(*new Parser(document, bytes));
- parser->m_disable_encryption = true;
- return parser->parse_operators();
+ Parser parser(document, bytes);
+ parser.m_disable_encryption = true;
+ return parser.parse_operators();
}
Parser::Parser(Document* document, ReadonlyBytes bytes)
@@ -45,494 +37,6 @@ void Parser::set_document(WeakPtr<Document> const& document)
m_document = document;
}
-PDFErrorOr<void> Parser::initialize()
-{
- TRY(parse_header());
-
- auto const linearization_result = TRY(initialize_linearization_dict());
-
- if (linearization_result == LinearizationResult::NotLinearized)
- return initialize_non_linearized_xref_table();
-
- bool is_linearized = m_linearization_dictionary.has_value();
- if (is_linearized) {
- // The file may have been linearized at one point, but could have been updated afterwards,
- // which means it is no longer a linearized PDF file.
- is_linearized = m_linearization_dictionary.value().length_of_file == m_reader.bytes().size();
-
- if (!is_linearized) {
- // FIXME: The file shouldn't be treated as linearized, yet the xref tables are still
- // split. This might take some tweaking to ensure correct behavior, which can be
- // implemented later.
- TODO();
- }
- }
-
- if (is_linearized)
- return initialize_linearized_xref_table();
-
- return initialize_non_linearized_xref_table();
-}
-
-PDFErrorOr<Value> Parser::parse_object_with_index(u32 index)
-{
- VERIFY(m_xref_table->has_object(index));
- auto byte_offset = m_xref_table->byte_offset_for_object(index);
- m_reader.move_to(byte_offset);
- auto indirect_value = TRY(parse_indirect_value());
- VERIFY(indirect_value->index() == index);
- return indirect_value->value();
-}
-
-PDFErrorOr<void> Parser::parse_header()
-{
- // FIXME: Do something with the version?
- m_reader.set_reading_forwards();
- if (m_reader.remaining() == 0)
- return error("Empty PDF document");
-
- m_reader.move_to(0);
- if (m_reader.remaining() < 8 || !m_reader.matches("%PDF-"))
- return error("Not a PDF document");
-
- m_reader.move_by(5);
-
- char major_ver = m_reader.read();
- if (major_ver != '1' && major_ver != '2')
- return error(String::formatted("Unknown major version \"{}\"", major_ver));
-
- if (m_reader.read() != '.')
- return error("Malformed PDF version");
-
- char minor_ver = m_reader.read();
- if (minor_ver < '0' || minor_ver > '7')
- return error(String::formatted("Unknown minor version \"{}\"", minor_ver));
-
- m_reader.consume_eol();
-
- // Parse optional high-byte comment, which signifies a binary file
- // FIXME: Do something with this?
- auto comment = parse_comment();
- if (!comment.is_empty()) {
- auto binary = comment.length() >= 4;
- if (binary) {
- for (size_t i = 0; i < comment.length() && binary; i++)
- binary = static_cast<u8>(comment[i]) > 128;
- }
- }
-
- return {};
-}
-
-PDFErrorOr<Parser::LinearizationResult> Parser::initialize_linearization_dict()
-{
- // parse_header() is called immediately before this, so we are at the right location
- auto indirect_value = Value(*TRY(parse_indirect_value()));
- auto dict_value = TRY(m_document->resolve(indirect_value));
- if (!dict_value.has<NonnullRefPtr<Object>>())
- return error("Expected linearization object to be a dictionary");
-
- auto dict_object = dict_value.get<NonnullRefPtr<Object>>();
- if (!dict_object->is<DictObject>())
- return LinearizationResult::NotLinearized;
-
- auto dict = dict_object->cast<DictObject>();
-
- if (!dict->contains(CommonNames::Linearized))
- return LinearizationResult::NotLinearized;
-
- if (!dict->contains(CommonNames::L, CommonNames::H, CommonNames::O, CommonNames::E, CommonNames::N, CommonNames::T))
- return error("Malformed linearization dictionary");
-
- auto length_of_file = dict->get_value(CommonNames::L);
- auto hint_table = dict->get_value(CommonNames::H);
- auto first_page_object_number = dict->get_value(CommonNames::O);
- auto offset_of_first_page_end = dict->get_value(CommonNames::E);
- auto number_of_pages = dict->get_value(CommonNames::N);
- auto offset_of_main_xref_table = dict->get_value(CommonNames::T);
- auto first_page = dict->get(CommonNames::P).value_or({});
-
- // Validation
- if (!length_of_file.has_u32()
- || !hint_table.has<NonnullRefPtr<Object>>()
- || !first_page_object_number.has_u32()
- || !number_of_pages.has_u16()
- || !offset_of_main_xref_table.has_u32()
- || (!first_page.has<Empty>() && !first_page.has_u32())) {
- return error("Malformed linearization dictionary parameters");
- }
-
- auto hint_table_array = hint_table.get<NonnullRefPtr<Object>>()->cast<ArrayObject>();
- auto hint_table_size = hint_table_array->size();
- if (hint_table_size != 2 && hint_table_size != 4)
- return error("Expected hint table to be of length 2 or 4");
-
- auto primary_hint_stream_offset = hint_table_array->at(0);
- auto primary_hint_stream_length = hint_table_array->at(1);
- Value overflow_hint_stream_offset;
- Value overflow_hint_stream_length;
-
- if (hint_table_size == 4) {
- overflow_hint_stream_offset = hint_table_array->at(2);
- overflow_hint_stream_length = hint_table_array->at(3);
- }
-
- if (!primary_hint_stream_offset.has_u32()
- || !primary_hint_stream_length.has_u32()
- || (!overflow_hint_stream_offset.has<Empty>() && !overflow_hint_stream_offset.has_u32())
- || (!overflow_hint_stream_length.has<Empty>() && !overflow_hint_stream_length.has_u32())) {
- return error("Malformed hint stream");
- }
-
- m_linearization_dictionary = LinearizationDictionary {
- length_of_file.get_u32(),
- primary_hint_stream_offset.get_u32(),
- primary_hint_stream_length.get_u32(),
- overflow_hint_stream_offset.has<Empty>() ? NumericLimits<u32>::max() : overflow_hint_stream_offset.get_u32(),
- overflow_hint_stream_length.has<Empty>() ? NumericLimits<u32>::max() : overflow_hint_stream_length.get_u32(),
- first_page_object_number.get_u32(),
- offset_of_first_page_end.get_u32(),
- number_of_pages.get_u16(),
- offset_of_main_xref_table.get_u32(),
- first_page.has<Empty>() ? NumericLimits<u32>::max() : first_page.get_u32(),
- };
-
- return LinearizationResult::Linearized;
-}
-
-PDFErrorOr<void> Parser::initialize_linearized_xref_table()
-{
- // The linearization parameter dictionary has just been parsed, and the xref table
- // comes immediately after it. We are in the correct spot.
- m_xref_table = TRY(parse_xref_table());
- m_trailer = TRY(parse_file_trailer());
-
- // Also parse the main xref table and merge into the first-page xref table. Note
- // that we don't use the main xref table offset from the linearization dict because
- // for some reason, it specified the offset of the whitespace after the object
- // index start and length? So it's much easier to do it this way.
- auto main_xref_table_offset = m_trailer->get_value(CommonNames::Prev).to_int();
- m_reader.move_to(main_xref_table_offset);
- auto main_xref_table = TRY(parse_xref_table());
- TRY(m_xref_table->merge(move(*main_xref_table)));
- return {};
-}
-
-PDFErrorOr<void> Parser::initialize_hint_tables()
-{
- auto linearization_dict = m_linearization_dictionary.value();
- auto primary_offset = linearization_dict.primary_hint_stream_offset;
- auto overflow_offset = linearization_dict.overflow_hint_stream_offset;
-
- auto parse_hint_table = [&](size_t offset) -> RefPtr<StreamObject> {
- m_reader.move_to(offset);
- auto stream_indirect_value = parse_indirect_value();
- if (stream_indirect_value.is_error())
- return {};
-
- auto stream_value = stream_indirect_value.value()->value();
- if (!stream_value.has<NonnullRefPtr<Object>>())
- return {};
-
- auto stream_object = stream_value.get<NonnullRefPtr<Object>>();
- if (!stream_object->is<StreamObject>())
- return {};
-
- return stream_object->cast<StreamObject>();
- };
-
- auto primary_hint_stream = parse_hint_table(primary_offset);
- if (!primary_hint_stream)
- return error("Invalid primary hint stream");
-
- RefPtr<StreamObject> overflow_hint_stream;
- if (overflow_offset != NumericLimits<u32>::max())
- overflow_hint_stream = parse_hint_table(overflow_offset);
-
- ByteBuffer possible_merged_stream_buffer;
- ReadonlyBytes hint_stream_bytes;
-
- if (overflow_hint_stream) {
- auto primary_size = primary_hint_stream->bytes().size();
- auto overflow_size = overflow_hint_stream->bytes().size();
- auto total_size = primary_size + overflow_size;
-
- auto buffer_result = ByteBuffer::create_uninitialized(total_size);
- if (buffer_result.is_error())
- return Error { Error::Type::Internal, "Failed to allocate hint stream buffer" };
- possible_merged_stream_buffer = buffer_result.release_value();
- MUST(possible_merged_stream_buffer.try_append(primary_hint_stream->bytes()));
- MUST(possible_merged_stream_buffer.try_append(overflow_hint_stream->bytes()));
- hint_stream_bytes = possible_merged_stream_buffer.bytes();
- } else {
- hint_stream_bytes = primary_hint_stream->bytes();
- }
-
- auto hint_table = TRY(parse_page_offset_hint_table(hint_stream_bytes));
- auto hint_table_entries = parse_all_page_offset_hint_table_entries(hint_table, hint_stream_bytes);
-
- // FIXME: Do something with the hint tables
- return {};
-}
-
-PDFErrorOr<void> Parser::initialize_non_linearized_xref_table()
-{
- m_reader.move_to(m_reader.bytes().size() - 1);
- if (!navigate_to_before_eof_marker())
- return error("No EOF marker");
- if (!navigate_to_after_startxref())
- return error("No xref");
-
- m_reader.set_reading_forwards();
- auto xref_offset_value = parse_number();
- if (xref_offset_value.is_error() || !xref_offset_value.value().has<int>())
- return error("Invalid xref offset");
- auto xref_offset = xref_offset_value.value().get<int>();
-
- m_reader.move_to(xref_offset);
- m_xref_table = TRY(parse_xref_table());
- m_trailer = TRY(parse_file_trailer());
- return {};
-}
-
-PDFErrorOr<NonnullRefPtr<XRefTable>> Parser::parse_xref_table()
-{
- if (!m_reader.matches("xref"))
- return error("Expected \"xref\"");
- m_reader.move_by(4);
- if (!m_reader.consume_eol())
- return error("Expected newline after \"xref\"");
-
- auto table = adopt_ref(*new XRefTable());
-
- do {
- if (m_reader.matches("trailer"))
- return table;
-
- Vector<XRefEntry> entries;
-
- auto starting_index_value = TRY(parse_number());
- auto starting_index = starting_index_value.get<int>();
- auto object_count_value = TRY(parse_number());
- auto object_count = object_count_value.get<int>();
-
- for (int i = 0; i < object_count; i++) {
- auto offset_string = String(m_reader.bytes().slice(m_reader.offset(), 10));
- m_reader.move_by(10);
- if (!m_reader.consume(' '))
- return error("Malformed xref entry");
-
- auto generation_string = String(m_reader.bytes().slice(m_reader.offset(), 5));
- m_reader.move_by(5);
- if (!m_reader.consume(' '))
- return error("Malformed xref entry");
-
- auto letter = m_reader.read();
- if (letter != 'n' && letter != 'f')
- return error("Malformed xref entry");
-
- // The line ending sequence can be one of the following:
- // SP CR, SP LF, or CR LF
- if (m_reader.matches(' ')) {
- m_reader.consume();
- auto ch = m_reader.consume();
- if (ch != '\r' && ch != '\n')
- return error("Malformed xref entry");
- } else {
- if (!m_reader.matches("\r\n"))
- return error("Malformed xref entry");
- m_reader.move_by(2);
- }
-
- auto offset = strtol(offset_string.characters(), nullptr, 10);
- auto generation = strtol(generation_string.characters(), nullptr, 10);
-
- entries.append({ offset, static_cast<u16>(generation), letter == 'n' });
- }
-
- table->add_section({ starting_index, object_count, entries });
- } while (m_reader.matches_number());
-
- return table;
-}
-
-PDFErrorOr<NonnullRefPtr<DictObject>> Parser::parse_file_trailer()
-{
- while (m_reader.matches_eol())
- m_reader.consume_eol();
-
- if (!m_reader.matches("trailer"))
- return error("Expected \"trailer\" keyword");
- m_reader.move_by(7);
- m_reader.consume_whitespace();
- auto dict = TRY(parse_dict());
-
- if (!m_reader.matches("startxref"))
- return error("Expected \"startxref\"");
- m_reader.move_by(9);
- m_reader.consume_whitespace();
-
- m_reader.move_until([&](auto) { return m_reader.matches_eol(); });
- VERIFY(m_reader.consume_eol());
- if (!m_reader.matches("%%EOF"))
- return error("Expected \"%%EOF\"");
-
- m_reader.move_by(5);
- m_reader.consume_whitespace();
- return dict;
-}
-
-PDFErrorOr<Parser::PageOffsetHintTable> Parser::parse_page_offset_hint_table(ReadonlyBytes hint_stream_bytes)
-{
- if (hint_stream_bytes.size() < sizeof(PageOffsetHintTable))
- return error("Hint stream is too small");
-
- size_t offset = 0;
-
- auto read_u32 = [&] {
- u32 data = reinterpret_cast<const u32*>(hint_stream_bytes.data() + offset)[0];
- offset += 4;
- return AK::convert_between_host_and_big_endian(data);
- };
-
- auto read_u16 = [&] {
- u16 data = reinterpret_cast<const u16*>(hint_stream_bytes.data() + offset)[0];
- offset += 2;
- return AK::convert_between_host_and_big_endian(data);
- };
-
- PageOffsetHintTable hint_table {
- read_u32(),
- read_u32(),
- read_u16(),
- read_u32(),
- read_u16(),
- read_u32(),
- read_u16(),
- read_u32(),
- read_u16(),
- read_u16(),
- read_u16(),
- read_u16(),
- read_u16(),
- };
-
- // Verify that all of the bits_required_for_xyz fields are <= 32, since all of the numeric
- // fields in PageOffsetHintTableEntry are u32
- VERIFY(hint_table.bits_required_for_object_number <= 32);
- VERIFY(hint_table.bits_required_for_page_length <= 32);
- VERIFY(hint_table.bits_required_for_content_stream_offsets <= 32);
- VERIFY(hint_table.bits_required_for_content_stream_length <= 32);
- VERIFY(hint_table.bits_required_for_number_of_shared_obj_refs <= 32);
- VERIFY(hint_table.bits_required_for_greatest_shared_obj_identifier <= 32);
- VERIFY(hint_table.bits_required_for_fraction_numerator <= 32);
-
- return hint_table;
-}
-
-Vector<Parser::PageOffsetHintTableEntry> Parser::parse_all_page_offset_hint_table_entries(PageOffsetHintTable const& hint_table, ReadonlyBytes hint_stream_bytes)
-{
- InputMemoryStream input_stream(hint_stream_bytes);
- input_stream.seek(sizeof(PageOffsetHintTable));
-
- InputBitStream bit_stream(input_stream);
-
- auto number_of_pages = m_linearization_dictionary.value().number_of_pages;
- Vector<PageOffsetHintTableEntry> entries;
- for (size_t i = 0; i < number_of_pages; i++)
- entries.append(PageOffsetHintTableEntry {});
-
- auto bits_required_for_object_number = hint_table.bits_required_for_object_number;
- auto bits_required_for_page_length = hint_table.bits_required_for_page_length;
- auto bits_required_for_content_stream_offsets = hint_table.bits_required_for_content_stream_offsets;
- auto bits_required_for_content_stream_length = hint_table.bits_required_for_content_stream_length;
- auto bits_required_for_number_of_shared_obj_refs = hint_table.bits_required_for_number_of_shared_obj_refs;
- auto bits_required_for_greatest_shared_obj_identifier = hint_table.bits_required_for_greatest_shared_obj_identifier;
- auto bits_required_for_fraction_numerator = hint_table.bits_required_for_fraction_numerator;
-
- auto parse_int_entry = [&](u32 PageOffsetHintTableEntry::*field, u32 bit_size) {
- if (bit_size <= 0)
- return;
-
- for (int i = 0; i < number_of_pages; i++) {
- auto& entry = entries[i];
- entry.*field = bit_stream.read_bits(bit_size);
- }
- };
-
- auto parse_vector_entry = [&](Vector<u32> PageOffsetHintTableEntry::*field, u32 bit_size) {
- if (bit_size <= 0)
- return;
-
- for (int page = 1; page < number_of_pages; page++) {
- auto number_of_shared_objects = entries[page].number_of_shared_objects;
- Vector<u32> items;
- items.ensure_capacity(number_of_shared_objects);
-
- for (size_t i = 0; i < number_of_shared_objects; i++)
- items.unchecked_append(bit_stream.read_bits(bit_size));
-
- entries[page].*field = move(items);
- }
- };
-
- parse_int_entry(&PageOffsetHintTableEntry::objects_in_page_number, bits_required_for_object_number);
- parse_int_entry(&PageOffsetHintTableEntry::page_length_number, bits_required_for_page_length);
- parse_int_entry(&PageOffsetHintTableEntry::number_of_shared_objects, bits_required_for_number_of_shared_obj_refs);
- parse_vector_entry(&PageOffsetHintTableEntry::shared_object_identifiers, bits_required_for_greatest_shared_obj_identifier);
- parse_vector_entry(&PageOffsetHintTableEntry::shared_object_location_numerators, bits_required_for_fraction_numerator);
- parse_int_entry(&PageOffsetHintTableEntry::page_content_stream_offset_number, bits_required_for_content_stream_offsets);
- parse_int_entry(&PageOffsetHintTableEntry::page_content_stream_length_number, bits_required_for_content_stream_length);
-
- return entries;
-}
-
-bool Parser::navigate_to_before_eof_marker()
-{
- m_reader.set_reading_backwards();
-
- while (!m_reader.done()) {
- m_reader.move_until([&](auto) { return m_reader.matches_eol(); });
- if (m_reader.done())
- return false;
-
- m_reader.consume_eol();
- if (!m_reader.matches("%%EOF"))
- continue;
-
- m_reader.move_by(5);
- if (!m_reader.matches_eol())
- continue;
- m_reader.consume_eol();
- return true;
- }
-
- return false;
-}
-
-bool Parser::navigate_to_after_startxref()
-{
- m_reader.set_reading_backwards();
-
- while (!m_reader.done()) {
- m_reader.move_until([&](auto) { return m_reader.matches_eol(); });
- auto offset = m_reader.offset() + 1;
-
- m_reader.consume_eol();
- if (!m_reader.matches("startxref"))
- continue;
-
- m_reader.move_by(9);
- if (!m_reader.matches_eol())
- continue;
-
- m_reader.move_to(offset);
- return true;
- }
-
- return false;
-}
-
String Parser::parse_comment()
{
if (!m_reader.matches('%'))
@@ -924,55 +428,6 @@ PDFErrorOr<NonnullRefPtr<DictObject>> Parser::parse_dict()
return make_object<DictObject>(map);
}
-PDFErrorOr<RefPtr<DictObject>> Parser::conditionally_parse_page_tree_node(u32 object_index)
-{
- VERIFY(m_xref_table->has_object(object_index));
- auto byte_offset = m_xref_table->byte_offset_for_object(object_index);
-
- m_reader.move_to(byte_offset);
- TRY(parse_number());
- TRY(parse_number());
- if (!m_reader.matches("obj"))
- return error(String::formatted("Invalid page tree offset {}", object_index));
-
- m_reader.move_by(3);
- m_reader.consume_whitespace();
-
- VERIFY(m_reader.consume('<') && m_reader.consume('<'));
-
- m_reader.consume_whitespace();
- HashMap<FlyString, Value> map;
-
- while (true) {
- if (m_reader.matches(">>"))
- break;
- auto name = TRY(parse_name());
- auto name_string = name->name();
- if (!name_string.is_one_of(CommonNames::Type, CommonNames::Parent, CommonNames::Kids, CommonNames::Count)) {
- // This is a page, not a page tree node
- return RefPtr<DictObject> {};
- }
-
- auto value = TRY(parse_value());
- if (name_string == CommonNames::Type) {
- if (!value.has<NonnullRefPtr<Object>>())
- return RefPtr<DictObject> {};
- auto type_object = value.get<NonnullRefPtr<Object>>();
- if (!type_object->is<NameObject>())
- return RefPtr<DictObject> {};
- auto type_name = type_object->cast<NameObject>();
- if (type_name->name() != CommonNames::Pages)
- return RefPtr<DictObject> {};
- }
- map.set(name->name(), value);
- }
-
- VERIFY(m_reader.consume('>') && m_reader.consume('>'));
- m_reader.consume_whitespace();
-
- return make_object<DictObject>(map);
-}
-
PDFErrorOr<NonnullRefPtr<StreamObject>> Parser::parse_stream(NonnullRefPtr<DictObject> dict)
{
if (!m_reader.matches("stream"))
@@ -984,7 +439,7 @@ PDFErrorOr<NonnullRefPtr<StreamObject>> Parser::parse_stream(NonnullRefPtr<DictO
ReadonlyBytes bytes;
auto maybe_length = dict->get(CommonNames::Length);
- if (maybe_length.has_value() && (!maybe_length->has<Reference>() || m_xref_table)) {
+ if (maybe_length.has_value() && (!maybe_length->has<Reference>())) {
// The PDF writer has kindly provided us with the direct length of the stream
m_reader.save();
auto length = TRY(m_document->resolve_to<int>(maybe_length.value()));
@@ -1080,76 +535,3 @@ Error Parser::error(
}
}
-
-namespace AK {
-
-template<>
-struct Formatter<PDF::Parser::LinearizationDictionary> : Formatter<StringView> {
- ErrorOr<void> format(FormatBuilder& format_builder, PDF::Parser::LinearizationDictionary const& dict)
- {
- StringBuilder builder;
- builder.append("{\n"sv);
- builder.appendff(" length_of_file={}\n", dict.length_of_file);
- builder.appendff(" primary_hint_stream_offset={}\n", dict.primary_hint_stream_offset);
- builder.appendff(" primary_hint_stream_length={}\n", dict.primary_hint_stream_length);
- builder.appendff(" overflow_hint_stream_offset={}\n", dict.overflow_hint_stream_offset);
- builder.appendff(" overflow_hint_stream_length={}\n", dict.overflow_hint_stream_length);
- builder.appendff(" first_page_object_number={}\n", dict.first_page_object_number);
- builder.appendff(" offset_of_first_page_end={}\n", dict.offset_of_first_page_end);
- builder.appendff(" number_of_pages={}\n", dict.number_of_pages);
- builder.appendff(" offset_of_main_xref_table={}\n", dict.offset_of_main_xref_table);
- builder.appendff(" first_page={}\n", dict.first_page);
- builder.append('}');
- return Formatter<StringView>::format(format_builder, builder.to_string());
- }
-};
-
-template<>
-struct Formatter<PDF::Parser::PageOffsetHintTable> : Formatter<StringView> {
- ErrorOr<void> format(FormatBuilder& format_builder, PDF::Parser::PageOffsetHintTable const& table)
- {
- StringBuilder builder;
- builder.append("{\n"sv);
- builder.appendff(" least_number_of_objects_in_a_page={}\n", table.least_number_of_objects_in_a_page);
- builder.appendff(" location_of_first_page_object={}\n", table.location_of_first_page_object);
- builder.appendff(" bits_required_for_object_number={}\n", table.bits_required_for_object_number);
- builder.appendff(" least_length_of_a_page={}\n", table.least_length_of_a_page);
- builder.appendff(" bits_required_for_page_length={}\n", table.bits_required_for_page_length);
- builder.appendff(" least_offset_of_any_content_stream={}\n", table.least_offset_of_any_content_stream);
- builder.appendff(" bits_required_for_content_stream_offsets={}\n", table.bits_required_for_content_stream_offsets);
- builder.appendff(" least_content_stream_length={}\n", table.least_content_stream_length);
- builder.appendff(" bits_required_for_content_stream_length={}\n", table.bits_required_for_content_stream_length);
- builder.appendff(" bits_required_for_number_of_shared_obj_refs={}\n", table.bits_required_for_number_of_shared_obj_refs);
- builder.appendff(" bits_required_for_greatest_shared_obj_identifier={}\n", table.bits_required_for_greatest_shared_obj_identifier);
- builder.appendff(" bits_required_for_fraction_numerator={}\n", table.bits_required_for_fraction_numerator);
- builder.appendff(" shared_object_reference_fraction_denominator={}\n", table.shared_object_reference_fraction_denominator);
- builder.append('}');
- return Formatter<StringView>::format(format_builder, builder.to_string());
- }
-};
-
-template<>
-struct Formatter<PDF::Parser::PageOffsetHintTableEntry> : Formatter<StringView> {
- ErrorOr<void> format(FormatBuilder& format_builder, PDF::Parser::PageOffsetHintTableEntry const& entry)
- {
- StringBuilder builder;
- builder.append("{\n"sv);
- builder.appendff(" objects_in_page_number={}\n", entry.objects_in_page_number);
- builder.appendff(" page_length_number={}\n", entry.page_length_number);
- builder.appendff(" number_of_shared_objects={}\n", entry.number_of_shared_objects);
- builder.append(" shared_object_identifiers=["sv);
- for (auto& identifier : entry.shared_object_identifiers)
- builder.appendff(" {}", identifier);
- builder.append(" ]\n"sv);
- builder.append(" shared_object_location_numerators=["sv);
- for (auto& numerator : entry.shared_object_location_numerators)
- builder.appendff(" {}", numerator);
- builder.append(" ]\n"sv);
- builder.appendff(" page_content_stream_offset_number={}\n", entry.page_content_stream_offset_number);
- builder.appendff(" page_content_stream_length_number={}\n", entry.page_content_stream_length_number);
- builder.append('}');
- return Formatter<StringView>::format(format_builder, builder.to_string());
- }
-};
-
-}
diff --git a/Userland/Libraries/LibPDF/Parser.h b/Userland/Libraries/LibPDF/Parser.h
index 7fcf82478e..26eb6b9079 100644
--- a/Userland/Libraries/LibPDF/Parser.h
+++ b/Userland/Libraries/LibPDF/Parser.h
@@ -16,90 +16,23 @@
namespace PDF {
+template<typename T, typename... Args>
+static NonnullRefPtr<T> make_object(Args... args) requires(IsBaseOf<Object, T>) // Only usable for types derived from Object.
+{
+ return adopt_ref(*new T(forward<Args>(args)...)); // Constructs a T with `new` from the forwarded arguments and adopts it into the returned NonnullRefPtr.
+}
+
class Document;
-class Parser final : public RefCounted<Parser> {
+class Parser {
public:
- enum class LinearizationResult {
- NotLinearized,
- Linearized,
- };
-
static PDFErrorOr<Vector<Operator>> parse_operators(Document*, ReadonlyBytes);
+ explicit Parser(ReadonlyBytes); // Kept explicit, as it was before this change, so ReadonlyBytes never converts to a Parser implicitly.
Parser(Document*, ReadonlyBytes);
- [[nodiscard]] ALWAYS_INLINE RefPtr<DictObject> const& trailer() const { return m_trailer; }
void set_document(WeakPtr<Document> const&);
- // Parses the header and initializes the xref table and trailer
- PDFErrorOr<void> initialize();
-
- PDFErrorOr<Value> parse_object_with_index(u32 index);
-
- // Specialized version of parse_dict which aborts early if the dict being parsed
- // is not a page object
- PDFErrorOr<RefPtr<DictObject>> conditionally_parse_page_tree_node(u32 object_index);
-
-private:
- struct LinearizationDictionary {
- u32 length_of_file { 0 };
- u32 primary_hint_stream_offset { 0 };
- u32 primary_hint_stream_length { 0 };
- u32 overflow_hint_stream_offset { 0 };
- u32 overflow_hint_stream_length { 0 };
- u32 first_page_object_number { 0 };
- u32 offset_of_first_page_end { 0 };
- u16 number_of_pages { 0 };
- u32 offset_of_main_xref_table { 0 };
- u32 first_page { 0 }; // The page to initially open (I think, the spec isn't all that clear here)
- };
-
- struct PageOffsetHintTable {
- u32 least_number_of_objects_in_a_page { 0 };
- u32 location_of_first_page_object { 0 };
- u16 bits_required_for_object_number { 0 };
- u32 least_length_of_a_page { 0 };
- u16 bits_required_for_page_length { 0 };
- u32 least_offset_of_any_content_stream { 0 };
- u16 bits_required_for_content_stream_offsets { 0 };
- u32 least_content_stream_length { 0 };
- u16 bits_required_for_content_stream_length { 0 };
- u16 bits_required_for_number_of_shared_obj_refs { 0 };
- u16 bits_required_for_greatest_shared_obj_identifier { 0 };
- u16 bits_required_for_fraction_numerator { 0 };
- u16 shared_object_reference_fraction_denominator { 0 };
- };
-
- struct PageOffsetHintTableEntry {
- u32 objects_in_page_number { 0 };
- u32 page_length_number { 0 };
- u32 number_of_shared_objects { 0 };
- Vector<u32> shared_object_identifiers {};
- Vector<u32> shared_object_location_numerators {};
- u32 page_content_stream_offset_number { 0 };
- u32 page_content_stream_length_number { 0 };
- };
-
- friend struct AK::Formatter<LinearizationDictionary>;
- friend struct AK::Formatter<PageOffsetHintTable>;
- friend struct AK::Formatter<PageOffsetHintTableEntry>;
-
- explicit Parser(ReadonlyBytes);
-
- PDFErrorOr<void> parse_header();
- PDFErrorOr<LinearizationResult> initialize_linearization_dict();
- PDFErrorOr<void> initialize_linearized_xref_table();
- PDFErrorOr<void> initialize_non_linearized_xref_table();
- PDFErrorOr<void> initialize_hint_tables();
- PDFErrorOr<PageOffsetHintTable> parse_page_offset_hint_table(ReadonlyBytes hint_stream_bytes);
- Vector<PageOffsetHintTableEntry> parse_all_page_offset_hint_table_entries(PageOffsetHintTable const&, ReadonlyBytes hint_stream_bytes);
- PDFErrorOr<NonnullRefPtr<XRefTable>> parse_xref_table();
- PDFErrorOr<NonnullRefPtr<DictObject>> parse_file_trailer();
-
- bool navigate_to_before_eof_marker();
- bool navigate_to_after_startxref();
-
String parse_comment();
PDFErrorOr<Value> parse_value();
@@ -114,9 +47,9 @@ private:
PDFErrorOr<NonnullRefPtr<ArrayObject>> parse_array();
PDFErrorOr<NonnullRefPtr<DictObject>> parse_dict();
PDFErrorOr<NonnullRefPtr<StreamObject>> parse_stream(NonnullRefPtr<DictObject> dict);
-
PDFErrorOr<Vector<Operator>> parse_operators();
+protected:
void push_reference(Reference const& ref) { m_current_reference_stack.append(ref); }
void pop_reference() { m_current_reference_stack.take_last(); }
@@ -130,9 +63,6 @@ private:
Reader m_reader;
WeakPtr<Document> m_document;
- RefPtr<XRefTable> m_xref_table;
- RefPtr<DictObject> m_trailer;
- Optional<LinearizationDictionary> m_linearization_dictionary;
Vector<Reference> m_current_reference_stack;
bool m_disable_encryption { false };
};