diff options
-rw-r--r-- | Userland/Libraries/LibPDF/CMakeLists.txt | 1 | ||||
-rw-r--r-- | Userland/Libraries/LibPDF/Document.cpp | 4 | ||||
-rw-r--r-- | Userland/Libraries/LibPDF/Document.h | 6 | ||||
-rw-r--r-- | Userland/Libraries/LibPDF/DocumentParser.cpp | 631 | ||||
-rw-r--r-- | Userland/Libraries/LibPDF/DocumentParser.h | 96 | ||||
-rw-r--r-- | Userland/Libraries/LibPDF/Parser.cpp | 626 | ||||
-rw-r--r-- | Userland/Libraries/LibPDF/Parser.h | 88 |
7 files changed, 746 insertions, 706 deletions
diff --git a/Userland/Libraries/LibPDF/CMakeLists.txt b/Userland/Libraries/LibPDF/CMakeLists.txt index c405f4fb0e..a5a6519012 100644 --- a/Userland/Libraries/LibPDF/CMakeLists.txt +++ b/Userland/Libraries/LibPDF/CMakeLists.txt @@ -2,6 +2,7 @@ set(SOURCES ColorSpace.cpp CommonNames.cpp Document.cpp + DocumentParser.cpp Encoding.cpp Encryption.cpp Filter.cpp diff --git a/Userland/Libraries/LibPDF/Document.cpp b/Userland/Libraries/LibPDF/Document.cpp index ebe95e1541..cf6faa0e10 100644 --- a/Userland/Libraries/LibPDF/Document.cpp +++ b/Userland/Libraries/LibPDF/Document.cpp @@ -36,7 +36,7 @@ String OutlineItem::to_string(int indent) const PDFErrorOr<NonnullRefPtr<Document>> Document::create(ReadonlyBytes bytes) { - auto parser = adopt_ref(*new Parser({}, bytes)); + auto parser = adopt_ref(*new DocumentParser({}, bytes)); auto document = adopt_ref(*new Document(parser)); TRY(parser->initialize()); @@ -57,7 +57,7 @@ PDFErrorOr<NonnullRefPtr<Document>> Document::create(ReadonlyBytes bytes) return document; } -Document::Document(NonnullRefPtr<Parser> const& parser) +Document::Document(NonnullRefPtr<DocumentParser> const& parser) : m_parser(parser) { m_parser->set_document(this); diff --git a/Userland/Libraries/LibPDF/Document.h b/Userland/Libraries/LibPDF/Document.h index fee4b14ccc..4c6d39331f 100644 --- a/Userland/Libraries/LibPDF/Document.h +++ b/Userland/Libraries/LibPDF/Document.h @@ -11,10 +11,10 @@ #include <AK/RefCounted.h> #include <AK/Weakable.h> #include <LibGfx/Color.h> +#include <LibPDF/DocumentParser.h> #include <LibPDF/Encryption.h> #include <LibPDF/Error.h> #include <LibPDF/ObjectDerivatives.h> -#include <LibPDF/Parser.h> namespace PDF { @@ -133,7 +133,7 @@ public: } private: - explicit Document(NonnullRefPtr<Parser> const& parser); + explicit Document(NonnullRefPtr<DocumentParser> const& parser); // FIXME: Currently, to improve performance, we don't load any pages at Document // construction, rather we just load the page structure and populate @@ -150,7 +150,7 @@ private: PDFErrorOr<Destination> create_destination_from_parameters(NonnullRefPtr<ArrayObject>); - NonnullRefPtr<Parser> m_parser; + NonnullRefPtr<DocumentParser> m_parser; RefPtr<DictObject> m_catalog; RefPtr<DictObject> m_trailer; Vector<u32> m_page_object_indices; diff --git a/Userland/Libraries/LibPDF/DocumentParser.cpp b/Userland/Libraries/LibPDF/DocumentParser.cpp new file mode 100644 index 0000000000..e34302928a --- /dev/null +++ b/Userland/Libraries/LibPDF/DocumentParser.cpp @@ -0,0 +1,631 @@ +/* + * Copyright (c) 2021-2022, Matthew Olsson <mattco@serenityos.org> + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#include <AK/BitStream.h> +#include <AK/MemoryStream.h> +#include <LibPDF/CommonNames.h> +#include <LibPDF/Document.h> +#include <LibPDF/DocumentParser.h> +#include <LibPDF/ObjectDerivatives.h> + +namespace PDF { + +DocumentParser::DocumentParser(Document* document, ReadonlyBytes bytes) + : Parser(document, bytes) +{ +} + +PDFErrorOr<void> DocumentParser::initialize() +{ + TRY(parse_header()); + + auto const linearization_result = TRY(initialize_linearization_dict()); + + if (linearization_result == LinearizationResult::NotLinearized) + return initialize_non_linearized_xref_table(); + + bool is_linearized = m_linearization_dictionary.has_value(); + if (is_linearized) { + // The file may have been linearized at one point, but could have been updated afterwards, + // which means it is no longer a linearized PDF file. + is_linearized = m_linearization_dictionary.value().length_of_file == m_reader.bytes().size(); + + if (!is_linearized) { + // FIXME: The file shouldn't be treated as linearized, yet the xref tables are still + // split. This might take some tweaking to ensure correct behavior, which can be + // implemented later. + TODO(); + } + } + + if (is_linearized) + return initialize_linearized_xref_table(); + + return initialize_non_linearized_xref_table(); +} + +PDFErrorOr<Value> DocumentParser::parse_object_with_index(u32 index) +{ + VERIFY(m_xref_table->has_object(index)); + auto byte_offset = m_xref_table->byte_offset_for_object(index); + m_reader.move_to(byte_offset); + auto indirect_value = TRY(parse_indirect_value()); + VERIFY(indirect_value->index() == index); + return indirect_value->value(); +} + +PDFErrorOr<void> DocumentParser::parse_header() +{ + // FIXME: Do something with the version? + m_reader.set_reading_forwards(); + if (m_reader.remaining() == 0) + return error("Empty PDF document"); + + m_reader.move_to(0); + if (m_reader.remaining() < 8 || !m_reader.matches("%PDF-")) + return error("Not a PDF document"); + + m_reader.move_by(5); + + char major_ver = m_reader.read(); + if (major_ver != '1' && major_ver != '2') + return error(String::formatted("Unknown major version \"{}\"", major_ver)); + + if (m_reader.read() != '.') + return error("Malformed PDF version"); + + char minor_ver = m_reader.read(); + if (minor_ver < '0' || minor_ver > '7') + return error(String::formatted("Unknown minor version \"{}\"", minor_ver)); + + m_reader.consume_eol(); + + // Parse optional high-byte comment, which signifies a binary file + // FIXME: Do something with this? + auto comment = parse_comment(); + if (!comment.is_empty()) { + auto binary = comment.length() >= 4; + if (binary) { + for (size_t i = 0; i < comment.length() && binary; i++) + binary = static_cast<u8>(comment[i]) > 128; + } + } + + return {}; +} + +PDFErrorOr<DocumentParser::LinearizationResult> DocumentParser::initialize_linearization_dict() +{ + // parse_header() is called immediately before this, so we are at the right location + auto indirect_value = Value(*TRY(parse_indirect_value())); + auto dict_value = TRY(m_document->resolve(indirect_value)); + if (!dict_value.has<NonnullRefPtr<Object>>()) + return error("Expected linearization object to be a dictionary"); + + auto dict_object = dict_value.get<NonnullRefPtr<Object>>(); + if (!dict_object->is<DictObject>()) + return LinearizationResult::NotLinearized; + + auto dict = dict_object->cast<DictObject>(); + + if (!dict->contains(CommonNames::Linearized)) + return LinearizationResult::NotLinearized; + + if (!dict->contains(CommonNames::L, CommonNames::H, CommonNames::O, CommonNames::E, CommonNames::N, CommonNames::T)) + return error("Malformed linearization dictionary"); + + auto length_of_file = dict->get_value(CommonNames::L); + auto hint_table = dict->get_value(CommonNames::H); + auto first_page_object_number = dict->get_value(CommonNames::O); + auto offset_of_first_page_end = dict->get_value(CommonNames::E); + auto number_of_pages = dict->get_value(CommonNames::N); + auto offset_of_main_xref_table = dict->get_value(CommonNames::T); + auto first_page = dict->get(CommonNames::P).value_or({}); + + // Validation + if (!length_of_file.has_u32() + || !hint_table.has<NonnullRefPtr<Object>>() + || !first_page_object_number.has_u32() + || !number_of_pages.has_u16() + || !offset_of_main_xref_table.has_u32() + || (!first_page.has<Empty>() && !first_page.has_u32())) { + return error("Malformed linearization dictionary parameters"); + } + + auto hint_table_array = hint_table.get<NonnullRefPtr<Object>>()->cast<ArrayObject>(); + auto hint_table_size = hint_table_array->size(); + if (hint_table_size != 2 && hint_table_size != 4) + return error("Expected hint table to be of length 2 or 4"); + + auto primary_hint_stream_offset = hint_table_array->at(0); + auto primary_hint_stream_length = hint_table_array->at(1); + Value overflow_hint_stream_offset; + Value overflow_hint_stream_length; + + if (hint_table_size == 4) { + overflow_hint_stream_offset = hint_table_array->at(2); + overflow_hint_stream_length = hint_table_array->at(3); + } + + if (!primary_hint_stream_offset.has_u32() + || !primary_hint_stream_length.has_u32() + || (!overflow_hint_stream_offset.has<Empty>() && !overflow_hint_stream_offset.has_u32()) + || (!overflow_hint_stream_length.has<Empty>() && !overflow_hint_stream_length.has_u32())) { + return error("Malformed hint stream"); + } + + m_linearization_dictionary = LinearizationDictionary { + length_of_file.get_u32(), + primary_hint_stream_offset.get_u32(), + primary_hint_stream_length.get_u32(), + overflow_hint_stream_offset.has<Empty>() ? NumericLimits<u32>::max() : overflow_hint_stream_offset.get_u32(), + overflow_hint_stream_length.has<Empty>() ? NumericLimits<u32>::max() : overflow_hint_stream_length.get_u32(), + first_page_object_number.get_u32(), + offset_of_first_page_end.get_u32(), + number_of_pages.get_u16(), + offset_of_main_xref_table.get_u32(), + first_page.has<Empty>() ? NumericLimits<u32>::max() : first_page.get_u32(), + }; + + return LinearizationResult::Linearized; +} + +PDFErrorOr<void> DocumentParser::initialize_linearized_xref_table() +{ + // The linearization parameter dictionary has just been parsed, and the xref table + // comes immediately after it. We are in the correct spot. + m_xref_table = TRY(parse_xref_table()); + m_trailer = TRY(parse_file_trailer()); + + // Also parse the main xref table and merge into the first-page xref table. Note + // that we don't use the main xref table offset from the linearization dict because + // for some reason, it specified the offset of the whitespace after the object + // index start and length? So it's much easier to do it this way. + auto main_xref_table_offset = m_trailer->get_value(CommonNames::Prev).to_int(); + m_reader.move_to(main_xref_table_offset); + auto main_xref_table = TRY(parse_xref_table()); + TRY(m_xref_table->merge(move(*main_xref_table))); + return {}; +} + +PDFErrorOr<void> DocumentParser::initialize_hint_tables() +{ + auto linearization_dict = m_linearization_dictionary.value(); + auto primary_offset = linearization_dict.primary_hint_stream_offset; + auto overflow_offset = linearization_dict.overflow_hint_stream_offset; + + auto parse_hint_table = [&](size_t offset) -> RefPtr<StreamObject> { + m_reader.move_to(offset); + auto stream_indirect_value = parse_indirect_value(); + if (stream_indirect_value.is_error()) + return {}; + + auto stream_value = stream_indirect_value.value()->value(); + if (!stream_value.has<NonnullRefPtr<Object>>()) + return {}; + + auto stream_object = stream_value.get<NonnullRefPtr<Object>>(); + if (!stream_object->is<StreamObject>()) + return {}; + + return stream_object->cast<StreamObject>(); + }; + + auto primary_hint_stream = parse_hint_table(primary_offset); + if (!primary_hint_stream) + return error("Invalid primary hint stream"); + + RefPtr<StreamObject> overflow_hint_stream; + if (overflow_offset != NumericLimits<u32>::max()) + overflow_hint_stream = parse_hint_table(overflow_offset); + + ByteBuffer possible_merged_stream_buffer; + ReadonlyBytes hint_stream_bytes; + + if (overflow_hint_stream) { + auto primary_size = primary_hint_stream->bytes().size(); + auto overflow_size = overflow_hint_stream->bytes().size(); + auto total_size = primary_size + overflow_size; + + auto buffer_result = ByteBuffer::create_uninitialized(total_size); + if (buffer_result.is_error()) + return Error { Error::Type::Internal, "Failed to allocate hint stream buffer" }; + possible_merged_stream_buffer = buffer_result.release_value(); + MUST(possible_merged_stream_buffer.try_append(primary_hint_stream->bytes())); + MUST(possible_merged_stream_buffer.try_append(overflow_hint_stream->bytes())); + hint_stream_bytes = possible_merged_stream_buffer.bytes(); + } else { + hint_stream_bytes = primary_hint_stream->bytes(); + } + + auto hint_table = TRY(parse_page_offset_hint_table(hint_stream_bytes)); + auto hint_table_entries = parse_all_page_offset_hint_table_entries(hint_table, hint_stream_bytes); + + // FIXME: Do something with the hint tables + return {}; +} + +PDFErrorOr<void> DocumentParser::initialize_non_linearized_xref_table() +{ + m_reader.move_to(m_reader.bytes().size() - 1); + if (!navigate_to_before_eof_marker()) + return error("No EOF marker"); + if (!navigate_to_after_startxref()) + return error("No xref"); + + m_reader.set_reading_forwards(); + auto xref_offset_value = parse_number(); + if (xref_offset_value.is_error() || !xref_offset_value.value().has<int>()) + return error("Invalid xref offset"); + auto xref_offset = xref_offset_value.value().get<int>(); + + m_reader.move_to(xref_offset); + m_xref_table = TRY(parse_xref_table()); + m_trailer = TRY(parse_file_trailer()); + return {}; +} + +PDFErrorOr<NonnullRefPtr<XRefTable>> DocumentParser::parse_xref_table() +{ + if (!m_reader.matches("xref")) + return error("Expected \"xref\""); + m_reader.move_by(4); + if (!m_reader.consume_eol()) + return error("Expected newline after \"xref\""); + + auto table = adopt_ref(*new XRefTable()); + + do { + if (m_reader.matches("trailer")) + return table; + + Vector<XRefEntry> entries; + + auto starting_index_value = TRY(parse_number()); + auto starting_index = starting_index_value.get<int>(); + auto object_count_value = TRY(parse_number()); + auto object_count = object_count_value.get<int>(); + + for (int i = 0; i < object_count; i++) { + auto offset_string = String(m_reader.bytes().slice(m_reader.offset(), 10)); + m_reader.move_by(10); + if (!m_reader.consume(' ')) + return error("Malformed xref entry"); + + auto generation_string = String(m_reader.bytes().slice(m_reader.offset(), 5)); + m_reader.move_by(5); + if (!m_reader.consume(' ')) + return error("Malformed xref entry"); + + auto letter = m_reader.read(); + if (letter != 'n' && letter != 'f') + return error("Malformed xref entry"); + + // The line ending sequence can be one of the following: + // SP CR, SP LF, or CR LF + if (m_reader.matches(' ')) { + m_reader.consume(); + auto ch = m_reader.consume(); + if (ch != '\r' && ch != '\n') + return error("Malformed xref entry"); + } else { + if (!m_reader.matches("\r\n")) + return error("Malformed xref entry"); + m_reader.move_by(2); + } + + auto offset = strtol(offset_string.characters(), nullptr, 10); + auto generation = strtol(generation_string.characters(), nullptr, 10); + + entries.append({ offset, static_cast<u16>(generation), letter == 'n' }); + } + + table->add_section({ starting_index, object_count, entries }); + } while (m_reader.matches_number()); + + return table; +} + +PDFErrorOr<NonnullRefPtr<DictObject>> DocumentParser::parse_file_trailer() +{ + while (m_reader.matches_eol()) + m_reader.consume_eol(); + + if (!m_reader.matches("trailer")) + return error("Expected \"trailer\" keyword"); + m_reader.move_by(7); + m_reader.consume_whitespace(); + auto dict = TRY(parse_dict()); + + if (!m_reader.matches("startxref")) + return error("Expected \"startxref\""); + m_reader.move_by(9); + m_reader.consume_whitespace(); + + m_reader.move_until([&](auto) { return m_reader.matches_eol(); }); + VERIFY(m_reader.consume_eol()); + if (!m_reader.matches("%%EOF")) + return error("Expected \"%%EOF\""); + + m_reader.move_by(5); + m_reader.consume_whitespace(); + return dict; +} + +PDFErrorOr<DocumentParser::PageOffsetHintTable> DocumentParser::parse_page_offset_hint_table(ReadonlyBytes hint_stream_bytes) +{ + if (hint_stream_bytes.size() < sizeof(PageOffsetHintTable)) + return error("Hint stream is too small"); + + size_t offset = 0; + + auto read_u32 = [&] { + u32 data = reinterpret_cast<const u32*>(hint_stream_bytes.data() + offset)[0]; + offset += 4; + return AK::convert_between_host_and_big_endian(data); + }; + + auto read_u16 = [&] { + u16 data = reinterpret_cast<const u16*>(hint_stream_bytes.data() + offset)[0]; + offset += 2; + return AK::convert_between_host_and_big_endian(data); + }; + + PageOffsetHintTable hint_table { + read_u32(), + read_u32(), + read_u16(), + read_u32(), + read_u16(), + read_u32(), + read_u16(), + read_u32(), + read_u16(), + read_u16(), + read_u16(), + read_u16(), + read_u16(), + }; + + // Verify that all of the bits_required_for_xyz fields are <= 32, since all of the numeric + // fields in PageOffsetHintTableEntry are u32 + VERIFY(hint_table.bits_required_for_object_number <= 32); + VERIFY(hint_table.bits_required_for_page_length <= 32); + VERIFY(hint_table.bits_required_for_content_stream_offsets <= 32); + VERIFY(hint_table.bits_required_for_content_stream_length <= 32); + VERIFY(hint_table.bits_required_for_number_of_shared_obj_refs <= 32); + VERIFY(hint_table.bits_required_for_greatest_shared_obj_identifier <= 32); + VERIFY(hint_table.bits_required_for_fraction_numerator <= 32); + + return hint_table; +} + +Vector<DocumentParser::PageOffsetHintTableEntry> DocumentParser::parse_all_page_offset_hint_table_entries(PageOffsetHintTable const& hint_table, ReadonlyBytes hint_stream_bytes) +{ + InputMemoryStream input_stream(hint_stream_bytes); + input_stream.seek(sizeof(PageOffsetHintTable)); + + InputBitStream bit_stream(input_stream); + + auto number_of_pages = m_linearization_dictionary.value().number_of_pages; + Vector<PageOffsetHintTableEntry> entries; + for (size_t i = 0; i < number_of_pages; i++) + entries.append(PageOffsetHintTableEntry {}); + + auto bits_required_for_object_number = hint_table.bits_required_for_object_number; + auto bits_required_for_page_length = hint_table.bits_required_for_page_length; + auto bits_required_for_content_stream_offsets = hint_table.bits_required_for_content_stream_offsets; + auto bits_required_for_content_stream_length = hint_table.bits_required_for_content_stream_length; + auto bits_required_for_number_of_shared_obj_refs = hint_table.bits_required_for_number_of_shared_obj_refs; + auto bits_required_for_greatest_shared_obj_identifier = hint_table.bits_required_for_greatest_shared_obj_identifier; + auto bits_required_for_fraction_numerator = hint_table.bits_required_for_fraction_numerator; + + auto parse_int_entry = [&](u32 PageOffsetHintTableEntry::*field, u32 bit_size) { + if (bit_size <= 0) + return; + + for (int i = 0; i < number_of_pages; i++) { + auto& entry = entries[i]; + entry.*field = bit_stream.read_bits(bit_size); + } + }; + + auto parse_vector_entry = [&](Vector<u32> PageOffsetHintTableEntry::*field, u32 bit_size) { + if (bit_size <= 0) + return; + + for (int page = 1; page < number_of_pages; page++) { + auto number_of_shared_objects = entries[page].number_of_shared_objects; + Vector<u32> items; + items.ensure_capacity(number_of_shared_objects); + + for (size_t i = 0; i < number_of_shared_objects; i++) + items.unchecked_append(bit_stream.read_bits(bit_size)); + + entries[page].*field = move(items); + } + }; + + parse_int_entry(&PageOffsetHintTableEntry::objects_in_page_number, bits_required_for_object_number); + parse_int_entry(&PageOffsetHintTableEntry::page_length_number, bits_required_for_page_length); + parse_int_entry(&PageOffsetHintTableEntry::number_of_shared_objects, bits_required_for_number_of_shared_obj_refs); + parse_vector_entry(&PageOffsetHintTableEntry::shared_object_identifiers, bits_required_for_greatest_shared_obj_identifier); + parse_vector_entry(&PageOffsetHintTableEntry::shared_object_location_numerators, bits_required_for_fraction_numerator); + parse_int_entry(&PageOffsetHintTableEntry::page_content_stream_offset_number, bits_required_for_content_stream_offsets); + parse_int_entry(&PageOffsetHintTableEntry::page_content_stream_length_number, bits_required_for_content_stream_length); + + return entries; +} + +bool DocumentParser::navigate_to_before_eof_marker() +{ + m_reader.set_reading_backwards(); + + while (!m_reader.done()) { + m_reader.move_until([&](auto) { return m_reader.matches_eol(); }); + if (m_reader.done()) + return false; + + m_reader.consume_eol(); + if (!m_reader.matches("%%EOF")) + continue; + + m_reader.move_by(5); + if (!m_reader.matches_eol()) + continue; + m_reader.consume_eol(); + return true; + } + + return false; +} + +bool DocumentParser::navigate_to_after_startxref() +{ + m_reader.set_reading_backwards(); + + while (!m_reader.done()) { + m_reader.move_until([&](auto) { return m_reader.matches_eol(); }); + auto offset = m_reader.offset() + 1; + + m_reader.consume_eol(); + if (!m_reader.matches("startxref")) + continue; + + m_reader.move_by(9); + if (!m_reader.matches_eol()) + continue; + + m_reader.move_to(offset); + return true; + } + + return false; +} + +PDFErrorOr<RefPtr<DictObject>> DocumentParser::conditionally_parse_page_tree_node(u32 object_index) +{ + VERIFY(m_xref_table->has_object(object_index)); + auto byte_offset = m_xref_table->byte_offset_for_object(object_index); + + m_reader.move_to(byte_offset); + TRY(parse_number()); + TRY(parse_number()); + if (!m_reader.matches("obj")) + return error(String::formatted("Invalid page tree offset {}", object_index)); + + m_reader.move_by(3); + m_reader.consume_whitespace(); + + VERIFY(m_reader.consume('<') && m_reader.consume('<')); + + m_reader.consume_whitespace(); + HashMap<FlyString, Value> map; + + while (true) { + if (m_reader.matches(">>")) + break; + auto name = TRY(parse_name()); + auto name_string = name->name(); + if (!name_string.is_one_of(CommonNames::Type, CommonNames::Parent, CommonNames::Kids, CommonNames::Count)) { + // This is a page, not a page tree node + return RefPtr<DictObject> {}; + } + + auto value = TRY(parse_value()); + if (name_string == CommonNames::Type) { + if (!value.has<NonnullRefPtr<Object>>()) + return RefPtr<DictObject> {}; + auto type_object = value.get<NonnullRefPtr<Object>>(); + if (!type_object->is<NameObject>()) + return RefPtr<DictObject> {}; + auto type_name = type_object->cast<NameObject>(); + if (type_name->name() != CommonNames::Pages) + return RefPtr<DictObject> {}; + } + map.set(name->name(), value); + } + + VERIFY(m_reader.consume('>') && m_reader.consume('>')); + m_reader.consume_whitespace(); + + return make_object<DictObject>(map); +} + +} + +namespace AK { + +template<> +struct Formatter<PDF::DocumentParser::LinearizationDictionary> : Formatter<StringView> { + ErrorOr<void> format(FormatBuilder& format_builder, PDF::DocumentParser::LinearizationDictionary const& dict) + { + StringBuilder builder; + builder.append("{\n"sv); + builder.appendff(" length_of_file={}\n", dict.length_of_file); + builder.appendff(" primary_hint_stream_offset={}\n", dict.primary_hint_stream_offset); + builder.appendff(" primary_hint_stream_length={}\n", dict.primary_hint_stream_length); + builder.appendff(" overflow_hint_stream_offset={}\n", dict.overflow_hint_stream_offset); + builder.appendff(" overflow_hint_stream_length={}\n", dict.overflow_hint_stream_length); + builder.appendff(" first_page_object_number={}\n", dict.first_page_object_number); + builder.appendff(" offset_of_first_page_end={}\n", dict.offset_of_first_page_end); + builder.appendff(" number_of_pages={}\n", dict.number_of_pages); + builder.appendff(" offset_of_main_xref_table={}\n", dict.offset_of_main_xref_table); + builder.appendff(" first_page={}\n", dict.first_page); + builder.append('}'); + return Formatter<StringView>::format(format_builder, builder.to_string()); + } +}; + +template<> +struct Formatter<PDF::DocumentParser::PageOffsetHintTable> : Formatter<StringView> { + ErrorOr<void> format(FormatBuilder& format_builder, PDF::DocumentParser::PageOffsetHintTable const& table) + { + StringBuilder builder; + builder.append("{\n"sv); + builder.appendff(" least_number_of_objects_in_a_page={}\n", table.least_number_of_objects_in_a_page); + builder.appendff(" location_of_first_page_object={}\n", table.location_of_first_page_object); + builder.appendff(" bits_required_for_object_number={}\n", table.bits_required_for_object_number); + builder.appendff(" least_length_of_a_page={}\n", table.least_length_of_a_page); + builder.appendff(" bits_required_for_page_length={}\n", table.bits_required_for_page_length); + builder.appendff(" least_offset_of_any_content_stream={}\n", table.least_offset_of_any_content_stream); + builder.appendff(" bits_required_for_content_stream_offsets={}\n", table.bits_required_for_content_stream_offsets); + builder.appendff(" least_content_stream_length={}\n", table.least_content_stream_length); + builder.appendff(" bits_required_for_content_stream_length={}\n", table.bits_required_for_content_stream_length); + builder.appendff(" bits_required_for_number_of_shared_obj_refs={}\n", table.bits_required_for_number_of_shared_obj_refs); + builder.appendff(" bits_required_for_greatest_shared_obj_identifier={}\n", table.bits_required_for_greatest_shared_obj_identifier); + builder.appendff(" bits_required_for_fraction_numerator={}\n", table.bits_required_for_fraction_numerator); + builder.appendff(" shared_object_reference_fraction_denominator={}\n", table.shared_object_reference_fraction_denominator); + builder.append('}'); + return Formatter<StringView>::format(format_builder, builder.to_string()); + } +}; + +template<> +struct Formatter<PDF::DocumentParser::PageOffsetHintTableEntry> : Formatter<StringView> { + ErrorOr<void> format(FormatBuilder& format_builder, PDF::DocumentParser::PageOffsetHintTableEntry const& entry) + { + StringBuilder builder; + builder.append("{\n"sv); + builder.appendff(" objects_in_page_number={}\n", entry.objects_in_page_number); + builder.appendff(" page_length_number={}\n", entry.page_length_number); + builder.appendff(" number_of_shared_objects={}\n", entry.number_of_shared_objects); + builder.append(" shared_object_identifiers=["sv); + for (auto& identifier : entry.shared_object_identifiers) + builder.appendff(" {}", identifier); + builder.append(" ]\n"sv); + builder.append(" shared_object_location_numerators=["sv); + for (auto& numerator : entry.shared_object_location_numerators) + builder.appendff(" {}", numerator); + builder.append(" ]\n"sv); + builder.appendff(" page_content_stream_offset_number={}\n", entry.page_content_stream_offset_number); + builder.appendff(" page_content_stream_length_number={}\n", entry.page_content_stream_length_number); + builder.append('}'); + return Formatter<StringView>::format(format_builder, builder.to_string()); + } +}; + +} diff --git a/Userland/Libraries/LibPDF/DocumentParser.h b/Userland/Libraries/LibPDF/DocumentParser.h new file mode 100644 index 0000000000..fef5f40ad4 --- /dev/null +++ b/Userland/Libraries/LibPDF/DocumentParser.h @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2021-2022, Matthew Olsson <mattco@serenityos.org> + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#pragma once + +#include <LibPDF/Parser.h> + +namespace PDF { + +class DocumentParser final : public RefCounted<DocumentParser> + , public Parser { +public: + DocumentParser(Document*, ReadonlyBytes); + + enum class LinearizationResult { + NotLinearized, + Linearized, + }; + + [[nodiscard]] ALWAYS_INLINE RefPtr<DictObject> const& trailer() const { return m_trailer; } + + // Parses the header and initializes the xref table and trailer + PDFErrorOr<void> initialize(); + + PDFErrorOr<Value> parse_object_with_index(u32 index); + + // Specialized version of parse_dict which aborts early if the dict being parsed + // is not a page object + PDFErrorOr<RefPtr<DictObject>> conditionally_parse_page_tree_node(u32 object_index); + +private: + struct LinearizationDictionary { + u32 length_of_file { 0 }; + u32 primary_hint_stream_offset { 0 }; + u32 primary_hint_stream_length { 0 }; + u32 overflow_hint_stream_offset { 0 }; + u32 overflow_hint_stream_length { 0 }; + u32 first_page_object_number { 0 }; + u32 offset_of_first_page_end { 0 }; + u16 number_of_pages { 0 }; + u32 offset_of_main_xref_table { 0 }; + u32 first_page { 0 }; // The page to initially open (I think, the spec isn't all that clear here) + }; + + struct PageOffsetHintTable { + u32 least_number_of_objects_in_a_page { 0 }; + u32 location_of_first_page_object { 0 }; + u16 bits_required_for_object_number { 0 }; + u32 least_length_of_a_page { 0 }; + u16 bits_required_for_page_length { 0 }; + u32 least_offset_of_any_content_stream { 0 }; + u16 bits_required_for_content_stream_offsets { 0 }; + u32 least_content_stream_length { 0 }; + u16 bits_required_for_content_stream_length { 0 }; + u16 bits_required_for_number_of_shared_obj_refs { 0 }; + u16 bits_required_for_greatest_shared_obj_identifier { 0 }; + u16 bits_required_for_fraction_numerator { 0 }; + u16 shared_object_reference_fraction_denominator { 0 }; + }; + + struct PageOffsetHintTableEntry { + u32 objects_in_page_number { 0 }; + u32 page_length_number { 0 }; + u32 number_of_shared_objects { 0 }; + Vector<u32> shared_object_identifiers {}; + Vector<u32> shared_object_location_numerators {}; + u32 page_content_stream_offset_number { 0 }; + u32 page_content_stream_length_number { 0 }; + }; + + friend struct AK::Formatter<LinearizationDictionary>; + friend struct AK::Formatter<PageOffsetHintTable>; + friend struct AK::Formatter<PageOffsetHintTableEntry>; + + PDFErrorOr<void> parse_header(); + PDFErrorOr<LinearizationResult> initialize_linearization_dict(); + PDFErrorOr<void> initialize_linearized_xref_table(); + PDFErrorOr<void> initialize_non_linearized_xref_table(); + PDFErrorOr<void> initialize_hint_tables(); + PDFErrorOr<PageOffsetHintTable> parse_page_offset_hint_table(ReadonlyBytes hint_stream_bytes); + Vector<PageOffsetHintTableEntry> parse_all_page_offset_hint_table_entries(PageOffsetHintTable const&, ReadonlyBytes hint_stream_bytes); + PDFErrorOr<NonnullRefPtr<XRefTable>> parse_xref_table(); + PDFErrorOr<NonnullRefPtr<DictObject>> parse_file_trailer(); + + bool navigate_to_before_eof_marker(); + bool navigate_to_after_startxref(); + + RefPtr<XRefTable> m_xref_table; + RefPtr<DictObject> m_trailer; + Optional<LinearizationDictionary> m_linearization_dictionary; +}; + +} diff --git a/Userland/Libraries/LibPDF/Parser.cpp b/Userland/Libraries/LibPDF/Parser.cpp index 36b018d89c..4df9d59f6e 100644 --- a/Userland/Libraries/LibPDF/Parser.cpp +++ b/Userland/Libraries/LibPDF/Parser.cpp @@ -4,8 +4,6 @@ * SPDX-License-Identifier: BSD-2-Clause */ -#include <AK/BitStream.h> -#include <AK/MemoryStream.h> #include <AK/ScopeGuard.h> #include <LibPDF/CommonNames.h> #include <LibPDF/Document.h> @@ -16,17 +14,11 @@ namespace PDF { -template<typename T, typename... Args> -static NonnullRefPtr<T> make_object(Args... args) requires(IsBaseOf<Object, T>) -{ - return adopt_ref(*new T(forward<Args>(args)...)); -} - PDFErrorOr<Vector<Operator>> Parser::parse_operators(Document* document, ReadonlyBytes bytes) { - auto parser = adopt_ref(*new Parser(document, bytes)); - parser->m_disable_encryption = true; - return parser->parse_operators(); + Parser parser(document, bytes); + parser.m_disable_encryption = true; + return parser.parse_operators(); } Parser::Parser(Document* document, ReadonlyBytes bytes) @@ -45,494 +37,6 @@ void Parser::set_document(WeakPtr<Document> const& document) m_document = document; } -PDFErrorOr<void> Parser::initialize() -{ - TRY(parse_header()); - - auto const linearization_result = TRY(initialize_linearization_dict()); - - if (linearization_result == LinearizationResult::NotLinearized) - return initialize_non_linearized_xref_table(); - - bool is_linearized = m_linearization_dictionary.has_value(); - if (is_linearized) { - // The file may have been linearized at one point, but could have been updated afterwards, - // which means it is no longer a linearized PDF file. - is_linearized = m_linearization_dictionary.value().length_of_file == m_reader.bytes().size(); - - if (!is_linearized) { - // FIXME: The file shouldn't be treated as linearized, yet the xref tables are still - // split. This might take some tweaking to ensure correct behavior, which can be - // implemented later. - TODO(); - } - } - - if (is_linearized) - return initialize_linearized_xref_table(); - - return initialize_non_linearized_xref_table(); -} - -PDFErrorOr<Value> Parser::parse_object_with_index(u32 index) -{ - VERIFY(m_xref_table->has_object(index)); - auto byte_offset = m_xref_table->byte_offset_for_object(index); - m_reader.move_to(byte_offset); - auto indirect_value = TRY(parse_indirect_value()); - VERIFY(indirect_value->index() == index); - return indirect_value->value(); -} - -PDFErrorOr<void> Parser::parse_header() -{ - // FIXME: Do something with the version? - m_reader.set_reading_forwards(); - if (m_reader.remaining() == 0) - return error("Empty PDF document"); - - m_reader.move_to(0); - if (m_reader.remaining() < 8 || !m_reader.matches("%PDF-")) - return error("Not a PDF document"); - - m_reader.move_by(5); - - char major_ver = m_reader.read(); - if (major_ver != '1' && major_ver != '2') - return error(String::formatted("Unknown major version \"{}\"", major_ver)); - - if (m_reader.read() != '.') - return error("Malformed PDF version"); - - char minor_ver = m_reader.read(); - if (minor_ver < '0' || minor_ver > '7') - return error(String::formatted("Unknown minor version \"{}\"", minor_ver)); - - m_reader.consume_eol(); - - // Parse optional high-byte comment, which signifies a binary file - // FIXME: Do something with this? - auto comment = parse_comment(); - if (!comment.is_empty()) { - auto binary = comment.length() >= 4; - if (binary) { - for (size_t i = 0; i < comment.length() && binary; i++) - binary = static_cast<u8>(comment[i]) > 128; - } - } - - return {}; -} - -PDFErrorOr<Parser::LinearizationResult> Parser::initialize_linearization_dict() -{ - // parse_header() is called immediately before this, so we are at the right location - auto indirect_value = Value(*TRY(parse_indirect_value())); - auto dict_value = TRY(m_document->resolve(indirect_value)); - if (!dict_value.has<NonnullRefPtr<Object>>()) - return error("Expected linearization object to be a dictionary"); - - auto dict_object = dict_value.get<NonnullRefPtr<Object>>(); - if (!dict_object->is<DictObject>()) - return LinearizationResult::NotLinearized; - - auto dict = dict_object->cast<DictObject>(); - - if (!dict->contains(CommonNames::Linearized)) - return LinearizationResult::NotLinearized; - - if (!dict->contains(CommonNames::L, CommonNames::H, CommonNames::O, CommonNames::E, CommonNames::N, CommonNames::T)) - return error("Malformed linearization dictionary"); - - auto length_of_file = dict->get_value(CommonNames::L); - auto hint_table = dict->get_value(CommonNames::H); - auto first_page_object_number = dict->get_value(CommonNames::O); - auto offset_of_first_page_end = dict->get_value(CommonNames::E); - auto number_of_pages = dict->get_value(CommonNames::N); - auto offset_of_main_xref_table = dict->get_value(CommonNames::T); - auto first_page = dict->get(CommonNames::P).value_or({}); - - // Validation - if (!length_of_file.has_u32() - || !hint_table.has<NonnullRefPtr<Object>>() - || !first_page_object_number.has_u32() - || !number_of_pages.has_u16() - || !offset_of_main_xref_table.has_u32() - || (!first_page.has<Empty>() && !first_page.has_u32())) { - return error("Malformed linearization dictionary parameters"); - } - - auto hint_table_array = hint_table.get<NonnullRefPtr<Object>>()->cast<ArrayObject>(); - auto hint_table_size = hint_table_array->size(); - if (hint_table_size != 2 && hint_table_size != 4) - return error("Expected hint table to be of length 2 or 4"); - - auto primary_hint_stream_offset = hint_table_array->at(0); - auto primary_hint_stream_length = hint_table_array->at(1); - Value overflow_hint_stream_offset; - Value overflow_hint_stream_length; - - if (hint_table_size == 4) { - overflow_hint_stream_offset = hint_table_array->at(2); - overflow_hint_stream_length = hint_table_array->at(3); - } - - if (!primary_hint_stream_offset.has_u32() - || !primary_hint_stream_length.has_u32() - || (!overflow_hint_stream_offset.has<Empty>() && !overflow_hint_stream_offset.has_u32()) - || (!overflow_hint_stream_length.has<Empty>() && !overflow_hint_stream_length.has_u32())) { - return error("Malformed hint stream"); - } - - m_linearization_dictionary = LinearizationDictionary { - length_of_file.get_u32(), - primary_hint_stream_offset.get_u32(), - primary_hint_stream_length.get_u32(), - overflow_hint_stream_offset.has<Empty>() ? NumericLimits<u32>::max() : overflow_hint_stream_offset.get_u32(), - overflow_hint_stream_length.has<Empty>() ? NumericLimits<u32>::max() : overflow_hint_stream_length.get_u32(), - first_page_object_number.get_u32(), - offset_of_first_page_end.get_u32(), - number_of_pages.get_u16(), - offset_of_main_xref_table.get_u32(), - first_page.has<Empty>() ? NumericLimits<u32>::max() : first_page.get_u32(), - }; - - return LinearizationResult::Linearized; -} - -PDFErrorOr<void> Parser::initialize_linearized_xref_table() -{ - // The linearization parameter dictionary has just been parsed, and the xref table - // comes immediately after it. We are in the correct spot. - m_xref_table = TRY(parse_xref_table()); - m_trailer = TRY(parse_file_trailer()); - - // Also parse the main xref table and merge into the first-page xref table. Note - // that we don't use the main xref table offset from the linearization dict because - // for some reason, it specified the offset of the whitespace after the object - // index start and length? So it's much easier to do it this way. - auto main_xref_table_offset = m_trailer->get_value(CommonNames::Prev).to_int(); - m_reader.move_to(main_xref_table_offset); - auto main_xref_table = TRY(parse_xref_table()); - TRY(m_xref_table->merge(move(*main_xref_table))); - return {}; -} - -PDFErrorOr<void> Parser::initialize_hint_tables() -{ - auto linearization_dict = m_linearization_dictionary.value(); - auto primary_offset = linearization_dict.primary_hint_stream_offset; - auto overflow_offset = linearization_dict.overflow_hint_stream_offset; - - auto parse_hint_table = [&](size_t offset) -> RefPtr<StreamObject> { - m_reader.move_to(offset); - auto stream_indirect_value = parse_indirect_value(); - if (stream_indirect_value.is_error()) - return {}; - - auto stream_value = stream_indirect_value.value()->value(); - if (!stream_value.has<NonnullRefPtr<Object>>()) - return {}; - - auto stream_object = stream_value.get<NonnullRefPtr<Object>>(); - if (!stream_object->is<StreamObject>()) - return {}; - - return stream_object->cast<StreamObject>(); - }; - - auto primary_hint_stream = parse_hint_table(primary_offset); - if (!primary_hint_stream) - return error("Invalid primary hint stream"); - - RefPtr<StreamObject> overflow_hint_stream; - if (overflow_offset != NumericLimits<u32>::max()) - overflow_hint_stream = parse_hint_table(overflow_offset); - - ByteBuffer possible_merged_stream_buffer; - ReadonlyBytes hint_stream_bytes; - - if (overflow_hint_stream) { - auto primary_size = primary_hint_stream->bytes().size(); - auto overflow_size = overflow_hint_stream->bytes().size(); - auto total_size = primary_size + overflow_size; - - auto buffer_result = ByteBuffer::create_uninitialized(total_size); - if (buffer_result.is_error()) - return Error { Error::Type::Internal, "Failed to allocate hint stream buffer" }; - possible_merged_stream_buffer = buffer_result.release_value(); - MUST(possible_merged_stream_buffer.try_append(primary_hint_stream->bytes())); - MUST(possible_merged_stream_buffer.try_append(overflow_hint_stream->bytes())); - hint_stream_bytes = possible_merged_stream_buffer.bytes(); - } else { - hint_stream_bytes = primary_hint_stream->bytes(); - } - - auto hint_table = TRY(parse_page_offset_hint_table(hint_stream_bytes)); - auto hint_table_entries = parse_all_page_offset_hint_table_entries(hint_table, hint_stream_bytes); - - // FIXME: Do something with the hint tables - return {}; -} - -PDFErrorOr<void> Parser::initialize_non_linearized_xref_table() -{ - m_reader.move_to(m_reader.bytes().size() - 1); - if (!navigate_to_before_eof_marker()) - return error("No EOF marker"); - if (!navigate_to_after_startxref()) - return error("No xref"); - - m_reader.set_reading_forwards(); - auto xref_offset_value = parse_number(); - if (xref_offset_value.is_error() || !xref_offset_value.value().has<int>()) - return error("Invalid xref offset"); - auto xref_offset = xref_offset_value.value().get<int>(); - - m_reader.move_to(xref_offset); - m_xref_table = TRY(parse_xref_table()); - m_trailer = TRY(parse_file_trailer()); - return {}; -} - -PDFErrorOr<NonnullRefPtr<XRefTable>> Parser::parse_xref_table() -{ - if (!m_reader.matches("xref")) - return error("Expected \"xref\""); - m_reader.move_by(4); - if (!m_reader.consume_eol()) - return error("Expected newline after \"xref\""); - - auto table = adopt_ref(*new XRefTable()); - - do { - if (m_reader.matches("trailer")) - return table; - - Vector<XRefEntry> entries; - - auto starting_index_value = TRY(parse_number()); - auto starting_index = starting_index_value.get<int>(); - auto object_count_value = TRY(parse_number()); - auto object_count = object_count_value.get<int>(); - - for (int i = 0; i < object_count; i++) { - auto offset_string = String(m_reader.bytes().slice(m_reader.offset(), 10)); - m_reader.move_by(10); - if (!m_reader.consume(' ')) - return error("Malformed xref entry"); - - auto generation_string = String(m_reader.bytes().slice(m_reader.offset(), 5)); - m_reader.move_by(5); - if (!m_reader.consume(' ')) - return error("Malformed xref entry"); - - auto letter = m_reader.read(); - if (letter != 'n' && letter != 'f') - return error("Malformed xref entry"); - - // The line ending sequence can be one of the following: - // SP CR, SP LF, or CR LF - if (m_reader.matches(' ')) { - m_reader.consume(); - auto ch = m_reader.consume(); - if (ch != '\r' && ch != '\n') - return error("Malformed xref entry"); - } else { - if (!m_reader.matches("\r\n")) - return error("Malformed xref entry"); - m_reader.move_by(2); - } - - auto offset = strtol(offset_string.characters(), nullptr, 10); - auto generation = strtol(generation_string.characters(), nullptr, 10); - - entries.append({ offset, static_cast<u16>(generation), letter == 'n' }); - } - - table->add_section({ starting_index, object_count, entries }); - } while (m_reader.matches_number()); - - return table; -} - -PDFErrorOr<NonnullRefPtr<DictObject>> Parser::parse_file_trailer() -{ - while (m_reader.matches_eol()) - m_reader.consume_eol(); - - if (!m_reader.matches("trailer")) - return error("Expected \"trailer\" keyword"); - m_reader.move_by(7); - m_reader.consume_whitespace(); - auto dict = TRY(parse_dict()); - - if (!m_reader.matches("startxref")) - return error("Expected \"startxref\""); - m_reader.move_by(9); - m_reader.consume_whitespace(); - - m_reader.move_until([&](auto) { return m_reader.matches_eol(); }); - VERIFY(m_reader.consume_eol()); - if (!m_reader.matches("%%EOF")) - return error("Expected \"%%EOF\""); - - m_reader.move_by(5); - m_reader.consume_whitespace(); - return dict; -} - -PDFErrorOr<Parser::PageOffsetHintTable> Parser::parse_page_offset_hint_table(ReadonlyBytes hint_stream_bytes) -{ - if (hint_stream_bytes.size() < sizeof(PageOffsetHintTable)) - return error("Hint stream is too small"); - - size_t offset = 0; - - auto read_u32 = [&] { - u32 data = reinterpret_cast<const u32*>(hint_stream_bytes.data() + offset)[0]; - offset += 4; - return AK::convert_between_host_and_big_endian(data); - }; - - auto read_u16 = [&] { - u16 data = reinterpret_cast<const u16*>(hint_stream_bytes.data() + offset)[0]; - offset += 2; - return AK::convert_between_host_and_big_endian(data); - }; - - PageOffsetHintTable hint_table { - read_u32(), - read_u32(), - read_u16(), - read_u32(), - read_u16(), - read_u32(), - read_u16(), - read_u32(), - read_u16(), - read_u16(), - read_u16(), - read_u16(), - read_u16(), - }; - - // Verify that all of the bits_required_for_xyz fields are <= 32, since all of the numeric - // fields in PageOffsetHintTableEntry are u32 - VERIFY(hint_table.bits_required_for_object_number <= 32); - VERIFY(hint_table.bits_required_for_page_length <= 32); - VERIFY(hint_table.bits_required_for_content_stream_offsets <= 32); - VERIFY(hint_table.bits_required_for_content_stream_length <= 32); - VERIFY(hint_table.bits_required_for_number_of_shared_obj_refs <= 32); - VERIFY(hint_table.bits_required_for_greatest_shared_obj_identifier <= 32); - VERIFY(hint_table.bits_required_for_fraction_numerator <= 32); - - return hint_table; -} - -Vector<Parser::PageOffsetHintTableEntry> Parser::parse_all_page_offset_hint_table_entries(PageOffsetHintTable const& hint_table, ReadonlyBytes hint_stream_bytes) -{ - InputMemoryStream input_stream(hint_stream_bytes); - input_stream.seek(sizeof(PageOffsetHintTable)); - - InputBitStream bit_stream(input_stream); - - auto number_of_pages = m_linearization_dictionary.value().number_of_pages; - Vector<PageOffsetHintTableEntry> entries; - for (size_t i = 0; i < number_of_pages; i++) - entries.append(PageOffsetHintTableEntry {}); - - auto bits_required_for_object_number = hint_table.bits_required_for_object_number; - auto bits_required_for_page_length = hint_table.bits_required_for_page_length; - auto bits_required_for_content_stream_offsets = hint_table.bits_required_for_content_stream_offsets; - auto bits_required_for_content_stream_length = hint_table.bits_required_for_content_stream_length; - auto bits_required_for_number_of_shared_obj_refs = hint_table.bits_required_for_number_of_shared_obj_refs; - auto bits_required_for_greatest_shared_obj_identifier = hint_table.bits_required_for_greatest_shared_obj_identifier; - auto bits_required_for_fraction_numerator = hint_table.bits_required_for_fraction_numerator; - - auto parse_int_entry = [&](u32 PageOffsetHintTableEntry::*field, u32 bit_size) { - if (bit_size <= 0) - return; - - for (int i = 0; i < number_of_pages; i++) { - auto& entry = entries[i]; - entry.*field = bit_stream.read_bits(bit_size); - } - }; - - auto parse_vector_entry = [&](Vector<u32> PageOffsetHintTableEntry::*field, u32 bit_size) { - if (bit_size <= 0) - return; - - for (int page = 1; page < number_of_pages; page++) { - auto number_of_shared_objects = entries[page].number_of_shared_objects; - Vector<u32> items; - items.ensure_capacity(number_of_shared_objects); - - for (size_t i = 0; i < number_of_shared_objects; i++) - items.unchecked_append(bit_stream.read_bits(bit_size)); - - entries[page].*field = move(items); - } - }; - - parse_int_entry(&PageOffsetHintTableEntry::objects_in_page_number, bits_required_for_object_number); - parse_int_entry(&PageOffsetHintTableEntry::page_length_number, bits_required_for_page_length); - parse_int_entry(&PageOffsetHintTableEntry::number_of_shared_objects, bits_required_for_number_of_shared_obj_refs); - parse_vector_entry(&PageOffsetHintTableEntry::shared_object_identifiers, bits_required_for_greatest_shared_obj_identifier); - parse_vector_entry(&PageOffsetHintTableEntry::shared_object_location_numerators, bits_required_for_fraction_numerator); - parse_int_entry(&PageOffsetHintTableEntry::page_content_stream_offset_number, bits_required_for_content_stream_offsets); - parse_int_entry(&PageOffsetHintTableEntry::page_content_stream_length_number, bits_required_for_content_stream_length); - - return entries; -} - -bool Parser::navigate_to_before_eof_marker() -{ - m_reader.set_reading_backwards(); - - while (!m_reader.done()) { - m_reader.move_until([&](auto) { return m_reader.matches_eol(); }); - if (m_reader.done()) - return false; - - m_reader.consume_eol(); - if (!m_reader.matches("%%EOF")) - continue; - - m_reader.move_by(5); - if (!m_reader.matches_eol()) - continue; - m_reader.consume_eol(); - return true; - } - - return false; -} - -bool Parser::navigate_to_after_startxref() -{ - m_reader.set_reading_backwards(); - - while (!m_reader.done()) { - m_reader.move_until([&](auto) { return m_reader.matches_eol(); }); - auto offset = m_reader.offset() + 1; - - m_reader.consume_eol(); - if (!m_reader.matches("startxref")) - continue; - - m_reader.move_by(9); - if (!m_reader.matches_eol()) - continue; - - m_reader.move_to(offset); - return true; - } - - return false; -} - String Parser::parse_comment() { if (!m_reader.matches('%')) @@ -924,55 +428,6 @@ PDFErrorOr<NonnullRefPtr<DictObject>> Parser::parse_dict() return make_object<DictObject>(map); } -PDFErrorOr<RefPtr<DictObject>> Parser::conditionally_parse_page_tree_node(u32 object_index) -{ - VERIFY(m_xref_table->has_object(object_index)); - auto byte_offset = m_xref_table->byte_offset_for_object(object_index); - - m_reader.move_to(byte_offset); - TRY(parse_number()); - TRY(parse_number()); - if (!m_reader.matches("obj")) - return error(String::formatted("Invalid page tree offset {}", object_index)); - - m_reader.move_by(3); - m_reader.consume_whitespace(); - - VERIFY(m_reader.consume('<') && m_reader.consume('<')); - - m_reader.consume_whitespace(); - HashMap<FlyString, Value> map; - - while (true) { - if (m_reader.matches(">>")) - break; - auto name = TRY(parse_name()); - auto name_string = name->name(); - if (!name_string.is_one_of(CommonNames::Type, CommonNames::Parent, CommonNames::Kids, CommonNames::Count)) { - // This is a page, not a page tree node - return RefPtr<DictObject> {}; - } - - auto value = TRY(parse_value()); - if (name_string == CommonNames::Type) { - if (!value.has<NonnullRefPtr<Object>>()) - return RefPtr<DictObject> {}; - auto type_object = value.get<NonnullRefPtr<Object>>(); - if (!type_object->is<NameObject>()) - return RefPtr<DictObject> {}; - auto type_name = type_object->cast<NameObject>(); - if (type_name->name() != CommonNames::Pages) - return RefPtr<DictObject> {}; - } - map.set(name->name(), value); - } - - VERIFY(m_reader.consume('>') && m_reader.consume('>')); - m_reader.consume_whitespace(); - - return make_object<DictObject>(map); -} - PDFErrorOr<NonnullRefPtr<StreamObject>> Parser::parse_stream(NonnullRefPtr<DictObject> dict) { if (!m_reader.matches("stream")) @@ -984,7 +439,7 @@ PDFErrorOr<NonnullRefPtr<StreamObject>> Parser::parse_stream(NonnullRefPtr<DictO ReadonlyBytes bytes; auto maybe_length = dict->get(CommonNames::Length); - if (maybe_length.has_value() && (!maybe_length->has<Reference>() || m_xref_table)) { + if (maybe_length.has_value() && (!maybe_length->has<Reference>())) { // The PDF writer has kindly provided us with the direct length of the stream m_reader.save(); auto length = TRY(m_document->resolve_to<int>(maybe_length.value())); @@ -1080,76 +535,3 @@ Error Parser::error( } } - -namespace AK { - -template<> -struct Formatter<PDF::Parser::LinearizationDictionary> : Formatter<StringView> { - ErrorOr<void> format(FormatBuilder& format_builder, PDF::Parser::LinearizationDictionary const& dict) - { - StringBuilder builder; - builder.append("{\n"sv); - builder.appendff(" length_of_file={}\n", dict.length_of_file); - builder.appendff(" primary_hint_stream_offset={}\n", dict.primary_hint_stream_offset); - builder.appendff(" primary_hint_stream_length={}\n", dict.primary_hint_stream_length); - builder.appendff(" overflow_hint_stream_offset={}\n", dict.overflow_hint_stream_offset); - builder.appendff(" overflow_hint_stream_length={}\n", dict.overflow_hint_stream_length); - builder.appendff(" first_page_object_number={}\n", dict.first_page_object_number); - builder.appendff(" offset_of_first_page_end={}\n", dict.offset_of_first_page_end); - builder.appendff(" number_of_pages={}\n", dict.number_of_pages); - builder.appendff(" offset_of_main_xref_table={}\n", dict.offset_of_main_xref_table); - builder.appendff(" first_page={}\n", dict.first_page); - builder.append('}'); - return Formatter<StringView>::format(format_builder, builder.to_string()); - } -}; - -template<> -struct Formatter<PDF::Parser::PageOffsetHintTable> : Formatter<StringView> { - ErrorOr<void> format(FormatBuilder& format_builder, PDF::Parser::PageOffsetHintTable const& table) - { - StringBuilder builder; - builder.append("{\n"sv); - builder.appendff(" least_number_of_objects_in_a_page={}\n", table.least_number_of_objects_in_a_page); - builder.appendff(" location_of_first_page_object={}\n", table.location_of_first_page_object); - builder.appendff(" bits_required_for_object_number={}\n", table.bits_required_for_object_number); - builder.appendff(" least_length_of_a_page={}\n", table.least_length_of_a_page); - builder.appendff(" bits_required_for_page_length={}\n", table.bits_required_for_page_length); - builder.appendff(" least_offset_of_any_content_stream={}\n", table.least_offset_of_any_content_stream); - builder.appendff(" bits_required_for_content_stream_offsets={}\n", table.bits_required_for_content_stream_offsets); - builder.appendff(" least_content_stream_length={}\n", table.least_content_stream_length); - builder.appendff(" bits_required_for_content_stream_length={}\n", table.bits_required_for_content_stream_length); - builder.appendff(" bits_required_for_number_of_shared_obj_refs={}\n", table.bits_required_for_number_of_shared_obj_refs); - builder.appendff(" bits_required_for_greatest_shared_obj_identifier={}\n", table.bits_required_for_greatest_shared_obj_identifier); - builder.appendff(" bits_required_for_fraction_numerator={}\n", table.bits_required_for_fraction_numerator); - builder.appendff(" shared_object_reference_fraction_denominator={}\n", table.shared_object_reference_fraction_denominator); - builder.append('}'); - return Formatter<StringView>::format(format_builder, builder.to_string()); - } -}; - -template<> -struct Formatter<PDF::Parser::PageOffsetHintTableEntry> : Formatter<StringView> { - ErrorOr<void> format(FormatBuilder& format_builder, PDF::Parser::PageOffsetHintTableEntry const& entry) - { - StringBuilder builder; - builder.append("{\n"sv); - builder.appendff(" objects_in_page_number={}\n", entry.objects_in_page_number); - builder.appendff(" page_length_number={}\n", entry.page_length_number); - builder.appendff(" number_of_shared_objects={}\n", entry.number_of_shared_objects); - builder.append(" shared_object_identifiers=["sv); - for (auto& identifier : entry.shared_object_identifiers) - builder.appendff(" {}", identifier); - builder.append(" ]\n"sv); - builder.append(" shared_object_location_numerators=["sv); - for (auto& numerator : entry.shared_object_location_numerators) - builder.appendff(" {}", numerator); - builder.append(" ]\n"sv); - builder.appendff(" page_content_stream_offset_number={}\n", entry.page_content_stream_offset_number); - builder.appendff(" page_content_stream_length_number={}\n", entry.page_content_stream_length_number); - builder.append('}'); - return Formatter<StringView>::format(format_builder, builder.to_string()); - } -}; - -} diff --git a/Userland/Libraries/LibPDF/Parser.h b/Userland/Libraries/LibPDF/Parser.h index 7fcf82478e..26eb6b9079 100644 --- a/Userland/Libraries/LibPDF/Parser.h +++ b/Userland/Libraries/LibPDF/Parser.h @@ -16,90 +16,23 @@ namespace PDF { +template<typename T, typename... Args> +static NonnullRefPtr<T> make_object(Args... args) requires(IsBaseOf<Object, T>) +{ + return adopt_ref(*new T(forward<Args>(args)...)); +} + class Document; -class Parser final : public RefCounted<Parser> { +class Parser { public: - enum class LinearizationResult { - NotLinearized, - Linearized, - }; - static PDFErrorOr<Vector<Operator>> parse_operators(Document*, ReadonlyBytes); + Parser(ReadonlyBytes); Parser(Document*, ReadonlyBytes); - [[nodiscard]] ALWAYS_INLINE RefPtr<DictObject> const& trailer() const { return m_trailer; } void set_document(WeakPtr<Document> const&); - // Parses the header and initializes the xref table and trailer - PDFErrorOr<void> initialize(); - - PDFErrorOr<Value> parse_object_with_index(u32 index); - - // Specialized version of parse_dict which aborts early if the dict being parsed - // is not a page object - PDFErrorOr<RefPtr<DictObject>> conditionally_parse_page_tree_node(u32 object_index); - -private: - struct LinearizationDictionary { - u32 length_of_file { 0 }; - u32 primary_hint_stream_offset { 0 }; - u32 primary_hint_stream_length { 0 }; - u32 overflow_hint_stream_offset { 0 }; - u32 overflow_hint_stream_length { 0 }; - u32 first_page_object_number { 0 }; - u32 offset_of_first_page_end { 0 }; - u16 number_of_pages { 0 }; - u32 offset_of_main_xref_table { 0 }; - u32 first_page { 0 }; // The page to initially open (I think, the spec isn't all that clear here) - }; - - struct PageOffsetHintTable { - u32 least_number_of_objects_in_a_page { 0 }; - u32 location_of_first_page_object { 0 }; - u16 bits_required_for_object_number { 0 }; - u32 least_length_of_a_page { 0 }; - u16 bits_required_for_page_length { 0 }; - u32 least_offset_of_any_content_stream { 0 }; - u16 bits_required_for_content_stream_offsets { 0 }; - u32 least_content_stream_length { 0 }; - u16 bits_required_for_content_stream_length { 0 }; - u16 bits_required_for_number_of_shared_obj_refs { 0 }; - u16 bits_required_for_greatest_shared_obj_identifier { 0 }; - u16 bits_required_for_fraction_numerator { 0 }; - u16 shared_object_reference_fraction_denominator { 0 }; - }; - - struct PageOffsetHintTableEntry { - u32 objects_in_page_number { 0 }; - u32 page_length_number { 0 }; - u32 number_of_shared_objects { 0 }; - Vector<u32> shared_object_identifiers {}; - Vector<u32> shared_object_location_numerators {}; - u32 page_content_stream_offset_number { 0 }; - u32 page_content_stream_length_number { 0 }; - }; - - friend struct AK::Formatter<LinearizationDictionary>; - friend struct AK::Formatter<PageOffsetHintTable>; - friend struct AK::Formatter<PageOffsetHintTableEntry>; - - explicit Parser(ReadonlyBytes); - - PDFErrorOr<void> parse_header(); - PDFErrorOr<LinearizationResult> initialize_linearization_dict(); - PDFErrorOr<void> initialize_linearized_xref_table(); - PDFErrorOr<void> initialize_non_linearized_xref_table(); - PDFErrorOr<void> initialize_hint_tables(); - PDFErrorOr<PageOffsetHintTable> parse_page_offset_hint_table(ReadonlyBytes hint_stream_bytes); - Vector<PageOffsetHintTableEntry> parse_all_page_offset_hint_table_entries(PageOffsetHintTable const&, ReadonlyBytes hint_stream_bytes); - PDFErrorOr<NonnullRefPtr<XRefTable>> parse_xref_table(); - PDFErrorOr<NonnullRefPtr<DictObject>> parse_file_trailer(); - - bool navigate_to_before_eof_marker(); - bool navigate_to_after_startxref(); - String parse_comment(); PDFErrorOr<Value> parse_value(); @@ -114,9 +47,9 @@ private: PDFErrorOr<NonnullRefPtr<ArrayObject>> parse_array(); PDFErrorOr<NonnullRefPtr<DictObject>> parse_dict(); PDFErrorOr<NonnullRefPtr<StreamObject>> parse_stream(NonnullRefPtr<DictObject> dict); - PDFErrorOr<Vector<Operator>> parse_operators(); +protected: void push_reference(Reference const& ref) { m_current_reference_stack.append(ref); } void pop_reference() { m_current_reference_stack.take_last(); } @@ -130,9 +63,6 @@ private: Reader m_reader; WeakPtr<Document> m_document; - RefPtr<XRefTable> m_xref_table; - RefPtr<DictObject> m_trailer; - Optional<LinearizationDictionary> m_linearization_dictionary; Vector<Reference> m_current_reference_stack; bool m_disable_encryption { false }; }; |