diff options
author | Julian Offenhäuser <offenhaeuser@protonmail.com> | 2023-02-11 20:39:40 +0100 |
---|---|---|
committer | Linus Groh <mail@linusgroh.de> | 2023-02-12 10:55:37 +0000 |
commit | 34350ee9e7355a564fc94a1bf9e7aac8763ae89a (patch) | |
tree | f4478ac1197103129cb09b64e9406e61bb6d5036 | |
parent | 0c230f5ff071d842f48d5a1447997531f1cd9b99 (diff) | |
download | serenity-34350ee9e7355a564fc94a1bf9e7aac8763ae89a.zip |
LibPDF: Allow reading documents with incremental updates
The PDF spec allows a document to be updated incrementally by appending a
new XRef table and file trailer to it. These contain only the changed
objects and point back to the previous XRef section, forming an
arbitrarily long chain of XRef sections and file trailers.
Any of those XRef sections may instead be encoded as an XRef stream, in
which case the trailer is part of the stream dictionary as usual. To make
this easier to handle, every XRef table may now "own" a trailer. This
means that the main file trailer is now part of the main XRef table.
-rw-r--r-- | Userland/Libraries/LibPDF/DocumentParser.cpp | 49 | ||||
-rw-r--r-- | Userland/Libraries/LibPDF/DocumentParser.h | 3 | ||||
-rw-r--r-- | Userland/Libraries/LibPDF/XRefTable.h | 14 |
3 files changed, 41 insertions, 25 deletions
diff --git a/Userland/Libraries/LibPDF/DocumentParser.cpp b/Userland/Libraries/LibPDF/DocumentParser.cpp index c2ae1b2ce6..b4945e3266 100644 --- a/Userland/Libraries/LibPDF/DocumentParser.cpp +++ b/Userland/Libraries/LibPDF/DocumentParser.cpp @@ -186,14 +186,12 @@ PDFErrorOr<void> DocumentParser::initialize_linearized_xref_table() // The linearization parameter dictionary has just been parsed, and the xref table // comes immediately after it. We are in the correct spot. m_xref_table = TRY(parse_xref_table()); - if (!m_trailer) - m_trailer = TRY(parse_file_trailer()); // Also parse the main xref table and merge into the first-page xref table. Note // that we don't use the main xref table offset from the linearization dict because // for some reason, it specified the offset of the whitespace after the object // index start and length? So it's much easier to do it this way. - auto main_xref_table_offset = m_trailer->get_value(CommonNames::Prev).to_int(); + auto main_xref_table_offset = m_xref_table->trailer()->get_value(CommonNames::Prev).to_int(); m_reader.move_to(main_xref_table_offset); auto main_xref_table = TRY(parse_xref_table()); TRY(m_xref_table->merge(move(*main_xref_table))); @@ -267,15 +265,31 @@ PDFErrorOr<void> DocumentParser::initialize_non_linearized_xref_table() return error("No xref"); m_reader.set_reading_forwards(); - auto xref_offset_value = parse_number(); - if (xref_offset_value.is_error() || !xref_offset_value.value().has<int>()) - return error("Invalid xref offset"); - auto xref_offset = xref_offset_value.value().get<int>(); - + auto xref_offset_value = TRY(parse_number()); + auto xref_offset = TRY(m_document->resolve_to<int>(xref_offset_value)); m_reader.move_to(xref_offset); - m_xref_table = TRY(parse_xref_table()); - if (!m_trailer) - m_trailer = TRY(parse_file_trailer()); + + // As per 7.5.6 Incremental Updates: + // When a conforming reader reads the file, it shall build its cross-reference + // information in such a way that the most 
recent copy of each object shall be + // the one accessed from the file. + // NOTE: This means that we have to follow back the chain of XRef table sections + // and only add objects that were not already specified in a previous + // (and thus newer) XRef section. + while (1) { + auto xref_table = TRY(parse_xref_table()); + if (!m_xref_table) + m_xref_table = xref_table; + else + TRY(m_xref_table->merge(move(*xref_table))); + + if (!xref_table->trailer() || !xref_table->trailer()->contains(CommonNames::Prev)) + break; + + auto offset = TRY(m_document->resolve_to<int>(xref_table->trailer()->get_value(CommonNames::Prev))); + m_reader.move_to(offset); + } + return validate_xref_table_and_fix_if_necessary(); } @@ -406,7 +420,7 @@ PDFErrorOr<NonnullRefPtr<XRefTable>> DocumentParser::parse_xref_stream() } } - m_trailer = dict; + table->set_trailer(dict); return table; } @@ -424,10 +438,7 @@ PDFErrorOr<NonnullRefPtr<XRefTable>> DocumentParser::parse_xref_table() auto table = adopt_ref(*new XRefTable()); - do { - if (m_reader.matches("trailer")) - return table; - + while (m_reader.matches_number()) { Vector<XRefEntry> entries; auto starting_index_value = TRY(parse_number()); @@ -470,7 +481,11 @@ PDFErrorOr<NonnullRefPtr<XRefTable>> DocumentParser::parse_xref_table() } table->add_section({ starting_index, object_count, entries }); - } while (m_reader.matches_number()); + } + + m_reader.consume_whitespace(); + if (m_reader.matches("trailer")) + table->set_trailer(TRY(parse_file_trailer())); return table; } diff --git a/Userland/Libraries/LibPDF/DocumentParser.h b/Userland/Libraries/LibPDF/DocumentParser.h index 0f58821a73..6f94603d36 100644 --- a/Userland/Libraries/LibPDF/DocumentParser.h +++ b/Userland/Libraries/LibPDF/DocumentParser.h @@ -20,7 +20,7 @@ public: Linearized, }; - [[nodiscard]] ALWAYS_INLINE RefPtr<DictObject> const& trailer() const { return m_trailer; } + [[nodiscard]] ALWAYS_INLINE RefPtr<DictObject> const& trailer() const { return m_xref_table->trailer(); } 
// Parses the header and initializes the xref table and trailer PDFErrorOr<void> initialize(); @@ -94,7 +94,6 @@ private: bool navigate_to_after_startxref(); RefPtr<XRefTable> m_xref_table; - RefPtr<DictObject> m_trailer; Optional<LinearizationDictionary> m_linearization_dictionary; }; diff --git a/Userland/Libraries/LibPDF/XRefTable.h b/Userland/Libraries/LibPDF/XRefTable.h index ab289f0d2b..387949c833 100644 --- a/Userland/Libraries/LibPDF/XRefTable.h +++ b/Userland/Libraries/LibPDF/XRefTable.h @@ -35,7 +35,7 @@ public: { auto this_size = m_entries.size(); auto other_size = other.m_entries.size(); - m_entries.ensure_capacity(other_size); + TRY(m_entries.try_ensure_capacity(other_size)); for (size_t i = 0; i < other_size; i++) { auto other_entry = other.m_entries[i]; @@ -46,12 +46,9 @@ public: auto this_entry = m_entries[i]; - if (this_entry.byte_offset == invalid_byte_offset) { + // Only add values that we don't already have. + if (this_entry.byte_offset == invalid_byte_offset) m_entries[i] = other_entry; - } else if (other_entry.byte_offset != invalid_byte_offset) { - // Both xref tables have an entry for the same object index - return Error { Error::Type::Parse, "Conflicting xref entry during merge" }; - } } return {}; @@ -68,8 +65,12 @@ public: m_entries.append(entry); } + void set_trailer(RefPtr<DictObject> trailer) { m_trailer = trailer; } + ALWAYS_INLINE Vector<XRefEntry>& entries() { return m_entries; } + ALWAYS_INLINE RefPtr<DictObject> const& trailer() const { return m_trailer; } + [[nodiscard]] ALWAYS_INLINE bool has_object(size_t index) const { return index < m_entries.size() && m_entries[index].byte_offset != -1; @@ -113,6 +114,7 @@ private: friend struct AK::Formatter<PDF::XRefTable>; Vector<XRefEntry> m_entries; + RefPtr<DictObject> m_trailer; }; } |