LibPDF: Allow reading documents with incremental updates

The PDF spec allows a document to be updated incrementally by appending a new XRef table and file trailer to it. These contain only the changed objects and point back to the previous update, forming an arbitrarily long chain of XRef sections and file trailers. Any of those XRef sections may also be encoded as an XRef stream, in which case the trailer is part of the stream dictionary as usual. To make this easier to handle, I made it so every XRef table may "own" a trailer. This means that the main file trailer is now part of the main XRef table.
author: Julian Offenhäuser <offenhaeuser@protonmail.com> 2023-02-11 20:39:40 +0100
committer: Linus Groh <mail@linusgroh.de> 2023-02-12 10:55:37 +0000
commit: 34350ee9e7355a564fc94a1bf9e7aac8763ae89a (patch)
tree: f4478ac1197103129cb09b64e9406e61bb6d5036
parent: 0c230f5ff071d842f48d5a1447997531f1cd9b99 (diff)
download: serenity-34350ee9e7355a564fc94a1bf9e7aac8763ae89a.zip
3 files changed, 41 insertions, 25 deletions
diff --git a/Userland/Libraries/LibPDF/DocumentParser.cpp b/Userland/Libraries/LibPDF/DocumentParser.cpp
index c2ae1b2ce6..b4945e3266 100644
--- a/Userland/Libraries/LibPDF/DocumentParser.cpp
+++ b/Userland/Libraries/LibPDF/DocumentParser.cpp
@@ -186,14 +186,12 @@ PDFErrorOr<void> DocumentParser::initialize_linearized_xref_table()
     // The linearization parameter dictionary has just been parsed, and the xref table
     // comes immediately after it. We are in the correct spot.
     m_xref_table = TRY(parse_xref_table());
-    if (!m_trailer)
-        m_trailer = TRY(parse_file_trailer());
 
     // Also parse the main xref table and merge into the first-page xref table. Note
     // that we don't use the main xref table offset from the linearization dict because
     // for some reason, it specified the offset of the whitespace after the object
     // index start and length? So it's much easier to do it this way.
-    auto main_xref_table_offset = m_trailer->get_value(CommonNames::Prev).to_int();
+    auto main_xref_table_offset = m_xref_table->trailer()->get_value(CommonNames::Prev).to_int();
     m_reader.move_to(main_xref_table_offset);
     auto main_xref_table = TRY(parse_xref_table());
     TRY(m_xref_table->merge(move(*main_xref_table)));
@@ -267,15 +265,31 @@ PDFErrorOr<void> DocumentParser::initialize_non_linearized_xref_table()
         return error("No xref");
 
     m_reader.set_reading_forwards();
-    auto xref_offset_value = parse_number();
-    if (xref_offset_value.is_error() || !xref_offset_value.value().has<int>())
-        return error("Invalid xref offset");
-    auto xref_offset = xref_offset_value.value().get<int>();
-
+    auto xref_offset_value = TRY(parse_number());
+    auto xref_offset = TRY(m_document->resolve_to<int>(xref_offset_value));
     m_reader.move_to(xref_offset);
-    m_xref_table = TRY(parse_xref_table());
-    if (!m_trailer)
-        m_trailer = TRY(parse_file_trailer());
+
+    // As per 7.5.6 Incremental Updates:
+    // When a conforming reader reads the file, it shall build its cross-reference
+    // information in such a way that the most recent copy of each object shall be
+    // the one accessed from the file.
+    // NOTE: This means that we have to follow back the chain of XRef table sections
+    //       and only add objects that were not already specified in a previous
+    //       (and thus newer) XRef section.
+    while (1) {
+        auto xref_table = TRY(parse_xref_table());
+        if (!m_xref_table)
+            m_xref_table = xref_table;
+        else
+            TRY(m_xref_table->merge(move(*xref_table)));
+
+        if (!xref_table->trailer() || !xref_table->trailer()->contains(CommonNames::Prev))
+            break;
+
+        auto offset = TRY(m_document->resolve_to<int>(xref_table->trailer()->get_value(CommonNames::Prev)));
+        m_reader.move_to(offset);
+    }
+
     return validate_xref_table_and_fix_if_necessary();
 }
 
@@ -406,7 +420,7 @@ PDFErrorOr<NonnullRefPtr<XRefTable>> DocumentParser::parse_xref_stream()
         }
     }
 
-    m_trailer = dict;
+    table->set_trailer(dict);
 
     return table;
 }
@@ -424,10 +438,7 @@ PDFErrorOr<NonnullRefPtr<XRefTable>> DocumentParser::parse_xref_table()
 
     auto table = adopt_ref(*new XRefTable());
 
-    do {
-        if (m_reader.matches("trailer"))
-            return table;
-
+    while (m_reader.matches_number()) {
         Vector<XRefEntry> entries;
 
         auto starting_index_value = TRY(parse_number());
@@ -470,7 +481,11 @@ PDFErrorOr<NonnullRefPtr<XRefTable>> DocumentParser::parse_xref_table()
         }
 
         table->add_section({ starting_index, object_count, entries });
-    } while (m_reader.matches_number());
+    }
+
+    m_reader.consume_whitespace();
+    if (m_reader.matches("trailer"))
+        table->set_trailer(TRY(parse_file_trailer()));
 
     return table;
 }
diff --git a/Userland/Libraries/LibPDF/DocumentParser.h b/Userland/Libraries/LibPDF/DocumentParser.h
index 0f58821a73..6f94603d36 100644
--- a/Userland/Libraries/LibPDF/DocumentParser.h
+++ b/Userland/Libraries/LibPDF/DocumentParser.h
@@ -20,7 +20,7 @@ public:
         Linearized,
     };
 
-    [[nodiscard]] ALWAYS_INLINE RefPtr<DictObject> const& trailer() const { return m_trailer; }
+    [[nodiscard]] ALWAYS_INLINE RefPtr<DictObject> const& trailer() const { return m_xref_table->trailer(); }
 
     // Parses the header and initializes the xref table and trailer
     PDFErrorOr<void> initialize();
@@ -94,7 +94,6 @@ private:
     bool navigate_to_after_startxref();
 
     RefPtr<XRefTable> m_xref_table;
-    RefPtr<DictObject> m_trailer;
     Optional<LinearizationDictionary> m_linearization_dictionary;
 };
 
diff --git a/Userland/Libraries/LibPDF/XRefTable.h b/Userland/Libraries/LibPDF/XRefTable.h
index ab289f0d2b..387949c833 100644
--- a/Userland/Libraries/LibPDF/XRefTable.h
+++ b/Userland/Libraries/LibPDF/XRefTable.h
@@ -35,7 +35,7 @@ public:
     {
         auto this_size = m_entries.size();
         auto other_size = other.m_entries.size();
-        m_entries.ensure_capacity(other_size);
+        TRY(m_entries.try_ensure_capacity(other_size));
 
         for (size_t i = 0; i < other_size; i++) {
             auto other_entry = other.m_entries[i];
@@ -46,12 +46,9 @@ public:
 
             auto this_entry = m_entries[i];
 
-            if (this_entry.byte_offset == invalid_byte_offset) {
+            // Only add values that we don't already have.
+            if (this_entry.byte_offset == invalid_byte_offset)
                 m_entries[i] = other_entry;
-            } else if (other_entry.byte_offset != invalid_byte_offset) {
-                // Both xref tables have an entry for the same object index
-                return Error { Error::Type::Parse, "Conflicting xref entry during merge" };
-            }
         }
 
         return {};
@@ -68,8 +65,12 @@ public:
             m_entries.append(entry);
     }
 
+    void set_trailer(RefPtr<DictObject> trailer) { m_trailer = trailer; }
+
     ALWAYS_INLINE Vector<XRefEntry>& entries() { return m_entries; }
 
+    ALWAYS_INLINE RefPtr<DictObject> const& trailer() const { return m_trailer; }
+
     [[nodiscard]] ALWAYS_INLINE bool has_object(size_t index) const
     {
         return index < m_entries.size() && m_entries[index].byte_offset != -1;
@@ -113,6 +114,7 @@ private:
     friend struct AK::Formatter<PDF::XRefTable>;
 
     Vector<XRefEntry> m_entries;
+    RefPtr<DictObject> m_trailer;
 };
 
 }
author	Julian Offenhäuser <offenhaeuser@protonmail.com>	2023-02-11 20:39:40 +0100
committer	Linus Groh <mail@linusgroh.de>	2023-02-12 10:55:37 +0000
commit	34350ee9e7355a564fc94a1bf9e7aac8763ae89a (patch)
tree	f4478ac1197103129cb09b64e9406e61bb6d5036
parent	0c230f5ff071d842f48d5a1447997531f1cd9b99 (diff)
download	serenity-34350ee9e7355a564fc94a1bf9e7aac8763ae89a.zip