LibPDF: Initial work on parsing xref streams

Since PDF version 1.5, a document may omit the xref table in favor of a new kind of xref stream object. This is used to reference so-called "compressed" objects that are part of an object stream. With this patch we are able to parse this new kind of xref object, but we'll have to implement object streams to use them correctly.
author: Julian Offenhäuser <metalvoidzz@gmail.com> 2022-08-15 12:04:59 +0200
committer: Sam Atkins <atkinssj@gmail.com> 2022-09-17 10:07:14 +0100
commit: f9beff7b5e6adad71922fc080a89b27f45d544ea (patch)
tree: 3025c844a2e7d7c712d010d9a31c2c3a31dd93a0
parent: 4887aacec7ee4d659e24a3a16ef48b6b42da16ed (diff)
download: serenity-f9beff7b5e6adad71922fc080a89b27f45d544ea.zip
4 files changed, 108 insertions, 4 deletions
diff --git a/Userland/Libraries/LibPDF/CommonNames.h b/Userland/Libraries/LibPDF/CommonNames.h
index 790690d45d..a436d1b36e 100644
--- a/Userland/Libraries/LibPDF/CommonNames.h
+++ b/Userland/Libraries/LibPDF/CommonNames.h
@@ -70,6 +70,7 @@
     A(HTO)                        \
     A(ICCBased)                   \
     A(ID)                         \
+    A(Index)                      \
     A(JBIG2Decode)                \
     A(JPXDecode)                  \
     A(Kids)                       \
diff --git a/Userland/Libraries/LibPDF/DocumentParser.cpp b/Userland/Libraries/LibPDF/DocumentParser.cpp
index e34302928a..0dee963839 100644
--- a/Userland/Libraries/LibPDF/DocumentParser.cpp
+++ b/Userland/Libraries/LibPDF/DocumentParser.cpp
@@ -6,6 +6,7 @@
 
 #include <AK/BitStream.h>
 #include <AK/MemoryStream.h>
+#include <AK/Tuple.h>
 #include <LibPDF/CommonNames.h>
 #include <LibPDF/Document.h>
 #include <LibPDF/DocumentParser.h>
@@ -178,7 +179,8 @@ PDFErrorOr<void> DocumentParser::initialize_linearized_xref_table()
     // The linearization parameter dictionary has just been parsed, and the xref table
     // comes immediately after it. We are in the correct spot.
     m_xref_table = TRY(parse_xref_table());
-    m_trailer = TRY(parse_file_trailer());
+    if (!m_trailer)
+        m_trailer = TRY(parse_file_trailer());
 
     // Also parse the main xref table and merge into the first-page xref table. Note
     // that we don't use the main xref table offset from the linearization dict because
@@ -188,6 +190,7 @@ PDFErrorOr<void> DocumentParser::initialize_linearized_xref_table()
     m_reader.move_to(main_xref_table_offset);
     auto main_xref_table = TRY(parse_xref_table());
     TRY(m_xref_table->merge(move(*main_xref_table)));
+
     return {};
 }
 
@@ -264,14 +267,96 @@ PDFErrorOr<void> DocumentParser::initialize_non_linearized_xref_table()
 
     m_reader.move_to(xref_offset);
     m_xref_table = TRY(parse_xref_table());
-    m_trailer = TRY(parse_file_trailer());
+    if (!m_trailer)
+        m_trailer = TRY(parse_file_trailer());
     return {};
 }
 
+PDFErrorOr<NonnullRefPtr<XRefTable>> DocumentParser::parse_xref_stream() // Parses a cross-reference stream object at the reader position into an XRefTable and stores its dictionary in m_trailer.
+{
+    auto first_number = TRY(parse_number()); // First number of the object header; the value is not used again in this function.
+    auto second_number = TRY(parse_number()); // Second number of the object header; likewise unused afterwards.
+
+    if (!m_reader.matches("obj"))
+        return error("Malformed xref object");
+    m_reader.move_by(3); // Step over the three-character obj keyword matched above.
+    if (m_reader.matches_eol())
+        m_reader.consume_eol();
+
+    auto dict = TRY(parse_dict());
+    auto type = TRY(dict->get_name(m_document, CommonNames::Type))->name();
+    if (type != "XRef")
+        return error("Malformed xref dictionary");
+
+    auto field_sizes = TRY(dict->get_array(m_document, "W")); // Byte width of each of the three fields of an entry (used as slice lengths below).
+    if (field_sizes->size() != 3)
+        return error("Malformed xref dictionary");
+
+    auto object_count = dict->get_value("Size").get<int>(); // NOTE(review): not wrapped in TRY and no visible check that Size exists or holds an int - confirm behavior on malformed input.
+
+    Vector<Tuple<int, int>> subsection_indices; // Each tuple is (first value of an Index pair, second value of the pair minus one).
+    if (dict->contains(CommonNames::Index)) {
+        auto index_array = TRY(dict->get_array(m_document, CommonNames::Index));
+        if (index_array->size() % 2 != 0) // Index entries are consumed two at a time, so an odd length is rejected.
+            return error("Malformed xref dictionary");
+
+        for (size_t i = 0; i < index_array->size(); i += 2)
+            subsection_indices.append({ index_array->at(i).get<int>(), index_array->at(i + 1).get<int>() - 1 });
+    } else {
+        subsection_indices.append({ 0, object_count - 1 }); // No Index key: one subsection starting at 0 and spanning object_count entries.
+    }
+    auto stream = TRY(parse_stream(dict));
+    auto table = adopt_ref(*new XRefTable());
+
+    auto field_to_long = [](Span<const u8> field) -> long { // Combines the bytes of one field into a long, first byte most significant.
+        long value = 0;
+        const u8 max = (field.size() - 1) * 8; // Shift for the first byte; wraps for an empty field, but the loop below then never runs.
+        for (size_t i = 0; i < field.size(); ++i) {
+            value |= static_cast<long>(field[i]) << (max - (i * 8)); // NOTE(review): no upper bound on the field width is checked; widths beyond sizeof(long) would shift past the width of long.
+        }
+        return value;
+    };
+
+    size_t byte_index = 0; // Read cursor into the stream bytes.
+    size_t subsection_index = 0; // Position in subsection_indices of the subsection being filled.
+
+    Vector<XRefEntry> entries; // Entries collected for the current subsection; cleared after each add_section().
+
+    for (int entry_index = 0; entry_index < object_count; ++entry_index) { // NOTE(review): bounded by Size even when Index is present; presumably the encoded entry count is the sum of the Index counts - confirm against the PDF spec.
+        Array<long, 3> fields;
+        for (size_t field_index = 0; field_index < 3; ++field_index) {
+            auto field_size = field_sizes->at(field_index).get_u32();
+            auto field = stream->bytes().slice(byte_index, field_size); // NOTE(review): no visible check that byte_index + field_size stays inside the stream data.
+            fields[field_index] = field_to_long(field);
+            byte_index += field_size;
+        }
+
+        u8 type = fields[0]; // The first field is used as the entry type.
+        if (!field_sizes->at(0).get_u32())
+            type = 1; // A zero-width type field means the type defaults to 1.
+
+        entries.append({ fields[1], static_cast<u16>(fields[2]), type != 0, type == 2 }); // Fills byte_offset, generation_number, in_use (type != 0) and compressed (type == 2).
+
+        auto indices = subsection_indices[subsection_index]; // NOTE(review): subsection_index is incremented below without a bounds check against subsection_indices.size().
+        if (entry_index >= indices.get<1>()) { // NOTE(review): compares the global entry_index with this subsection's (count - 1); looks right only for a single subsection - verify with multi-pair Index arrays.
+            table->add_section({ indices.get<0>(), indices.get<1>(), entries }); // NOTE(review): the second member passed is (count - 1), not count - confirm what XRefSection expects.
+            entries.clear();
+            subsection_index++;
+        }
+    }
+
+    m_trailer = dict; // The stream dictionary is kept as the trailer; the initialize_* callers only call parse_file_trailer() when m_trailer is still unset.
+
+    return table;
+}
+
 PDFErrorOr<NonnullRefPtr<XRefTable>> DocumentParser::parse_xref_table()
 {
-    if (!m_reader.matches("xref"))
-        return error("Expected \"xref\"");
+    if (!m_reader.matches("xref")) {
+        // Since version 1.5, there may be a cross-reference stream instead
+        return parse_xref_stream();
+    }
+
     m_reader.move_by(4);
     if (!m_reader.consume_eol())
         return error("Expected newline after \"xref\"");
diff --git a/Userland/Libraries/LibPDF/DocumentParser.h b/Userland/Libraries/LibPDF/DocumentParser.h
index fef5f40ad4..6b6814bb75 100644
--- a/Userland/Libraries/LibPDF/DocumentParser.h
+++ b/Userland/Libraries/LibPDF/DocumentParser.h
@@ -82,6 +82,7 @@ private:
     PDFErrorOr<void> initialize_hint_tables();
     PDFErrorOr<PageOffsetHintTable> parse_page_offset_hint_table(ReadonlyBytes hint_stream_bytes);
     Vector<PageOffsetHintTableEntry> parse_all_page_offset_hint_table_entries(PageOffsetHintTable const&, ReadonlyBytes hint_stream_bytes);
+    PDFErrorOr<NonnullRefPtr<XRefTable>> parse_xref_stream();
     PDFErrorOr<NonnullRefPtr<XRefTable>> parse_xref_table();
     PDFErrorOr<NonnullRefPtr<DictObject>> parse_file_trailer();
 
diff --git a/Userland/Libraries/LibPDF/XRefTable.h b/Userland/Libraries/LibPDF/XRefTable.h
index 01c77197db..8c5abe14c9 100644
--- a/Userland/Libraries/LibPDF/XRefTable.h
+++ b/Userland/Libraries/LibPDF/XRefTable.h
@@ -19,6 +19,7 @@ struct XRefEntry {
     long byte_offset { invalid_byte_offset };
     u16 generation_number { 0 };
     bool in_use { false };
+    bool compressed { false };
 };
 
 struct XRefSection {
@@ -77,18 +78,34 @@ public:
         return m_entries[index].byte_offset;
     }
 
+    [[nodiscard]] ALWAYS_INLINE long object_stream_for_object(size_t index) const // Returns the same stored value as byte_offset_for_object(index) under a different name.
+    {
+        return byte_offset_for_object(index); // Presumably identifies the containing object stream when is_object_compressed(index) is true - TODO confirm against the PDF spec.
+    }
+
     [[nodiscard]] ALWAYS_INLINE u16 generation_number_for_object(size_t index) const
     {
         VERIFY(has_object(index));
         return m_entries[index].generation_number;
     }
 
+    [[nodiscard]] ALWAYS_INLINE u16 object_stream_index_for_object(size_t index) const // Returns the same stored value as generation_number_for_object(index) under a different name.
+    {
+        return generation_number_for_object(index); // Delegates, so the VERIFY(has_object(index)) there applies; presumably the index inside the object stream for compressed entries - TODO confirm.
+    }
+
     [[nodiscard]] ALWAYS_INLINE bool is_object_in_use(size_t index) const
     {
         VERIFY(has_object(index));
         return m_entries[index].in_use;
     }
 
+    [[nodiscard]] ALWAYS_INLINE bool is_object_compressed(size_t index) const // Returns the compressed flag stored for entry index.
+    {
+        VERIFY(has_object(index)); // Asserts has_object(index) before m_entries is indexed.
+        return m_entries[index].compressed;
+    }
+
 private:
     friend struct AK::Formatter<PDF::XRefTable>;
author	Julian Offenhäuser <metalvoidzz@gmail.com>	2022-08-15 12:04:59 +0200
committer	Sam Atkins <atkinssj@gmail.com>	2022-09-17 10:07:14 +0100
commit	f9beff7b5e6adad71922fc080a89b27f45d544ea (patch)
tree	3025c844a2e7d7c712d010d9a31c2c3a31dd93a0
parent	4887aacec7ee4d659e24a3a16ef48b6b42da16ed (diff)
download	serenity-f9beff7b5e6adad71922fc080a89b27f45d544ea.zip