summaryrefslogtreecommitdiff
path: root/Userland/Libraries/LibPDF
diff options
context:
space:
mode:
authorMatthew Olsson <matthewcolsson@gmail.com>2021-05-26 22:52:05 -0700
committerAli Mohammad Pur <Ali.mpfard@gmail.com>2021-06-12 22:45:01 +0430
commite23bfd72521258921978134bfef88285dcb2f9ec (patch)
tree80a1f5fbb9047f48ba7d702cc70a49e6f4c9b502 /Userland/Libraries/LibPDF
parentbe1be47613d4c9357df9c172eb45c532650035a5 (diff)
downloadserenity-e23bfd72521258921978134bfef88285dcb2f9ec.zip
LibPDF: Parse linearized PDF files
This is a big step, as most PDFs which are downloaded online will be linearized. Pretty much the only difference is that the xref structure is slightly different.
Diffstat (limited to 'Userland/Libraries/LibPDF')
-rw-r--r--Userland/Libraries/LibPDF/CommonNames.h9
-rw-r--r--Userland/Libraries/LibPDF/Document.h1
-rw-r--r--Userland/Libraries/LibPDF/Object.h10
-rw-r--r--Userland/Libraries/LibPDF/Parser.cpp213
-rw-r--r--Userland/Libraries/LibPDF/Parser.h25
-rw-r--r--Userland/Libraries/LibPDF/Reader.h9
-rw-r--r--Userland/Libraries/LibPDF/Value.h16
-rw-r--r--Userland/Libraries/LibPDF/XRefTable.h32
8 files changed, 270 insertions, 45 deletions
diff --git a/Userland/Libraries/LibPDF/CommonNames.h b/Userland/Libraries/LibPDF/CommonNames.h
index 7b338db8b9..fe4010dc7e 100644
--- a/Userland/Libraries/LibPDF/CommonNames.h
+++ b/Userland/Libraries/LibPDF/CommonNames.h
@@ -20,6 +20,7 @@
V(Crypt) \
V(DCTDecode) \
V(Dest) \
+ V(E) \
V(F) \
V(Filter) \
V(First) \
@@ -32,21 +33,29 @@
V(FitV) \
V(FlateDecode) \
V(Font) \
+ V(H) \
V(JBIG2Decode) \
V(JPXDecode) \
V(Kids) \
+ V(L) \
V(LZWDecode) \
V(Last) \
V(Length) \
+ V(Linearized) \
V(MediaBox) \
+ V(N) \
V(Next) \
+ V(O) \
V(Outlines) \
+ V(P) \
V(Pages) \
V(Parent) \
+ V(Prev) \
V(Resources) \
V(Root) \
V(Rotate) \
V(RunLengthDecode) \
+ V(T) \
V(Title) \
V(Type) \
V(UserUnit) \
diff --git a/Userland/Libraries/LibPDF/Document.h b/Userland/Libraries/LibPDF/Document.h
index 270c8c5b44..3cf5806634 100644
--- a/Userland/Libraries/LibPDF/Document.h
+++ b/Userland/Libraries/LibPDF/Document.h
@@ -12,7 +12,6 @@
#include <LibGfx/Color.h>
#include <LibPDF/Object.h>
#include <LibPDF/Parser.h>
-#include <LibPDF/XRefTable.h>
namespace PDF {
diff --git a/Userland/Libraries/LibPDF/Object.h b/Userland/Libraries/LibPDF/Object.h
index 048ef09f8d..c13a0e4d8d 100644
--- a/Userland/Libraries/LibPDF/Object.h
+++ b/Userland/Libraries/LibPDF/Object.h
@@ -124,11 +124,17 @@ public:
[[nodiscard]] ALWAYS_INLINE const HashMap<FlyString, Value>& map() const { return m_map; }
- ALWAYS_INLINE bool contains(const FlyString& key) const { return m_map.contains(key); }
+ template<typename... Args>
+ bool contains(Args&&... keys) const { return (m_map.contains(keys) && ...); } // True only if m_map contains every one of the given keys (the fold is vacuously true when no keys are passed).
ALWAYS_INLINE Optional<Value> get(const FlyString& key) const { return m_map.get(key); }
- Value get_value(const FlyString& key) const { return get(key).value(); }
+    // Returns the value stored under `key`. The key must be present: a missing
+    // key trips the VERIFY below instead of yielding an empty Value.
+    Value get_value(const FlyString& key) const
+    {
+        auto maybe_value = get(key);
+        VERIFY(maybe_value.has_value());
+        return maybe_value.value();
+    }
NonnullRefPtr<Object> get_object(Document*, const FlyString& key) const;
diff --git a/Userland/Libraries/LibPDF/Parser.cpp b/Userland/Libraries/LibPDF/Parser.cpp
index 858ab7b195..4ed748cd36 100644
--- a/Userland/Libraries/LibPDF/Parser.cpp
+++ b/Userland/Libraries/LibPDF/Parser.cpp
@@ -43,37 +43,33 @@ bool Parser::initialize()
if (!parse_header())
return {};
- m_reader.move_to(m_reader.bytes().size() - 1);
- if (!navigate_to_before_eof_marker())
- return false;
- if (!navigate_to_after_startxref())
- return false;
- if (m_reader.done())
- return false;
+ if (!initialize_linearization_dict())
+ return {};
- m_reader.set_reading_forwards();
- auto xref_offset_value = parse_number();
- if (!xref_offset_value.is_int())
- return false;
- auto xref_offset = xref_offset_value.as_int();
+ bool is_linearized = m_linearization_dictionary.has_value();
+ if (is_linearized) {
+ // The file may have been linearized at one point, but could have been updated afterwards,
+ // which means it is no longer a linearized PDF file.
+ is_linearized = is_linearized && m_linearization_dictionary.value().length_of_file == m_reader.bytes().size();
+
+ if (!is_linearized) {
+ // FIXME: The file shouldn't be treated as linearized, yet the xref tables are still
+ // split. This might take some tweaking to ensure correct behavior, which can be
+ // implemented later.
+ TODO();
+ }
+ }
- m_reader.move_to(xref_offset);
- auto xref_table = parse_xref_table();
- if (!xref_table.has_value())
- return false;
- auto trailer = parse_file_trailer();
- if (!trailer)
- return false;
+ if (is_linearized)
+ return initialize_linearized_xref_table();
- m_xref_table = xref_table.value();
- m_trailer = trailer;
- return true;
+ return initialize_non_linearized_xref_table();
}
Value Parser::parse_object_with_index(u32 index)
{
- VERIFY(m_xref_table.has_object(index));
- auto byte_offset = m_xref_table.byte_offset_for_object(index);
+ VERIFY(m_xref_table->has_object(index));
+ auto byte_offset = m_xref_table->byte_offset_for_object(index);
m_reader.move_to(byte_offset);
auto indirect_value = parse_indirect_value();
VERIFY(indirect_value);
@@ -115,7 +111,135 @@ bool Parser::parse_header()
return true;
}
-Optional<XRefTable> Parser::parse_xref_table()
+// Parses the first indirect object of the file and, if it is a well-formed
+// linearization parameter dictionary, stores its contents in
+// m_linearization_dictionary.
+//
+// Returns false only when the first object cannot be resolved to a dictionary.
+// In every other case it returns true, and m_linearization_dictionary is left
+// empty when the dictionary is missing required keys or fails validation, so
+// callers must check has_value() to learn whether the file is linearized.
+bool Parser::initialize_linearization_dict()
+{
+    // parse_header() is called immediately before this, so we are at the right location
+    auto dict_value = m_document->resolve(parse_indirect_value());
+    if (!dict_value || !dict_value.is_object())
+        return false;
+
+    auto dict_object = dict_value.as_object();
+    if (!dict_object->is_dict())
+        return false;
+
+    auto dict = object_cast<DictObject>(dict_object);
+
+    // Only a dictionary carrying the /Linearized marker plus all mandatory parameters is
+    // a linearization dictionary; anything else is just the first ordinary object.
+    if (!dict->contains(CommonNames::Linearized, CommonNames::L, CommonNames::H, CommonNames::O, CommonNames::E, CommonNames::N, CommonNames::T))
+        return true;
+
+    auto length_of_file = dict->get_value(CommonNames::L);
+    auto hint_table = dict->get_value(CommonNames::H);
+    auto first_page_object_number = dict->get_value(CommonNames::O);
+    auto offset_of_first_page_end = dict->get_value(CommonNames::E);
+    auto number_of_pages = dict->get_value(CommonNames::N);
+    auto offset_of_main_xref_table = dict->get_value(CommonNames::T);
+    auto first_page = dict->get(CommonNames::P).value_or({});
+
+    // Validation. Every value converted with as_int_type<>() below must be checked here,
+    // because as_int_type<>() asserts on a mismatch.
+    if (!length_of_file.is_int_type<u32>()
+        || !hint_table.is_object()
+        || !first_page_object_number.is_int_type<u32>()
+        || !offset_of_first_page_end.is_int_type<u32>()
+        || !number_of_pages.is_int_type<u16>()
+        || !offset_of_main_xref_table.is_int_type<u32>()
+        || (first_page && !first_page.is_int_type<u32>())) {
+        return true;
+    }
+
+    auto hint_table_object = hint_table.as_object();
+    if (!hint_table_object->is_array())
+        return true;
+
+    // The hint table holds one (offset, length) pair for the primary hint stream and,
+    // optionally, a second pair for the overflow hint stream.
+    auto hint_table_array = object_cast<ArrayObject>(hint_table_object);
+    auto hint_table_size = hint_table_array->size();
+    if (hint_table_size != 2 && hint_table_size != 4)
+        return true;
+
+    auto primary_hint_stream_offset = hint_table_array->at(0);
+    auto primary_hint_stream_length = hint_table_array->at(1);
+    Value overflow_hint_stream_offset;
+    Value overflow_hint_stream_length;
+
+    if (hint_table_size == 4) {
+        overflow_hint_stream_offset = hint_table_array->at(2);
+        overflow_hint_stream_length = hint_table_array->at(3);
+    }
+
+    if (!primary_hint_stream_offset.is_int_type<u32>()
+        || !primary_hint_stream_length.is_int_type<u32>()
+        || (overflow_hint_stream_offset && !overflow_hint_stream_offset.is_int_type<u32>())
+        || (overflow_hint_stream_length && !overflow_hint_stream_length.is_int_type<u32>())) {
+        return true;
+    }
+
+    // Optional entries that are absent are recorded as NumericLimits<u32>::max().
+    m_linearization_dictionary = LinearizationDictionary {
+        length_of_file.as_int_type<u32>(),
+        primary_hint_stream_offset.as_int_type<u32>(),
+        primary_hint_stream_length.as_int_type<u32>(),
+        overflow_hint_stream_offset ? overflow_hint_stream_offset.as_int_type<u32>() : NumericLimits<u32>::max(),
+        overflow_hint_stream_length ? overflow_hint_stream_length.as_int_type<u32>() : NumericLimits<u32>::max(),
+        first_page_object_number.as_int_type<u32>(),
+        offset_of_first_page_end.as_int_type<u32>(),
+        number_of_pages.as_int_type<u16>(),
+        offset_of_main_xref_table.as_int_type<u32>(),
+        first_page ? first_page.as_int_type<u32>() : NumericLimits<u32>::max(),
+    };
+
+    return true;
+}
+
+// Builds m_xref_table and m_trailer for a linearized file: parses the first-page
+// xref table and trailer at the current reader position, then follows the
+// trailer's /Prev entry to the main xref table and merges it in.
+//
+// Returns false if either table or the trailer cannot be parsed, if /Prev is
+// missing or is not a usable offset, or if the two tables conflict.
+bool Parser::initialize_linearized_xref_table()
+{
+    // The linearization parameter dictionary has just been parsed, and the xref table
+    // comes immediately after it. We are in the correct spot.
+    if (!m_reader.matches("xref"))
+        return false;
+
+    m_xref_table = parse_xref_table();
+    if (!m_xref_table)
+        return false;
+
+    m_trailer = parse_file_trailer();
+    if (!m_trailer)
+        return false;
+
+    // Also parse the main xref table and merge it into the first-page xref table. Note
+    // that we don't use the main xref table offset from the linearization dict because,
+    // for some reason, it specifies the offset of the whitespace after the object
+    // index start and length. So it's much easier to do it this way.
+    //
+    // The trailer comes from the file, so a missing or malformed /Prev must be treated
+    // as a parse failure rather than asserted on.
+    auto main_xref_table_offset_value = m_trailer->get(CommonNames::Prev);
+    if (!main_xref_table_offset_value.has_value() || !main_xref_table_offset_value.value().is_int())
+        return false;
+
+    auto main_xref_table_offset = main_xref_table_offset_value.value().as_int();
+    if (main_xref_table_offset < 0 || static_cast<size_t>(main_xref_table_offset) >= m_reader.bytes().size())
+        return false;
+
+    m_reader.move_to(main_xref_table_offset);
+    auto main_xref_table = parse_xref_table();
+    if (!main_xref_table)
+        return false;
+
+    return m_xref_table->merge(move(*main_xref_table));
+}
+
+// Builds m_xref_table and m_trailer for a file that is not linearized: starts at
+// the last byte of the file, locates the xref offset via the navigate_to_*()
+// helpers, then parses the xref table and the trailer found at that offset.
+//
+// Returns false as soon as any of these steps fails.
+bool Parser::initialize_non_linearized_xref_table()
+{
+    m_reader.move_to(m_reader.bytes().size() - 1);
+
+    // Both navigation steps must succeed (in this order) and must leave data to read.
+    bool found_xref_offset = navigate_to_before_eof_marker() && navigate_to_after_startxref();
+    if (!found_xref_offset || m_reader.done())
+        return false;
+
+    m_reader.set_reading_forwards();
+    auto offset_value = parse_number();
+    if (!offset_value.is_int())
+        return false;
+
+    m_reader.move_to(offset_value.as_int());
+
+    m_xref_table = parse_xref_table();
+    if (!m_xref_table)
+        return false;
+
+    // The result is simply whether a trailer dictionary was obtained.
+    m_trailer = parse_file_trailer();
+    return m_trailer;
+}
+
+RefPtr<XRefTable> Parser::parse_xref_table()
{
if (!m_reader.matches("xref"))
return {};
@@ -123,11 +247,11 @@ Optional<XRefTable> Parser::parse_xref_table()
if (!consume_eol())
return {};
- XRefTable table;
+ auto table = adopt_ref(*new XRefTable());
while (true) {
if (m_reader.matches("trailer"))
- break;
+ return table;
Vector<XRefEntry> entries;
@@ -170,10 +294,8 @@ Optional<XRefTable> Parser::parse_xref_table()
entries.append({ offset, static_cast<u16>(generation), letter == 'n' });
}
- table.add_section({ starting_index, object_count, entries });
+ table->add_section({ starting_index, object_count, entries });
}
-
- return table;
}
RefPtr<DictObject> Parser::parse_file_trailer()
@@ -195,9 +317,9 @@ RefPtr<DictObject> Parser::parse_file_trailer()
VERIFY(consume_eol());
if (!m_reader.matches("%%EOF"))
return {};
+
m_reader.move_by(5);
consume_whitespace();
-
return dict;
}
@@ -658,8 +780,8 @@ RefPtr<DictObject> Parser::conditionally_parse_page_tree_node(u32 object_index,
{
ok = true;
- VERIFY(m_xref_table.has_object(object_index));
- auto byte_offset = m_xref_table.byte_offset_for_object(object_index);
+ VERIFY(m_xref_table->has_object(object_index));
+ auto byte_offset = m_xref_table->byte_offset_for_object(object_index);
m_reader.move_to(byte_offset);
parse_number();
@@ -866,3 +988,28 @@ bool Parser::consume(char ch)
}
}
+
+namespace AK {
+
+// Formats a LinearizationDictionary as a brace-delimited block with one
+// "name=value" line per field, then hands the text to the StringView formatter.
+template<>
+struct Formatter<PDF::Parser::LinearizationDictionary> : Formatter<StringView> {
+    void format(FormatBuilder& format_builder, const PDF::Parser::LinearizationDictionary& dict)
+    {
+        StringBuilder description;
+
+        description.append("{\n");
+        description.appendff(" length_of_file={}\n", dict.length_of_file);
+        description.appendff(" primary_hint_stream_offset={}\n", dict.primary_hint_stream_offset);
+        description.appendff(" primary_hint_stream_length={}\n", dict.primary_hint_stream_length);
+        description.appendff(" overflow_hint_stream_offset={}\n", dict.overflow_hint_stream_offset);
+        description.appendff(" overflow_hint_stream_length={}\n", dict.overflow_hint_stream_length);
+        description.appendff(" first_page_object_number={}\n", dict.first_page_object_number);
+        description.appendff(" offset_of_first_page_end={}\n", dict.offset_of_first_page_end);
+        description.appendff(" number_of_pages={}\n", dict.number_of_pages);
+        description.appendff(" offset_of_main_xref_table={}\n", dict.offset_of_main_xref_table);
+        description.appendff(" first_page={}\n", dict.first_page);
+        description.append('}');
+
+        Formatter<StringView>::format(format_builder, description.to_string());
+    }
+};
+
+}
diff --git a/Userland/Libraries/LibPDF/Parser.h b/Userland/Libraries/LibPDF/Parser.h
index 6227898a1d..abcc4dad05 100644
--- a/Userland/Libraries/LibPDF/Parser.h
+++ b/Userland/Libraries/LibPDF/Parser.h
@@ -37,10 +37,28 @@ public:
RefPtr<DictObject> conditionally_parse_page_tree_node(u32 object_index, bool& ok);
private:
+ struct LinearizationDictionary { // Validated contents of the linearization parameter dictionary, filled in by initialize_linearization_dict().
+ u32 length_of_file { 0 }; // From CommonNames::L; compared against the actual byte count in initialize().
+ u32 primary_hint_stream_offset { 0 }; // Element 0 of the CommonNames::H array.
+ u32 primary_hint_stream_length { 0 }; // Element 1 of the CommonNames::H array.
+ u32 overflow_hint_stream_offset { 0 }; // Element 2 of the CommonNames::H array; NumericLimits<u32>::max() when the array has only two elements.
+ u32 overflow_hint_stream_length { 0 }; // Element 3 of the CommonNames::H array; NumericLimits<u32>::max() when the array has only two elements.
+ u32 first_page_object_number { 0 }; // From CommonNames::O.
+ u32 offset_of_first_page_end { 0 }; // From CommonNames::E.
+ u16 number_of_pages { 0 }; // From CommonNames::N.
+ u32 offset_of_main_xref_table { 0 }; // From CommonNames::T.
+ u32 first_page { 0 }; // From the optional CommonNames::P; the page to initially open (I think, the spec isn't all that clear here). NumericLimits<u32>::max() when absent.
+ };
+
+ friend struct AK::Formatter<LinearizationDictionary>;
+
explicit Parser(const ReadonlyBytes&);
bool parse_header();
- Optional<XRefTable> parse_xref_table();
+ bool initialize_linearization_dict();
+ bool initialize_linearized_xref_table();
+ bool initialize_non_linearized_xref_table();
+ RefPtr<XRefTable> parse_xref_table();
RefPtr<DictObject> parse_file_trailer();
bool navigate_to_before_eof_marker();
@@ -85,8 +103,9 @@ private:
Reader m_reader;
RefPtr<Document> m_document;
- XRefTable m_xref_table;
+ RefPtr<XRefTable> m_xref_table;
RefPtr<DictObject> m_trailer;
+ Optional<LinearizationDictionary> m_linearization_dictionary;
};
-}
+};
diff --git a/Userland/Libraries/LibPDF/Reader.h b/Userland/Libraries/LibPDF/Reader.h
index 2ba1a475b6..9459340792 100644
--- a/Userland/Libraries/LibPDF/Reader.h
+++ b/Userland/Libraries/LibPDF/Reader.h
@@ -48,11 +48,12 @@ public:
}
}
- char read()
+    // Reads one value of type T (a single char by default) starting at the current
+    // offset and then moves the reader by sizeof(T). T is copied byte for byte, so
+    // it must be a trivially copyable type.
+    template<typename T = char>
+    T read()
{
- auto value = m_bytes.at(m_offset);
- move_by(1);
- return static_cast<char>(value);
+        // offset() hands back a raw pointer, unlike the at() accessor used before,
+        // so make sure the whole value lies inside the buffer.
+        VERIFY(m_offset + sizeof(T) <= m_bytes.size());
+
+        // Copy the bytes instead of dereferencing a T*: the offset is arbitrary, so
+        // the address may not be suitably aligned for T.
+        T value;
+        __builtin_memcpy(&value, m_bytes.offset(m_offset), sizeof(T));
+        move_by(sizeof(T));
+        return value;
}
char peek(size_t shift = 0) const
diff --git a/Userland/Libraries/LibPDF/Value.h b/Userland/Libraries/LibPDF/Value.h
index 432be4750e..d26ac45d71 100644
--- a/Userland/Libraries/LibPDF/Value.h
+++ b/Userland/Libraries/LibPDF/Value.h
@@ -109,6 +109,22 @@ public:
return m_as_int;
}
+    // Returns true if this value is an integer whose value is exactly representable
+    // in the integral type T (i.e. it lies within [NumericLimits<T>::min(),
+    // NumericLimits<T>::max()]).
+    template<typename T>
+    [[nodiscard]] ALWAYS_INLINE bool is_int_type() const
+    {
+        if (!is_int())
+            return false;
+
+        // The range check has to happen on the un-narrowed value: converting m_as_int
+        // to T first always yields something inside T's range. The two halves are
+        // widened separately so that neither a signed nor an unsigned T causes a
+        // mixed-sign comparison.
+        if (m_as_int < 0)
+            return static_cast<i64>(NumericLimits<T>::min()) <= static_cast<i64>(m_as_int);
+        return static_cast<u64>(m_as_int) <= static_cast<u64>(NumericLimits<T>::max());
+    }
+
+ template<typename T>
+ [[nodiscard]] ALWAYS_INLINE T as_int_type() const
+ { // Returns the stored integer converted to T.
+ VERIFY(is_int_type<T>()); // Asserts unless is_int_type<T>() holds, so callers should check that first.
+ return static_cast<T>(m_as_int);
+ }
+
[[nodiscard]] ALWAYS_INLINE int to_int() const
{
if (is_int())
diff --git a/Userland/Libraries/LibPDF/XRefTable.h b/Userland/Libraries/LibPDF/XRefTable.h
index fe084218a3..3cfb617dc8 100644
--- a/Userland/Libraries/LibPDF/XRefTable.h
+++ b/Userland/Libraries/LibPDF/XRefTable.h
@@ -10,8 +10,10 @@
namespace PDF {
+constexpr long invalid_byte_offset = NumericLimits<long>::max();
+
struct XRefEntry {
- long byte_offset { -1L };
+ long byte_offset { invalid_byte_offset };
u16 generation_number { 0 };
bool in_use { false };
};
@@ -22,8 +24,34 @@ struct XRefSection {
Vector<XRefEntry> entries;
};
-class XRefTable {
+class XRefTable final : public RefCounted<XRefTable> {
public:
+    // Folds the entries of `other` into this table, index by index.
+    //
+    // An index this table does not cover yet is appended; an index whose entry
+    // here still has invalid_byte_offset takes over the entry from `other`.
+    // Returns false as soon as both tables hold a valid entry for the same index;
+    // entries merged before that point stay merged. Returns true otherwise.
+    bool merge(XRefTable&& other)
+    {
+        const auto own_count = m_entries.size();
+        const auto incoming_count = other.m_entries.size();
+        m_entries.ensure_capacity(incoming_count);
+
+        for (size_t index = 0; index < incoming_count; ++index) {
+            auto incoming = other.m_entries[index];
+
+            // Past the end of our original entries: nothing to reconcile, just append.
+            if (index >= own_count) {
+                m_entries.unchecked_append(incoming);
+                continue;
+            }
+
+            const bool have_own = m_entries[index].byte_offset != invalid_byte_offset;
+            const bool have_incoming = incoming.byte_offset != invalid_byte_offset;
+
+            if (!have_own) {
+                m_entries[index] = incoming;
+            } else if (have_incoming) {
+                // Both xref tables have an entry for the same object index
+                return false;
+            }
+        }
+
+        return true;
+    }
+
void add_section(const XRefSection& section)
{
m_entries.ensure_capacity(section.starting_index + section.count);