summaryrefslogtreecommitdiff
path: root/Userland/Libraries/LibPDF
diff options
context:
space:
mode:
authorMatthew Olsson <matthewcolsson@gmail.com>2021-05-26 23:08:06 -0700
committerAli Mohammad Pur <Ali.mpfard@gmail.com>2021-06-12 22:45:01 +0430
commitea3abb14fe2eff0ef5d319c2d83358fbb233e4f5 (patch)
treed3d1a80017135608695d694c813941edc771a0b5 /Userland/Libraries/LibPDF
parente23bfd72521258921978134bfef88285dcb2f9ec (diff)
downloadserenity-ea3abb14fe2eff0ef5d319c2d83358fbb233e4f5.zip
LibPDF: Parse hint tables
This code isn't _actually_ used as of right now, but I wrote it at the same time as all of the code in the previous commit. I realized after I wrote it that these hint tables aren't super useful if the parser already has access to the full file. However, this will be useful if we ever want to stream PDFs from the web (and possibly view them in the browser).
Diffstat (limited to 'Userland/Libraries/LibPDF')
-rw-r--r--Userland/Libraries/LibPDF/Parser.cpp222
-rw-r--r--Userland/Libraries/LibPDF/Parser.h31
2 files changed, 253 insertions, 0 deletions
diff --git a/Userland/Libraries/LibPDF/Parser.cpp b/Userland/Libraries/LibPDF/Parser.cpp
index 4ed748cd36..17f678172b 100644
--- a/Userland/Libraries/LibPDF/Parser.cpp
+++ b/Userland/Libraries/LibPDF/Parser.cpp
@@ -4,6 +4,8 @@
* SPDX-License-Identifier: BSD-2-Clause
*/
+#include <AK/BitStream.h>
+#include <AK/MemoryStream.h>
#include <AK/ScopeGuard.h>
#include <AK/TypeCasts.h>
#include <LibPDF/CommonNames.h>
@@ -215,6 +217,71 @@ bool Parser::initialize_linearized_xref_table()
return m_xref_table->merge(move(*main_xref_table));
}
+bool Parser::initialize_hint_tables()
+{
+ auto linearization_dict = m_linearization_dictionary.value();
+ auto primary_offset = linearization_dict.primary_hint_stream_offset;
+ auto overflow_offset = linearization_dict.overflow_hint_stream_offset;
+
+ auto parse_hint_table = [&](size_t offset) -> RefPtr<StreamObject> {
+ m_reader.move_to(offset);
+ auto stream_indirect_value = parse_indirect_value();
+ if (!stream_indirect_value)
+ return {};
+
+ auto stream_value = stream_indirect_value->value();
+ if (!stream_value.is_object())
+ return {};
+
+ auto stream_object = stream_value.as_object();
+ if (!stream_object->is_stream())
+ return {};
+
+ return object_cast<StreamObject>(stream_object);
+ };
+
+ auto primary_hint_stream = parse_hint_table(primary_offset);
+ if (!primary_hint_stream)
+ return false;
+
+ RefPtr<StreamObject> overflow_hint_stream;
+ if (overflow_offset != NumericLimits<u32>::max())
+ overflow_hint_stream = parse_hint_table(overflow_offset);
+
+ ByteBuffer possible_merged_stream_buffer;
+ ReadonlyBytes hint_stream_bytes;
+
+ if (overflow_hint_stream) {
+ auto primary_size = primary_hint_stream->bytes().size();
+ auto overflow_size = overflow_hint_stream->bytes().size();
+ auto total_size = primary_size + overflow_size;
+
+ possible_merged_stream_buffer = ByteBuffer::create_uninitialized(total_size);
+ possible_merged_stream_buffer.append(primary_hint_stream->bytes());
+ possible_merged_stream_buffer.append(overflow_hint_stream->bytes());
+ hint_stream_bytes = possible_merged_stream_buffer.bytes();
+ } else {
+ hint_stream_bytes = primary_hint_stream->bytes();
+ }
+
+ auto hint_table = parse_page_offset_hint_table(hint_stream_bytes);
+ if (!hint_table.has_value())
+ return false;
+
+ dbgln("hint table: {}", hint_table.value());
+
+ auto hint_table_entries = parse_all_page_offset_hint_table_entries(hint_table.value(), hint_stream_bytes);
+ if (!hint_table_entries.has_value())
+ return false;
+
+ auto entries = hint_table_entries.value();
+ dbgln("hint table entries size: {}", entries.size());
+ for (auto& entry : entries)
+ dbgln("{}", entry);
+
+ return true;
+}
+
bool Parser::initialize_non_linearized_xref_table()
{
m_reader.move_to(m_reader.bytes().size() - 1);
@@ -323,6 +390,113 @@ RefPtr<DictObject> Parser::parse_file_trailer()
return dict;
}
+Optional<Parser::PageOffsetHintTable> Parser::parse_page_offset_hint_table(const ReadonlyBytes& hint_stream_bytes)
+{
+ if (hint_stream_bytes.size() < sizeof(PageOffsetHintTable))
+ return {};
+
+ size_t offset = 0;
+
+ auto read_u32 = [&] {
+ u32 data = reinterpret_cast<const u32*>(hint_stream_bytes.data() + offset)[0];
+ offset += 4;
+ return AK::convert_between_host_and_big_endian(data);
+ };
+
+ auto read_u16 = [&] {
+ u16 data = reinterpret_cast<const u16*>(hint_stream_bytes.data() + offset)[0];
+ offset += 2;
+ return AK::convert_between_host_and_big_endian(data);
+ };
+
+ PageOffsetHintTable hint_table {
+ read_u32(),
+ read_u32(),
+ read_u16(),
+ read_u32(),
+ read_u16(),
+ read_u32(),
+ read_u16(),
+ read_u32(),
+ read_u16(),
+ read_u16(),
+ read_u16(),
+ read_u16(),
+ read_u16(),
+ };
+
+ // Verify that all of the bits_required_for_xyz fields are <= 32, since all of the numeric
+ // fields in PageOffsetHintTableEntry are u32
+ VERIFY(hint_table.bits_required_for_object_number <= 32);
+ VERIFY(hint_table.bits_required_for_page_length <= 32);
+ VERIFY(hint_table.bits_required_for_content_stream_offsets <= 32);
+ VERIFY(hint_table.bits_required_for_content_stream_length <= 32);
+ VERIFY(hint_table.bits_required_for_number_of_shared_obj_refs <= 32);
+ VERIFY(hint_table.bits_required_for_greatest_shared_obj_identifier <= 32);
+ VERIFY(hint_table.bits_required_for_fraction_numerator <= 32);
+
+ return hint_table;
+}
+
+Optional<Vector<Parser::PageOffsetHintTableEntry>> Parser::parse_all_page_offset_hint_table_entries(const PageOffsetHintTable& hint_table, const ReadonlyBytes& hint_stream_bytes)
+{
+ InputMemoryStream input_stream(hint_stream_bytes);
+ input_stream.seek(sizeof(PageOffsetHintTable));
+ if (input_stream.has_any_error())
+ return {};
+
+ InputBitStream bit_stream(input_stream);
+
+ auto number_of_pages = m_linearization_dictionary.value().number_of_pages;
+ Vector<PageOffsetHintTableEntry> entries;
+ for (size_t i = 0; i < number_of_pages; i++)
+ entries.append(PageOffsetHintTableEntry {});
+
+ auto bits_required_for_object_number = hint_table.bits_required_for_object_number;
+ auto bits_required_for_page_length = hint_table.bits_required_for_page_length;
+ auto bits_required_for_content_stream_offsets = hint_table.bits_required_for_content_stream_offsets;
+ auto bits_required_for_content_stream_length = hint_table.bits_required_for_content_stream_length;
+ auto bits_required_for_number_of_shared_obj_refs = hint_table.bits_required_for_number_of_shared_obj_refs;
+ auto bits_required_for_greatest_shared_obj_identifier = hint_table.bits_required_for_greatest_shared_obj_identifier;
+ auto bits_required_for_fraction_numerator = hint_table.bits_required_for_fraction_numerator;
+
+ auto parse_int_entry = [&](u32 PageOffsetHintTableEntry::*field, u32 bit_size) {
+ if (bit_size <= 0)
+ return;
+
+ for (int i = 0; i < number_of_pages; i++) {
+ auto& entry = entries[i];
+ entry.*field = bit_stream.read_bits(bit_size);
+ }
+ };
+
+ auto parse_vector_entry = [&](Vector<u32> PageOffsetHintTableEntry::*field, u32 bit_size) {
+ if (bit_size <= 0)
+ return;
+
+ for (int page = 1; page < number_of_pages; page++) {
+ auto number_of_shared_objects = entries[page].number_of_shared_objects;
+ Vector<u32> items;
+ items.ensure_capacity(number_of_shared_objects);
+
+ for (size_t i = 0; i < number_of_shared_objects; i++)
+ items.unchecked_append(bit_stream.read_bits(bit_size));
+
+ entries[page].*field = move(items);
+ }
+ };
+
+ parse_int_entry(&PageOffsetHintTableEntry::objects_in_page_number, bits_required_for_object_number);
+ parse_int_entry(&PageOffsetHintTableEntry::page_length_number, bits_required_for_page_length);
+ parse_int_entry(&PageOffsetHintTableEntry::number_of_shared_objects, bits_required_for_number_of_shared_obj_refs);
+ parse_vector_entry(&PageOffsetHintTableEntry::shared_object_identifiers, bits_required_for_greatest_shared_obj_identifier);
+ parse_vector_entry(&PageOffsetHintTableEntry::shared_object_location_numerators, bits_required_for_fraction_numerator);
+ parse_int_entry(&PageOffsetHintTableEntry::page_content_stream_offset_number, bits_required_for_content_stream_offsets);
+ parse_int_entry(&PageOffsetHintTableEntry::page_content_stream_length_number, bits_required_for_content_stream_length);
+
+ return entries;
+}
+
bool Parser::navigate_to_before_eof_marker()
{
m_reader.set_reading_backwards();
@@ -1012,4 +1186,52 @@ struct Formatter<PDF::Parser::LinearizationDictionary> : Formatter<StringView> {
}
};
+template<>
+struct Formatter<PDF::Parser::PageOffsetHintTable> : Formatter<StringView> {
+ void format(FormatBuilder& format_builder, const PDF::Parser::PageOffsetHintTable& table)
+ {
+ StringBuilder builder;
+ builder.append("{\n");
+ builder.appendff(" least_number_of_objects_in_a_page={}\n", table.least_number_of_objects_in_a_page);
+ builder.appendff(" location_of_first_page_object={}\n", table.location_of_first_page_object);
+ builder.appendff(" bits_required_for_object_number={}\n", table.bits_required_for_object_number);
+ builder.appendff(" least_length_of_a_page={}\n", table.least_length_of_a_page);
+ builder.appendff(" bits_required_for_page_length={}\n", table.bits_required_for_page_length);
+ builder.appendff(" least_offset_of_any_content_stream={}\n", table.least_offset_of_any_content_stream);
+ builder.appendff(" bits_required_for_content_stream_offsets={}\n", table.bits_required_for_content_stream_offsets);
+ builder.appendff(" least_content_stream_length={}\n", table.least_content_stream_length);
+ builder.appendff(" bits_required_for_content_stream_length={}\n", table.bits_required_for_content_stream_length);
+ builder.appendff(" bits_required_for_number_of_shared_obj_refs={}\n", table.bits_required_for_number_of_shared_obj_refs);
+ builder.appendff(" bits_required_for_greatest_shared_obj_identifier={}\n", table.bits_required_for_greatest_shared_obj_identifier);
+ builder.appendff(" bits_required_for_fraction_numerator={}\n", table.bits_required_for_fraction_numerator);
+ builder.appendff(" shared_object_reference_fraction_denominator={}\n", table.shared_object_reference_fraction_denominator);
+ builder.append('}');
+ Formatter<StringView>::format(format_builder, builder.to_string());
+ }
+};
+
+template<>
+struct Formatter<PDF::Parser::PageOffsetHintTableEntry> : Formatter<StringView> {
+ void format(FormatBuilder& format_builder, const PDF::Parser::PageOffsetHintTableEntry& entry)
+ {
+ StringBuilder builder;
+ builder.append("{\n");
+ builder.appendff(" objects_in_page_number={}\n", entry.objects_in_page_number);
+ builder.appendff(" page_length_number={}\n", entry.page_length_number);
+ builder.appendff(" number_of_shared_objects={}\n", entry.number_of_shared_objects);
+ builder.append(" shared_object_identifiers=[");
+ for (auto& identifier : entry.shared_object_identifiers)
+ builder.appendff(" {}", identifier);
+ builder.append(" ]\n");
+ builder.append(" shared_object_location_numerators=[");
+ for (auto& numerator : entry.shared_object_location_numerators)
+ builder.appendff(" {}", numerator);
+ builder.append(" ]\n");
+ builder.appendff(" page_content_stream_offset_number={}\n", entry.page_content_stream_offset_number);
+ builder.appendff(" page_content_stream_length_number={}\n", entry.page_content_stream_length_number);
+ builder.append('}');
+ Formatter<StringView>::format(format_builder, builder.to_string());
+ }
+};
+
}
diff --git a/Userland/Libraries/LibPDF/Parser.h b/Userland/Libraries/LibPDF/Parser.h
index abcc4dad05..e1a0912463 100644
--- a/Userland/Libraries/LibPDF/Parser.h
+++ b/Userland/Libraries/LibPDF/Parser.h
@@ -50,7 +50,35 @@ private:
u32 first_page { 0 }; // The page to initially open (I think, the spec isn't all that clear here)
};
+ struct PageOffsetHintTable {
+ u32 least_number_of_objects_in_a_page { 0 };
+ u32 location_of_first_page_object { 0 };
+ u16 bits_required_for_object_number { 0 };
+ u32 least_length_of_a_page { 0 };
+ u16 bits_required_for_page_length { 0 };
+ u32 least_offset_of_any_content_stream { 0 };
+ u16 bits_required_for_content_stream_offsets { 0 };
+ u32 least_content_stream_length { 0 };
+ u16 bits_required_for_content_stream_length { 0 };
+ u16 bits_required_for_number_of_shared_obj_refs { 0 };
+ u16 bits_required_for_greatest_shared_obj_identifier { 0 };
+ u16 bits_required_for_fraction_numerator { 0 };
+ u16 shared_object_reference_fraction_denominator { 0 };
+ };
+
+ struct PageOffsetHintTableEntry {
+ u32 objects_in_page_number { 0 };
+ u32 page_length_number { 0 };
+ u32 number_of_shared_objects { 0 };
+ Vector<u32> shared_object_identifiers {};
+ Vector<u32> shared_object_location_numerators {};
+ u32 page_content_stream_offset_number { 0 };
+ u32 page_content_stream_length_number { 0 };
+ };
+
friend struct AK::Formatter<LinearizationDictionary>;
+ friend struct AK::Formatter<PageOffsetHintTable>;
+ friend struct AK::Formatter<PageOffsetHintTableEntry>;
explicit Parser(const ReadonlyBytes&);
@@ -58,6 +86,9 @@ private:
bool initialize_linearization_dict();
bool initialize_linearized_xref_table();
bool initialize_non_linearized_xref_table();
+ bool initialize_hint_tables();
+ Optional<PageOffsetHintTable> parse_page_offset_hint_table(const ReadonlyBytes& hint_stream_bytes);
+ Optional<Vector<PageOffsetHintTableEntry>> parse_all_page_offset_hint_table_entries(const PageOffsetHintTable&, const ReadonlyBytes& hint_stream_bytes);
RefPtr<XRefTable> parse_xref_table();
RefPtr<DictObject> parse_file_trailer();