7 files changed, 1008 insertions, 0 deletions
diff --git a/Userland/Libraries/LibPDF/CMakeLists.txt b/Userland/Libraries/LibPDF/CMakeLists.txt
index 15e1562f76..c184d28653 100644
--- a/Userland/Libraries/LibPDF/CMakeLists.txt
+++ b/Userland/Libraries/LibPDF/CMakeLists.txt
@@ -1,5 +1,7 @@
 set(SOURCES
     Object.cpp
+    Document.cpp
+    Parser.cpp
     Value.cpp
     )
 
diff --git a/Userland/Libraries/LibPDF/Document.cpp b/Userland/Libraries/LibPDF/Document.cpp
new file mode 100644
index 0000000000..4934e7a331
--- /dev/null
+++ b/Userland/Libraries/LibPDF/Document.cpp
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2021, Matthew Olsson <mattco@serenityos.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include <LibPDF/Document.h>
+#include <LibPDF/Parser.h>
+
+namespace PDF {
+
+// Builds a document over `bytes`: the input must pass the parser's validation
+// (enforced with VERIFY), after which the last cross-reference table and its
+// trailer dictionary are parsed and stored.
+Document::Document(const ReadonlyBytes& bytes)
+    : m_parser(Parser({}, bytes))
+{
+    VERIFY(m_parser.perform_validation());
+
+    auto parsed = m_parser.parse_last_xref_table_and_trailer();
+    m_xref_table = parsed.xref_table;
+    m_trailer = parsed.trailer;
+}
+
+}
diff --git a/Userland/Libraries/LibPDF/Document.h b/Userland/Libraries/LibPDF/Document.h
new file mode 100644
index 0000000000..b218019968
--- /dev/null
+++ b/Userland/Libraries/LibPDF/Document.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2021, Matthew Olsson <mattco@serenityos.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#pragma once
+
+#include <AK/RefCounted.h>
+#include <LibPDF/Object.h>
+#include <LibPDF/Parser.h>
+#include <LibPDF/XRefTable.h>
+
+namespace PDF {
+
+// A parsed PDF document. Construction validates the input and parses the last
+// cross-reference table and trailer; values are then cached by index in a
+// hash map.
+class Document final : public RefCounted<Document> {
+public:
+    // Parses `bytes`; input that fails validation trips a VERIFY.
+    explicit Document(const ReadonlyBytes& bytes);
+
+    ALWAYS_INLINE const XRefTable& xref_table() const { return m_xref_table; }
+
+    ALWAYS_INLINE const DictObject& trailer() const { return *m_trailer; }
+
+    // Returns the cached value stored under `index`, or a default-constructed
+    // Value when nothing has been stored there.
+    ALWAYS_INLINE Value get_value(u32 index) const
+    {
+        return m_values.get(index).value_or({});
+    }
+
+    // Stores `value` under `index`, replacing any previous entry.
+    ALWAYS_INLINE void set_value(u32 index, const Value& value)
+    {
+        // The map grows on demand. Reserving `index` buckets up front would
+        // confuse an object index with an element count and over-allocate
+        // for sparse, large indices.
+        m_values.set(index, value);
+    }
+
+private:
+    Parser m_parser;
+    XRefTable m_xref_table;
+    RefPtr<DictObject> m_trailer;
+    HashMap<u32, Value> m_values;
+};
+
+}
diff --git a/Userland/Libraries/LibPDF/Parser.cpp b/Userland/Libraries/LibPDF/Parser.cpp
new file mode 100644
index 0000000000..12da69dcb2
--- /dev/null
+++ b/Userland/Libraries/LibPDF/Parser.cpp
@@ -0,0 +1,620 @@
+/*
+ * Copyright (c) 2021, Matthew Olsson <mattco@serenityos.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include <AK/ScopeGuard.h>
+#include <AK/TypeCasts.h>
+#include <LibPDF/Document.h>
+#include <LibPDF/Parser.h>
+#include <ctype.h>
+#include <math.h>
+
+namespace PDF {
+
+// Allocates a T (which must derive from Object) on the heap and adopts it into
+// a NonnullRefPtr. Arguments are taken as forwarding references so that they
+// are passed on to T's constructor without an intermediate copy.
+template<typename T, typename... Args>
+static NonnullRefPtr<T> make_object(Args&&... args) requires(IsBaseOf<Object, T>)
+{
+    return adopt_ref(*new T(forward<Args>(args)...));
+}
+
+// Creates a parser reading from `bytes`. The Badge parameter restricts
+// construction to Document.
+Parser::Parser(Badge<Document>, const ReadonlyBytes& bytes)
+    : m_reader(bytes)
+{
+}
+
+// Accepts the input only if the linearization heuristic does not fire and the
+// header parses. The header is not examined when the heuristic fires.
+bool Parser::perform_validation()
+{
+    if (sloppy_is_linearized())
+        return false;
+    return parse_header();
+}
+
+// Locates the final "startxref" entry by scanning backwards from the end of
+// the input, reads the byte offset that follows it, and parses the
+// cross-reference table and trailer found at that offset.
+// Malformed input trips a VERIFY.
+Parser::XRefTableAndTrailer Parser::parse_last_xref_table_and_trailer()
+{
+    m_reader.move_to(m_reader.bytes().size() - 1);
+    VERIFY(navigate_to_before_eof_marker());
+    // Check the result explicitly, like the %%EOF search above, instead of
+    // relying on the reader's state afterwards.
+    VERIFY(navigate_to_after_startxref());
+    VERIFY(!m_reader.done());
+
+    m_reader.set_reading_forwards();
+    auto xref_offset_value = parse_number();
+    VERIFY(xref_offset_value.is_int());
+    auto xref_offset = xref_offset_value.as_int();
+    // A negative offset must not be converted into a huge unsigned position.
+    VERIFY(xref_offset >= 0);
+
+    m_reader.move_to(static_cast<size_t>(xref_offset));
+    auto xref_table = parse_xref_table();
+    auto trailer = parse_file_trailer();
+
+    return { xref_table, trailer };
+}
+
+// Validates the "%PDF-M.m" header at the start of the input, where M is 1 or 2
+// and m is a digit from 0 to 7, followed by an end-of-line. Any comment line
+// that follows is consumed. Returns false if the header is malformed.
+bool Parser::parse_header()
+{
+    // FIXME: Do something with the version?
+    m_reader.set_reading_forwards();
+    m_reader.move_to(0);
+    if (m_reader.remaining() < 8 || !m_reader.matches("%PDF-"))
+        return false;
+    m_reader.move_by(5);
+
+    char major_ver = m_reader.read();
+    if (major_ver != '1' && major_ver != '2')
+        return false;
+    if (m_reader.read() != '.')
+        return false;
+
+    // The upper bound must be checked against the minor version; comparing
+    // the major version here would accept any character >= '0'.
+    char minor_ver = m_reader.read();
+    if (minor_ver < '0' || minor_ver > '7')
+        return false;
+
+    // Report a missing line terminator as a validation failure rather than
+    // tripping the VERIFY inside consume_eol().
+    if (!matches_eol())
+        return false;
+    consume_eol();
+
+    // Parse optional high-byte comment, which signifies a binary file
+    // FIXME: Do something with this?
+    auto comment = parse_comment();
+    if (!comment.is_empty()) {
+        auto binary = comment.length() >= 4;
+        if (binary) {
+            // "High" bytes are those with a value of 128 or greater.
+            for (size_t i = 0; i < comment.length() && binary; i++)
+                binary = static_cast<u8>(comment[i]) >= 128;
+        }
+    }
+
+    return true;
+}
+
+// Parses a cross-reference table starting at the "xref" keyword and ending
+// just before the "trailer" keyword. Each subsection consists of a
+// "<starting index> <object count>" header followed by that many entries.
+// Malformed input trips a VERIFY.
+XRefTable Parser::parse_xref_table()
+{
+    VERIFY(m_reader.matches("xref"));
+    m_reader.move_by(4);
+    consume_eol();
+
+    XRefTable table;
+
+    // One iteration per subsection; the table ends at the "trailer" keyword.
+    while (true) {
+        if (m_reader.matches("trailer"))
+            break;
+
+        Vector<XRefEntry> entries;
+
+        auto starting_index_value = parse_number();
+        auto starting_index = starting_index_value.as_int();
+        auto object_count_value = parse_number();
+        auto object_count = object_count_value.as_int();
+
+        for (int i = 0; i < object_count; i++) {
+            // An entry is a 10-character offset, a space, a 5-character
+            // generation number, a space, a type letter, and a two-byte
+            // line ending.
+            auto offset_string = String(m_reader.bytes().slice(m_reader.offset(), 10));
+            m_reader.move_by(10);
+            consume(' ');
+
+            auto generation_string = String(m_reader.bytes().slice(m_reader.offset(), 5));
+            m_reader.move_by(5);
+            consume(' ');
+
+            // 'n' marks the entry as in use (see the append below); the only
+            // other accepted letter is 'f'.
+            auto letter = m_reader.read();
+            VERIFY(letter == 'n' || letter == 'f');
+
+            // The line ending sequence can be one of the following:
+            // SP CR, SP LF, or CR LF
+            if (m_reader.matches(' ')) {
+                consume();
+                auto ch = consume();
+                VERIFY(ch == '\r' || ch == '\n');
+            } else {
+                VERIFY(m_reader.matches("\r\n"));
+                m_reader.move_by(2);
+            }
+
+            // NOTE(review): strtol() results are not validated here, so
+            // non-numeric fields silently become 0.
+            auto offset = strtol(offset_string.characters(), nullptr, 10);
+            auto generation = strtol(generation_string.characters(), nullptr, 10);
+
+            entries.append({ offset, static_cast<u16>(generation), letter == 'n' });
+        }
+
+        table.add_section({ starting_index, object_count, entries });
+    }
+
+    return table;
+}
+
+// Parses the file trailer: the "trailer" keyword, the trailer dictionary, the
+// "startxref" keyword with the line that follows it, and the "%%EOF" marker.
+// Nothing but whitespace may follow the marker. Returns the trailer
+// dictionary; malformed input trips a VERIFY.
+NonnullRefPtr<DictObject> Parser::parse_file_trailer()
+{
+    VERIFY(m_reader.matches("trailer"));
+    m_reader.move_by(7);
+    consume_whitespace();
+    auto dict = parse_dict();
+
+    VERIFY(m_reader.matches("startxref"));
+    m_reader.move_by(9);
+    consume_whitespace();
+
+    // Skip the rest of the line after "startxref" without interpreting it.
+    m_reader.move_until([&](auto) { return matches_eol(); });
+    consume_eol();
+    VERIFY(m_reader.matches("%%EOF"));
+    m_reader.move_by(5);
+    consume_whitespace();
+    VERIFY(m_reader.done());
+
+    return dict;
+}
+
+// Scans backwards from the current position for a "%%EOF" marker that has a
+// line ending on both sides. On success the reader has consumed the marker and
+// the line ending in front of it, and true is returned. Returns false if the
+// start of the input is reached first. Leaves the reader in backwards mode.
+bool Parser::navigate_to_before_eof_marker()
+{
+    m_reader.set_reading_backwards();
+
+    while (!m_reader.done()) {
+        // Each candidate is located via the line ending that follows it, so a
+        // marker with no line ending after it is never considered.
+        m_reader.move_until([&](auto) { return matches_eol(); });
+        if (m_reader.done())
+            return false;
+
+        consume_eol();
+        // In backwards mode Reader::matches() compares the reversed needle,
+        // so this checks for text that reads "%%EOF" in file order.
+        if (!m_reader.matches("%%EOF"))
+            continue;
+
+        m_reader.move_by(5);
+        if (!matches_eol())
+            continue;
+        consume_eol();
+        return true;
+    }
+
+    return false;
+}
+
+// Scans backwards for a "startxref" keyword that has a line ending on both
+// sides. On success the reader is moved to the byte just after the line ending
+// that follows the keyword, and true is returned. Returns false if the start
+// of the input is reached first. Leaves the reader in backwards mode.
+bool Parser::navigate_to_after_startxref()
+{
+    m_reader.set_reading_backwards();
+
+    while (!m_reader.done()) {
+        m_reader.move_until([&](auto) { return matches_eol(); });
+        // Remember the byte after this line-ending character; this is where
+        // the reader is left if the keyword is found in front of it.
+        auto offset = m_reader.offset() + 1;
+
+        // NOTE(review): unlike navigate_to_before_eof_marker(), there is no
+        // done() check between move_until() and consume_eol() here.
+        consume_eol();
+        if (!m_reader.matches("startxref"))
+            continue;
+
+        m_reader.move_by(9);
+        if (!matches_eol())
+            continue;
+
+        m_reader.move_to(offset);
+        return true;
+    }
+
+    return false;
+}
+
+// Context-free heuristic: returns true if the text "/Linearized" starts at or
+// before offset 1024 (or the last byte, if the input is shorter).
+// The reader is reset to offset 0 in forwards mode on exit.
+bool Parser::sloppy_is_linearized()
+{
+    ScopeGuard guard([&] {
+        m_reader.move_to(0);
+        m_reader.set_reading_forwards();
+    });
+
+    auto limit = min(1024ul, m_reader.bytes().size() - 1);
+
+    // Scan forwards. In backwards mode Reader::matches() compares the
+    // reversed needle, so a backwards scan that stops on '/' could never
+    // match "/Linearized" and this function would always return false.
+    m_reader.move_to(0);
+    m_reader.set_reading_forwards();
+
+    while (!m_reader.done()) {
+        m_reader.move_until('/');
+        if (m_reader.done() || m_reader.offset() > limit)
+            break;
+        if (m_reader.matches("/Linearized"))
+            return true;
+        m_reader.move_by(1);
+    }
+
+    return false;
+}
+
+// If the reader is positioned on '%', consumes the comment up to and including
+// its line ending plus any whitespace after it, and returns the comment text
+// without the leading '%'. Otherwise consumes nothing and returns an empty
+// String.
+String Parser::parse_comment()
+{
+    if (!m_reader.matches('%'))
+        return {};
+
+    consume();
+    auto comment_start_offset = m_reader.offset();
+    // Same predicate shape as every other move_until() call in this file.
+    m_reader.move_until([&](auto) {
+        return matches_eol();
+    });
+    String str = StringView(m_reader.bytes().slice(comment_start_offset, m_reader.offset() - comment_start_offset));
+    // A comment may run to the very end of the input; in that case there is
+    // no line ending to consume.
+    if (!m_reader.done())
+        consume_eol();
+    consume_whitespace();
+    return str;
+}
+
+// Parses one value at the current position, skipping a leading comment first.
+// Handles null, booleans, numbers (including indirect values and references),
+// names, dictionaries (optionally followed by a stream), strings, and arrays.
+// Anything else is logged and trips VERIFY_NOT_REACHED.
+Value Parser::parse_value()
+{
+    parse_comment();
+
+    if (m_reader.matches("null")) {
+        m_reader.move_by(4);
+        consume_whitespace();
+        return Value();
+    }
+
+    if (m_reader.matches("true")) {
+        m_reader.move_by(4);
+        consume_whitespace();
+        return Value(true);
+    }
+
+    if (m_reader.matches("false")) {
+        m_reader.move_by(5);
+        consume_whitespace();
+        return Value(false);
+    }
+
+    if (matches_number())
+        return parse_possible_indirect_value_or_ref();
+
+    if (m_reader.matches('/'))
+        return parse_name();
+
+    if (m_reader.matches("<<")) {
+        auto dict = parse_dict();
+        // The "stream" keyword may be followed by either LF or CR LF;
+        // parse_stream() consumes whichever line ending is present.
+        if (m_reader.matches("stream"))
+            return parse_stream(dict);
+        return dict;
+    }
+
+    if (m_reader.matches_any('(', '<'))
+        return parse_string();
+
+    if (m_reader.matches('['))
+        return parse_array();
+
+    dbgln("tried to parse value, but found char {} ({}) at offset {}", m_reader.peek(), static_cast<u8>(m_reader.peek()), m_reader.offset());
+    VERIFY_NOT_REACHED();
+}
+
+// Parses a number and then looks ahead to see whether it starts an indirect
+// reference ("<int> <int> R") or an indirect value ("<int> <int> obj").
+// If it does, that construct is parsed and returned; otherwise the reader is
+// rewound to just after the first number, which is returned on its own.
+Value Parser::parse_possible_indirect_value_or_ref()
+{
+    auto leading = parse_number();
+    bool could_be_indirect = leading.is_int() && matches_number();
+    if (!could_be_indirect)
+        return leading;
+
+    // Remember where the second number starts so the lookahead can be undone.
+    m_reader.save();
+    auto trailing = parse_number();
+
+    if (trailing.is_int()) {
+        if (m_reader.matches('R')) {
+            m_reader.discard();
+            consume();
+            consume_whitespace();
+            return make_object<IndirectValueRef>(leading.as_int(), trailing.as_int());
+        }
+
+        if (m_reader.matches("obj")) {
+            m_reader.discard();
+            return parse_indirect_value(leading.as_int(), trailing.as_int());
+        }
+    }
+
+    // Just two adjacent numbers (or an int followed by a float).
+    m_reader.load();
+    return leading;
+}
+
+// Parses the remainder of an indirect value, starting at the "obj" keyword:
+// "obj", the contained value (which must be an object), and "endobj".
+// `index` and `generation` are the two integers that preceded "obj".
+// Malformed input trips a VERIFY.
+NonnullRefPtr<IndirectValue> Parser::parse_indirect_value(int index, int generation)
+{
+    VERIFY(m_reader.matches("obj"));
+    m_reader.move_by(3);
+    // Any whitespace may separate "obj" from the value, not just a line
+    // ending (e.g. "1 0 obj << ... >>").
+    consume_whitespace();
+    auto value = parse_value();
+    VERIFY(value.is_object());
+    VERIFY(m_reader.matches("endobj"));
+    // Step over the keyword itself; without this the reader would still be
+    // on the 'e' of "endobj" and no whitespace could ever be consumed.
+    m_reader.move_by(6);
+    VERIFY(consume_whitespace() || m_reader.done());
+
+    return make_object<IndirectValue>(index, generation, value.as_object());
+}
+
+// Parses an optionally signed number made of digits with at most one '.'.
+// Returns a float Value if a '.' was present, otherwise an int Value.
+// Trailing whitespace is consumed in both cases.
+Value Parser::parse_number()
+{
+    size_t start_offset = m_reader.offset();
+    bool is_float = false;
+
+    if (m_reader.matches('+') || m_reader.matches('-'))
+        consume();
+
+    while (!m_reader.done()) {
+        if (m_reader.matches('.')) {
+            if (is_float)
+                break;
+            is_float = true;
+            consume();
+        } else if (isdigit(static_cast<unsigned char>(m_reader.peek()))) {
+            consume();
+        } else {
+            break;
+        }
+    }
+
+    auto string = String(m_reader.bytes().slice(start_offset, m_reader.offset() - start_offset));
+
+    // Skip trailing whitespace for floats as well as ints, so that the next
+    // token starts at the reader position either way.
+    consume_whitespace();
+
+    if (is_float)
+        return Value(strtof(string.characters(), nullptr));
+
+    // Parse integers as integers: a round trip through float loses precision
+    // above 2^24, which corrupts byte offsets in larger files.
+    return Value(static_cast<int>(strtol(string.characters(), nullptr, 10)));
+}
+
+// Parses a name object: a '/' followed by regular characters, where "#xx"
+// encodes the byte with hexadecimal value xx. The name ends at whitespace, at
+// a delimiter character, or at the end of the input. Trailing whitespace is
+// consumed.
+NonnullRefPtr<NameObject> Parser::parse_name()
+{
+    consume('/');
+    StringBuilder builder;
+
+    // Accepts both upper- and lowercase hexadecimal digits.
+    auto hex_digit_value = [](char ch) -> int {
+        VERIFY(isxdigit(static_cast<unsigned char>(ch)));
+        if (ch <= '9')
+            return ch - '0';
+        return tolower(static_cast<unsigned char>(ch)) - 'a' + 10;
+    };
+
+    while (!m_reader.done()) {
+        // Delimiters end a name just like whitespace does, e.g. in
+        // "/Type/Page", "/Kids[...]" or "<</A /B>>".
+        if (matches_whitespace() || m_reader.matches_any('(', ')', '<', '>', '[', ']', '{', '}', '/', '%'))
+            break;
+
+        if (m_reader.matches('#')) {
+            // Skip the '#' itself before reading the two hex digits.
+            consume();
+            int hex_value = hex_digit_value(consume()) * 16;
+            hex_value += hex_digit_value(consume());
+            builder.append(static_cast<char>(hex_value));
+            continue;
+        }
+
+        builder.append(consume());
+    }
+
+    consume_whitespace();
+
+    return make_object<NameObject>(builder.to_string());
+}
+
+// Parses either a literal string "(...)" or a hex string "<...>", chosen by
+// the character under the reader, then consumes trailing whitespace. The
+// boolean passed to StringObject is true for hex strings.
+NonnullRefPtr<StringObject> Parser::parse_string()
+{
+    bool is_literal = m_reader.matches('(');
+    auto contents = is_literal ? parse_literal_string() : parse_hex_string();
+    auto string_object = make_object<StringObject>(contents, !is_literal);
+
+    consume_whitespace();
+    return string_object;
+}
+
+// Parses a literal string delimited by parentheses and returns its decoded
+// contents. Balanced nested parentheses are kept, backslash escapes are
+// decoded (including one to three digit octal codes), a backslash before a
+// line ending continues the line, and a bare line ending becomes '\n'.
+String Parser::parse_literal_string()
+{
+    consume('(');
+    StringBuilder builder;
+    auto opened_parens = 0;
+
+    while (true) {
+        if (m_reader.matches('(')) {
+            opened_parens++;
+            builder.append(consume());
+        } else if (m_reader.matches(')')) {
+            consume();
+            if (opened_parens == 0)
+                break;
+            opened_parens--;
+            builder.append(')');
+        } else if (m_reader.matches('\\')) {
+            consume();
+            if (matches_eol()) {
+                consume_eol();
+                continue;
+            }
+
+            VERIFY(!m_reader.done());
+            auto ch = consume();
+            switch (ch) {
+            case 'n':
+                builder.append('\n');
+                break;
+            case 'r':
+                builder.append('\r');
+                break;
+            case 't':
+                builder.append('\t');
+                break;
+            case 'b':
+                builder.append('\b');
+                break;
+            case 'f':
+                builder.append('\f');
+                break;
+            case '(':
+                builder.append('(');
+                break;
+            case ')':
+                builder.append(')');
+                break;
+            case '\\':
+                builder.append('\\');
+                break;
+            default: {
+                if (ch >= '0' && ch <= '7') {
+                    int octal_value = ch - '0';
+                    for (int i = 0; i < 2; i++) {
+                        // Look before consuming: the character that ends a
+                        // short octal escape belongs to the string (or is its
+                        // closing parenthesis) and must not be dropped.
+                        if (m_reader.done())
+                            break;
+                        auto octal_ch = m_reader.peek();
+                        if (octal_ch < '0' || octal_ch > '7')
+                            break;
+                        consume();
+                        octal_value = octal_value * 8 + (octal_ch - '0');
+                    }
+                    builder.append(static_cast<char>(octal_value));
+                } else {
+                    builder.append(ch);
+                }
+            }
+            }
+        } else if (matches_eol()) {
+            consume_eol();
+            builder.append('\n');
+        } else {
+            builder.append(consume());
+        }
+    }
+
+    VERIFY(opened_parens == 0);
+    return builder.to_string();
+}
+
+// Parses a hexadecimal string "<...>" and returns the decoded bytes. Upper-
+// and lowercase digits are accepted, whitespace between digits is ignored,
+// and an odd number of digits is treated as if a final '0' were present.
+String Parser::parse_hex_string()
+{
+    consume('<');
+    StringBuilder builder;
+
+    auto hex_digit_value = [](char ch) -> int {
+        VERIFY(isxdigit(static_cast<unsigned char>(ch)));
+        if (ch <= '9')
+            return ch - '0';
+        return tolower(static_cast<unsigned char>(ch)) - 'a' + 10;
+    };
+
+    while (true) {
+        consume_whitespace();
+        if (m_reader.matches('>')) {
+            consume();
+            return builder.to_string();
+        }
+
+        int hex_value = hex_digit_value(consume()) * 16;
+
+        consume_whitespace();
+        if (m_reader.matches('>')) {
+            // The hex string contains an odd number of characters, and the last character
+            // is assumed to be '0'. Only the '>' itself is consumed here.
+            consume();
+            builder.append(static_cast<char>(hex_value));
+            return builder.to_string();
+        }
+
+        hex_value += hex_digit_value(consume());
+        builder.append(static_cast<char>(hex_value));
+    }
+}
+
+// Parses an array "[ value... ]" and consumes the whitespace after it.
+NonnullRefPtr<ArrayObject> Parser::parse_array()
+{
+    consume('[');
+    consume_whitespace();
+
+    Vector<Value> elements;
+    for (;;) {
+        if (m_reader.matches(']'))
+            break;
+        elements.append(parse_value());
+    }
+
+    consume(']');
+    consume_whitespace();
+
+    return make_object<ArrayObject>(elements);
+}
+
+// Parses a dictionary "<< /Name value ... >>" and consumes the whitespace
+// after it. A later occurrence of a key replaces an earlier one.
+NonnullRefPtr<DictObject> Parser::parse_dict()
+{
+    consume('<');
+    consume('<');
+    consume_whitespace();
+
+    HashMap<FlyString, Value> entries;
+    while (!m_reader.matches(">>")) {
+        auto key = parse_name();
+        auto entry_value = parse_value();
+        entries.set(key->name(), entry_value);
+    }
+
+    consume('>');
+    consume('>');
+    consume_whitespace();
+
+    return make_object<DictObject>(entries);
+}
+
+// Parses a stream that follows its dictionary `dict`: the "stream" keyword, a
+// line ending, exactly /Length bytes of data, and the "endstream" keyword.
+// The returned object refers to a slice of the input; nothing is copied here.
+// /Length must be present as a direct, non-negative integer, otherwise a
+// VERIFY trips.
+NonnullRefPtr<StreamObject> Parser::parse_stream(NonnullRefPtr<DictObject> dict)
+{
+    VERIFY(m_reader.matches("stream"));
+    m_reader.move_by(6);
+    consume_eol();
+
+    auto length_value = dict->map().get("Length");
+    VERIFY(length_value.has_value());
+    auto length = length_value.value();
+    VERIFY(length.is_int());
+    VERIFY(length.as_int() >= 0);
+
+    auto bytes = m_reader.bytes().slice(m_reader.offset(), length.as_int());
+
+    // Step over the data and the closing keyword so the caller resumes after
+    // the stream rather than in the middle of its contents.
+    m_reader.move_by(length.as_int());
+    consume_whitespace();
+    VERIFY(m_reader.matches("endstream"));
+    m_reader.move_by(9);
+    consume_whitespace();
+
+    return make_object<StreamObject>(dict, bytes);
+}
+
+// True if the byte under the reader is a line feed or a carriage return.
+bool Parser::matches_eol() const
+{
+    return m_reader.matches_any('\n', '\r');
+}
+
+// True if the byte under the reader is a line ending, NUL, horizontal tab,
+// form feed, or space.
+bool Parser::matches_whitespace() const
+{
+    if (matches_eol())
+        return true;
+    return m_reader.matches_any('\0', '\t', '\f', ' ');
+}
+
+// True if the byte under the reader can start a number: a digit, a sign, or a
+// leading decimal point (as in ".5").
+bool Parser::matches_number() const
+{
+    if (m_reader.done())
+        return false;
+    auto ch = m_reader.peek();
+    // isdigit() must not be handed a negative value, so go through
+    // unsigned char for bytes >= 0x80.
+    return isdigit(static_cast<unsigned char>(ch)) || ch == '-' || ch == '+' || ch == '.';
+}
+
+// Consumes one line ending: CR LF as a pair, or a lone CR or LF.
+// Trips a VERIFY if the reader is not positioned on a line ending.
+void Parser::consume_eol()
+{
+    if (m_reader.matches("\r\n")) {
+        // Advance by two bytes. Calling consume(2) would select consume(char)
+        // and demand that the next byte equals 0x02.
+        m_reader.move_by(2);
+    } else {
+        auto consumed = consume();
+        VERIFY(consumed == 0xd || consumed == 0xa);
+    }
+}
+
+// Skips over any run of whitespace and reports whether at least one byte was
+// skipped.
+bool Parser::consume_whitespace()
+{
+    bool consumed_any = false;
+    for (; matches_whitespace(); consume())
+        consumed_any = true;
+    return consumed_any;
+}
+
+// Reads the byte under the reader, advances by one in the current reading
+// direction, and returns the byte.
+char Parser::consume()
+{
+    return m_reader.read();
+}
+
+// Consumes one byte and VERIFYs that it equals `ch`.
+// Note that the argument is the expected character, not a count.
+void Parser::consume(char ch)
+{
+    VERIFY(consume() == ch);
+}
+
+}
diff --git a/Userland/Libraries/LibPDF/Parser.h b/Userland/Libraries/LibPDF/Parser.h
new file mode 100644
index 0000000000..c983628e49
--- /dev/null
+++ b/Userland/Libraries/LibPDF/Parser.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2021, Matthew Olsson <mattco@serenityos.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#pragma once
+
+#include <AK/NonnullRefPtrVector.h>
+#include <LibPDF/Object.h>
+#include <LibPDF/Reader.h>
+#include <LibPDF/XRefTable.h>
+
+namespace PDF {
+
+class Document;
+
+// Parser for PDF syntax over a byte buffer. Only Document can construct one
+// (enforced by the Badge). Malformed input is generally handled with VERIFY.
+class Parser {
+public:
+    Parser(Badge<Document>, const ReadonlyBytes&);
+
+    // Returns true if the linearization heuristic does not fire and the file
+    // header parses.
+    bool perform_validation();
+
+    struct XRefTableAndTrailer {
+        XRefTable xref_table;
+        NonnullRefPtr<DictObject> trailer;
+    };
+    // Finds the last "startxref" in the input and parses the cross-reference
+    // table and trailer it points to.
+    XRefTableAndTrailer parse_last_xref_table_and_trailer();
+
+private:
+    bool parse_header();
+    XRefTable parse_xref_table();
+    NonnullRefPtr<DictObject> parse_file_trailer();
+
+    // Backwards scans used to locate the end-of-file marker and the
+    // "startxref" keyword; both return false if nothing was found.
+    bool navigate_to_before_eof_marker();
+    bool navigate_to_after_startxref();
+
+    // If the PDF is linearized, the first object will be the linearization
+    // parameter dictionary, and it will always occur within the first 1024 bytes.
+    // We do a very sloppy and context-free search for this object. A return value
+    // of true does not necessarily mean this PDF is linearized, but a return value
+    // of false is treated by perform_validation() as meaning it is not linearized.
+    // FIXME: false doesn't guarantee non-linearization, but we VERIFY the result!
+    bool sloppy_is_linearized();
+
+    // Returns the text of a '%' comment at the reader position, or an empty
+    // String if there is none.
+    String parse_comment();
+
+    // Value parsers. Each expects the reader to be on the first byte of the
+    // construct it parses.
+    Value parse_value();
+    Value parse_possible_indirect_value_or_ref();
+    NonnullRefPtr<IndirectValue> parse_indirect_value(int index, int generation);
+    Value parse_number();
+    NonnullRefPtr<NameObject> parse_name();
+    NonnullRefPtr<StringObject> parse_string();
+    String parse_literal_string();
+    String parse_hex_string();
+    NonnullRefPtr<ArrayObject> parse_array();
+    NonnullRefPtr<DictObject> parse_dict();
+    NonnullRefPtr<StreamObject> parse_stream(NonnullRefPtr<DictObject> dict);
+
+    // Non-consuming checks on the byte under the reader.
+    bool matches_eol() const;
+    bool matches_whitespace() const;
+    bool matches_number() const;
+
+    // Consuming helpers. consume(char) takes the expected character, not a
+    // byte count.
+    void consume_eol();
+    bool consume_whitespace();
+    char consume();
+    void consume(char);
+
+    Reader m_reader;
+};
+
+}
diff --git a/Userland/Libraries/LibPDF/Reader.h b/Userland/Libraries/LibPDF/Reader.h
new file mode 100644
index 0000000000..bc32416527
--- /dev/null
+++ b/Userland/Libraries/LibPDF/Reader.h
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2021, Matthew Olsson <mattco@serenityos.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#pragma once
+
+#include <AK/Debug.h>
+#include <AK/Function.h>
+#include <AK/ScopeGuard.h>
+
+namespace PDF {
+
+// A cursor over an immutable byte buffer that can walk in either direction.
+// In forwards mode move_by/read/peek advance towards the end of the buffer;
+// in backwards mode they advance towards the start, and matches(const char*)
+// compares its argument reversed so callers can still spell text in file order.
+class Reader {
+public:
+    explicit Reader(const ReadonlyBytes& bytes)
+        : m_bytes(bytes)
+    {
+    }
+
+    ALWAYS_INLINE const ReadonlyBytes& bytes() const { return m_bytes; }
+    ALWAYS_INLINE size_t offset() const { return m_offset; }
+
+    // True once the cursor has stepped off the end it is travelling towards.
+    bool done() const
+    {
+        if (m_forwards)
+            return offset() >= bytes().size();
+        return m_offset < 0;
+    }
+
+    // Number of bytes that can still be read in the current direction,
+    // including the byte under the cursor.
+    size_t remaining() const
+    {
+        if (done())
+            return 0;
+
+        // The byte under the cursor is still unread, so it counts in both
+        // directions (the backwards branch already includes it).
+        if (m_forwards)
+            return bytes().size() - offset();
+        return offset() + 1;
+    }
+
+    // Moves the cursor `count` bytes in the current direction, without any
+    // bounds check.
+    void move_by(size_t count)
+    {
+        if (m_forwards) {
+            m_offset += static_cast<ssize_t>(count);
+        } else {
+            m_offset -= static_cast<ssize_t>(count);
+        }
+    }
+
+    // Returns the byte under the cursor and advances by one.
+    char read()
+    {
+        auto value = m_bytes.at(m_offset);
+        move_by(1);
+        return static_cast<char>(value);
+    }
+
+    // Returns the byte `shift` positions ahead in the current direction
+    // without moving the cursor.
+    char peek(size_t shift = 0) const
+    {
+        auto signed_shift = static_cast<ssize_t>(shift);
+        auto offset = m_forwards ? m_offset + signed_shift : m_offset - signed_shift;
+        return static_cast<char>(m_bytes.at(offset));
+    }
+
+    // True if the byte under the cursor equals any of `elements`.
+    template<typename... T>
+    bool matches_any(T... elements) const
+    {
+        if (done())
+            return false;
+        auto ch = peek();
+        return ((ch == elements) || ...);
+    }
+
+    bool matches(char ch) const
+    {
+        return !done() && peek() == ch;
+    }
+
+    // True if the upcoming bytes spell `chars`. In backwards mode the bytes
+    // are compared against the reversed string, i.e. the cursor must be on
+    // the last character of the text as it appears in the buffer.
+    bool matches(const char* chars) const
+    {
+        String string(chars);
+        if (remaining() < string.length())
+            return false;
+
+        if (!m_forwards)
+            string = string.reverse();
+
+        for (size_t i = 0; i < string.length(); i++) {
+            if (peek(i) != string[i])
+                return false;
+        }
+
+        return true;
+    }
+
+    // Places the cursor at an absolute offset, which must be inside the buffer.
+    template<typename T = char>
+    void move_to(size_t offset)
+    {
+        VERIFY(offset < m_bytes.size());
+        m_offset = static_cast<ssize_t>(offset);
+    }
+
+    // Advances until the byte under the cursor equals `ch` or the reader is done.
+    void move_until(char ch)
+    {
+        while (!done() && peek() != ch)
+            move_by(1);
+    }
+
+    // Advances until `predicate` accepts the byte under the cursor or the
+    // reader is done.
+    void move_until(Function<bool(char)> predicate)
+    {
+        while (!done() && !predicate(peek()))
+            move_by(1);
+    }
+
+    ALWAYS_INLINE void move_while(Function<bool(char)> predicate)
+    {
+        move_until([&predicate](char t) { return !predicate(t); });
+    }
+
+    ALWAYS_INLINE void set_reading_forwards() { m_forwards = true; }
+    ALWAYS_INLINE void set_reading_backwards() { m_forwards = false; }
+
+    // A stack of saved cursor positions: save() pushes, load() pops and
+    // restores, discard() pops without restoring.
+    ALWAYS_INLINE void save() { m_saved_offsets.append(m_offset); }
+    ALWAYS_INLINE void load() { m_offset = m_saved_offsets.take_last(); }
+    ALWAYS_INLINE void discard() { m_saved_offsets.take_last(); }
+
+    // Writes up to ten bytes on either side of the cursor to the debug log.
+    void dump_state()
+    {
+        StringBuilder builder;
+        builder.append("Reader State Dump\n\n");
+
+        if (bytes().size() > 0) {
+            // Clamp the window without unsigned underflow, both near the
+            // start of the buffer and when the cursor has run off the front.
+            size_t current = m_offset < 0 ? 0 : static_cast<size_t>(m_offset);
+            size_t from = current > 10 ? current - 10 : 0;
+            size_t to = min(bytes().size() - 1, current + 10);
+
+            for (auto i = from; i <= to; i++) {
+                char value = static_cast<char>(bytes().at(i));
+                builder.appendff("{}: '{}' (value={:3d}) ", i, value, static_cast<u8>(value));
+                if (i == offset())
+                    builder.appendff(" <<< current location, forwards={}", m_forwards);
+                builder.append('\n');
+            }
+        }
+        builder.append('\n');
+
+        auto str = builder.to_string();
+        dbgputstr(str.characters(), str.length());
+    }
+
+private:
+    ReadonlyBytes m_bytes;
+    ssize_t m_offset { 0 };
+    Vector<ssize_t> m_saved_offsets;
+    bool m_forwards { true };
+};
+
+}
diff --git a/Userland/Libraries/LibPDF/XRefTable.h b/Userland/Libraries/LibPDF/XRefTable.h
new file mode 100644
index 0000000000..fe084218a3
--- /dev/null
+++ b/Userland/Libraries/LibPDF/XRefTable.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2021, Matthew Olsson <mattco@serenityos.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#pragma once
+
+#include <AK/Format.h>
+
+namespace PDF {
+
+// One cross-reference table entry.
+struct XRefEntry {
+    // Byte offset recorded for the object; -1 marks an entry that was never
+    // filled in (XRefTable::has_object() treats it as absent).
+    long byte_offset { -1L };
+    u16 generation_number { 0 };
+    // True for entries marked 'n' in the table, false for 'f'.
+    bool in_use { false };
+};
+
+// One subsection of a cross-reference table, as read from its
+// "<starting index> <count>" header and the entries that follow it.
+struct XRefSection {
+    // Index of the first entry in `entries`.
+    int starting_index;
+    // Entry count as declared in the subsection header.
+    int count;
+    Vector<XRefEntry> entries;
+};
+
+// Cross-reference table: a flat list of entries indexed by object number,
+// built up from one or more subsections.
+class XRefTable {
+public:
+    // Stores the entries of `section` at indices starting_index,
+    // starting_index + 1, and so on. Gaps below the subsection are filled with
+    // default (absent) entries, and an entry that already exists at an index
+    // is replaced, so subsections may arrive in any order.
+    void add_section(const XRefSection& section)
+    {
+        VERIFY(section.starting_index >= 0);
+        auto first_index = static_cast<size_t>(section.starting_index);
+        auto end_index = first_index + section.entries.size();
+
+        m_entries.ensure_capacity(end_index);
+        while (m_entries.size() < end_index)
+            m_entries.append(XRefEntry {});
+
+        // Write by index rather than appending, so entries end up under their
+        // own object numbers even if the table already extends past
+        // starting_index.
+        for (size_t i = 0; i < section.entries.size(); i++)
+            m_entries[first_index + i] = section.entries[i];
+    }
+
+    // True if `index` is within the table and its entry has a byte offset.
+    [[nodiscard]] ALWAYS_INLINE bool has_object(size_t index) const
+    {
+        return index < m_entries.size() && m_entries[index].byte_offset != -1;
+    }
+
+    // The accessors below require has_object(index).
+    [[nodiscard]] ALWAYS_INLINE long byte_offset_for_object(size_t index) const
+    {
+        VERIFY(has_object(index));
+        return m_entries[index].byte_offset;
+    }
+
+    [[nodiscard]] ALWAYS_INLINE u16 generation_number_for_object(size_t index) const
+    {
+        VERIFY(has_object(index));
+        return m_entries[index].generation_number;
+    }
+
+    [[nodiscard]] ALWAYS_INLINE bool is_object_in_use(size_t index) const
+    {
+        VERIFY(has_object(index));
+        return m_entries[index].in_use;
+    }
+
+private:
+    friend struct AK::Formatter<PDF::XRefTable>;
+
+    Vector<XRefEntry> m_entries;
+};
+
+}
+
+namespace AK {
+
+// Formats an XRefEntry as "XRefEntry { offset=.. generation=.. used=.. }".
+template<>
+struct Formatter<PDF::XRefEntry> : Formatter<StringView> {
+    void format(FormatBuilder& builder, const PDF::XRefEntry& entry)
+    {
+        auto description = String::formatted("XRefEntry {{ offset={} generation={} used={} }}",
+            entry.byte_offset,
+            entry.generation_number,
+            entry.in_use);
+        Formatter<StringView>::format(builder, description);
+    }
+};
+
+// Formats an XRefTable as a brace-delimited list with one entry per line.
+template<>
+struct Formatter<PDF::XRefTable> : Formatter<StringView> {
+    void format(FormatBuilder& format_builder, const PDF::XRefTable& table)
+    {
+        StringBuilder description;
+        description.append("XRefTable {");
+        for (size_t i = 0; i < table.m_entries.size(); i++)
+            description.appendff("\n  {}", table.m_entries[i]);
+        description.append("\n}");
+
+        Formatter<StringView>::format(format_builder, description.to_string());
+    }
+};
+
+}