summaryrefslogtreecommitdiff
path: root/Userland/Libraries/LibPDF/Parser.h
diff options
context:
space:
mode:
author Matthew Olsson <matthewcolsson@gmail.com> 2021-04-30 18:33:13 -0700
committer Andreas Kling <kling@serenityos.org> 2021-05-10 10:32:39 +0200
commit 72f693e9ed6b106e0760a3b1869f92d3267ddd41 (patch)
tree f6123c5cf822299d66896b457dfef72f46d9c8cd /Userland/Libraries/LibPDF/Parser.h
parent a8f5b6aaa3405e4b4eda49888ee9da8572d5473d (diff)
download serenity-72f693e9ed6b106e0760a3b1869f92d3267ddd41.zip
LibPDF: Add a basic parser and Document structure
This commit adds a parser as well as the Reader class, which serves as a utility to aid in reading the PDF both forwards and in reverse. The parser currently is capable of reading xref tables, as well as all values. We don't really do anything with any of this information, however.
Diffstat (limited to 'Userland/Libraries/LibPDF/Parser.h')
-rw-r--r-- Userland/Libraries/LibPDF/Parser.h 72
1 files changed, 72 insertions, 0 deletions
diff --git a/Userland/Libraries/LibPDF/Parser.h b/Userland/Libraries/LibPDF/Parser.h
new file mode 100644
index 0000000000..c983628e49
--- /dev/null
+++ b/Userland/Libraries/LibPDF/Parser.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2021, Matthew Olsson <mattco@serenityos.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#pragma once
+
+#include <AK/NonnullRefPtrVector.h>
+#include <LibPDF/Object.h>
+#include <LibPDF/Reader.h>
+#include <LibPDF/XRefTable.h>
+
+namespace PDF {
+
+class Document;
+
+class Parser { // Per the introducing commit: reads xref tables and all PDF values. Declarations only; behavior lives in the .cpp.
+public:
+ Parser(Badge<Document>, const ReadonlyBytes&); // NOTE(review): Badge<Document> presumably restricts construction to Document; the bytes presumably back m_reader — confirm.
+
+ bool perform_validation(); // NOTE(review): what is validated and what the bool means are not visible in this header — confirm.
+
+ struct XRefTableAndTrailer { // Bundles the two results returned by parse_last_xref_table_and_trailer().
+ XRefTable xref_table;
+ NonnullRefPtr<DictObject> trailer;
+ };
+ XRefTableAndTrailer parse_last_xref_table_and_trailer(); // Returns the xref table and trailer dictionary together.
+
+private:
+ bool parse_header();
+ XRefTable parse_xref_table();
+ NonnullRefPtr<DictObject> parse_file_trailer();
+
+ bool navigate_to_before_eof_marker(); // NOTE(review): presumably repositions m_reader; meaning of the bool is not shown here — confirm.
+ bool navigate_to_after_startxref(); // Same caveat as the line above — confirm against the implementation.
+
+ // If the PDF is linearized, the first object will be the linearization
+ // parameter dictionary, and it will always occur within the first 1024 bytes.
+ // We do a very sloppy and context-free search for this object, so a return
+ // value of true does not necessarily mean this PDF is linearized. A return
+ // value of false is intended to mean this PDF is not linearized.
+ // FIXME: false doesn't actually guarantee non-linearization, yet we VERIFY the result!
+ bool sloppy_is_linearized();
+
+ String parse_comment();
+
+ Value parse_value(); // Value parsers (through parse_stream): each return type shows which value or object it yields.
+ Value parse_possible_indirect_value_or_ref();
+ NonnullRefPtr<IndirectValue> parse_indirect_value(int index, int generation); // NOTE(review): index/generation presumably identify the indirect object — confirm.
+ Value parse_number();
+ NonnullRefPtr<NameObject> parse_name();
+ NonnullRefPtr<StringObject> parse_string();
+ String parse_literal_string();
+ String parse_hex_string();
+ NonnullRefPtr<ArrayObject> parse_array();
+ NonnullRefPtr<DictObject> parse_dict();
+ NonnullRefPtr<StreamObject> parse_stream(NonnullRefPtr<DictObject> dict); // NOTE(review): dict is presumably the stream's already-parsed dictionary — confirm.
+
+ bool matches_eol() const; // The matches_*() checks are const member functions.
+ bool matches_whitespace() const;
+ bool matches_number() const;
+
+ void consume_eol(); // The consume*() helpers are non-const; presumably they advance m_reader — confirm.
+ bool consume_whitespace(); // NOTE(review): bool presumably reports whether any whitespace was consumed — confirm.
+ char consume(); // Returns a char; presumably the one just consumed — confirm.
+ void consume(char); // NOTE(review): presumably expects the given char next; mismatch handling is in the .cpp.
+
+ Reader m_reader; // Reader: utility for reading the PDF both forwards and in reverse (per the commit message).
+};
+
+}