summaryrefslogtreecommitdiff
path: root/Userland/Libraries/LibXML
diff options
context:
space:
mode:
author: Ali Mohammad Pur <ali.mpfard@gmail.com> 2022-03-26 21:32:57 +0430
committer: Andreas Kling <kling@serenityos.org> 2022-03-28 23:11:48 +0200
commit: 67357fe984c19b724c7171959c4b1a6101f9047a (patch)
tree: f8285be3fdbe2ff7f84bdc3e52c015d0bf81c17b /Userland/Libraries/LibXML
parent: 06cedf5baee5d77b11f2d472a4ba934e4a6cb6c1 (diff)
download: serenity-67357fe984c19b724c7171959c4b1a6101f9047a.zip
LibXML: Add a fairly basic XML parser
Currently this can parse XML and resolve external resources/references, and read a DTD (but not apply or verify its rules). That's good enough for _most_ XHTML documents as the HTML 5 spec enforces its own rules about document well-formedness, and does not make use of XML DTDs (aside from a list of predefined entities). An accompanying `xml` utility is provided that can read and dump XML documents, and can also run the XML conformance test suite.
Diffstat (limited to 'Userland/Libraries/LibXML')
-rw-r--r-- Userland/Libraries/LibXML/CMakeLists.txt | 7
-rw-r--r-- Userland/Libraries/LibXML/DOM/Document.h | 53
-rw-r--r-- Userland/Libraries/LibXML/DOM/DocumentTypeDeclaration.h | 138
-rw-r--r-- Userland/Libraries/LibXML/DOM/Node.cpp | 54
-rw-r--r-- Userland/Libraries/LibXML/DOM/Node.h | 40
-rw-r--r-- Userland/Libraries/LibXML/Forward.h | 15
-rw-r--r-- Userland/Libraries/LibXML/FundamentalTypes.h | 16
-rw-r--r-- Userland/Libraries/LibXML/Parser/Parser.cpp | 1780
-rw-r--r-- Userland/Libraries/LibXML/Parser/Parser.h | 223
9 files changed, 2326 insertions, 0 deletions
diff --git a/Userland/Libraries/LibXML/CMakeLists.txt b/Userland/Libraries/LibXML/CMakeLists.txt
new file mode 100644
index 0000000000..8ab9576e31
--- /dev/null
+++ b/Userland/Libraries/LibXML/CMakeLists.txt
@@ -0,0 +1,7 @@
+# Translation units compiled into LibXML.
+set(SOURCES
+ Parser/Parser.cpp
+ DOM/Node.cpp
+)
+
+# Declare the LibXML target and link it against LibC.
+# NOTE(review): the second argument (`xml`) is presumably the short/output library name — confirm against serenity_lib().
+serenity_lib(LibXML xml)
+target_link_libraries(LibXML LibC)
diff --git a/Userland/Libraries/LibXML/DOM/Document.h b/Userland/Libraries/LibXML/DOM/Document.h
new file mode 100644
index 0000000000..1b33661a04
--- /dev/null
+++ b/Userland/Libraries/LibXML/DOM/Document.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2022, Ali Mohammad Pur <mpfard@serenityos.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#pragma once
+
+#include <AK/HashMap.h>
+#include <AK/NonnullOwnPtr.h>
+#include <LibXML/DOM/DocumentTypeDeclaration.h>
+#include <LibXML/DOM/Node.h>
+#include <LibXML/Forward.h>
+
+namespace XML {
+
+// XML version a document is treated as. The parser selects Version10 for a "1.0" declaration or when no XML declaration is present.
+enum class Version {
+ Version10,
+ Version11,
+};
+
+// Contents of a parsed <!DOCTYPE ...> declaration.
+struct Doctype {
+ String type; // The Name that follows '<!DOCTYPE'.
+ Vector<MarkupDeclaration> markup_declarations; // External-subset declarations (when resolved) followed by internal-subset declarations.
+ Optional<ExternalID> external_id; // Present only when the declaration carries an ExternalID.
+};
+
+// Result of a successful parse: owns the root node and carries the doctype, processing instructions and XML version.
+class Document {
+public:
+ explicit Document(NonnullOwnPtr<Node> root, Optional<Doctype> doctype, HashMap<Name, String> processing_instructions, Version version)
+ : m_root(move(root))
+ , m_processing_instructions(move(processing_instructions))
+ , m_version(version)
+ , m_explicit_doctype(move(doctype))
+ {
+ }
+
+ // Root node of the tree (never null, as it is held in a NonnullOwnPtr).
+ Node& root() { return *m_root; }
+ Node const& root() const { return *m_root; }
+
+ // Processing instructions keyed by target name; a later instruction with the same target replaces an earlier one.
+ HashMap<Name, String> const& processing_instructions() const { return m_processing_instructions; }
+
+ Version version() const { return m_version; }
+
+ // Empty when the source had no doctype declaration.
+ Optional<Doctype> const& doctype() const { return m_explicit_doctype; }
+
+private:
+ NonnullOwnPtr<Node> m_root;
+ HashMap<Name, String> m_processing_instructions;
+ Version m_version;
+ Optional<Doctype> m_explicit_doctype;
+};
+}
diff --git a/Userland/Libraries/LibXML/DOM/DocumentTypeDeclaration.h b/Userland/Libraries/LibXML/DOM/DocumentTypeDeclaration.h
new file mode 100644
index 0000000000..1a2599d021
--- /dev/null
+++ b/Userland/Libraries/LibXML/DOM/DocumentTypeDeclaration.h
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2022, Ali Mohammad Pur <mpfard@serenityos.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#pragma once
+
+#include <AK/HashTable.h>
+#include <AK/String.h>
+#include <AK/Variant.h>
+#include <AK/Vector.h>
+#include <LibXML/FundamentalTypes.h>
+
+namespace XML {
+
+// Models an XML element type declaration: the element's name and its content specification.
+struct ElementDeclaration {
+ // Payload-free content spec alternative.
+ struct Empty {
+ };
+ // Payload-free content spec alternative.
+ struct Any {
+ };
+ // Mixed content: a set of element names plus a `many` flag.
+ // NOTE(review): `many` presumably records the trailing '*' of the Mixed production — confirm against the parser.
+ struct Mixed {
+ HashTable<Name> types;
+ bool many;
+ };
+ // Element-only content: nested choices/sequences, each tagged with a repetition qualifier.
+ struct Children {
+ struct Entry;
+ // NOTE(review): presumably corresponds to no suffix, '?', '*' and '+' respectively — confirm against the parser.
+ enum class Qualifier {
+ ExactlyOnce,
+ Optional,
+ Any,
+ OneOrMore,
+ };
+
+ struct Choice {
+ Vector<Entry> entries;
+ Qualifier qualifier;
+ };
+ struct Sequence {
+ Vector<Entry> entries;
+ Qualifier qualifier;
+ };
+
+ // One item of a choice/sequence: either a plain name or a nested choice/sequence.
+ struct Entry {
+ Variant<Name, Choice, Sequence> sub_entries;
+ Qualifier qualifier;
+ };
+
+ Variant<Choice, Sequence> contents;
+ Qualifier qualifier;
+ };
+ using ContentSpec = Variant<Empty, Any, Mixed, Children>;
+
+ Name type;
+ ContentSpec content_spec;
+};
+
+// Models an XML attribute-list declaration: the element name it applies to and its attribute definitions.
+struct AttributeListDeclaration {
+ enum class StringType {
+ CData,
+ };
+ enum class TokenizedType {
+ ID,
+ IDRef,
+ IDRefs,
+ Entity,
+ Entities,
+ NMToken,
+ NMTokens,
+ };
+ struct NotationType {
+ HashTable<Name> names;
+ };
+ struct Enumeration {
+ // FIXME: NMToken
+ HashTable<String> tokens;
+ };
+ // The declared type of an attribute is exactly one of these alternatives.
+ using Type = Variant<StringType, TokenizedType, NotationType, Enumeration>;
+
+ // Default-declaration alternatives; Fixed and DefaultValue carry the literal value.
+ struct Required {
+ };
+ struct Implied {
+ };
+ struct Fixed {
+ String value;
+ };
+ struct DefaultValue {
+ String value;
+ };
+
+ using Default = Variant<Required, Implied, Fixed, DefaultValue>;
+
+ // A single attribute definition: name, declared type and default declaration.
+ struct Definition {
+ Name name;
+ Type type;
+ Default default_;
+ };
+ Name type;
+ Vector<Definition> attributes;
+};
+
+// Strong-type wrapper around the public identifier literal.
+struct PublicID {
+ String public_literal;
+};
+
+// Strong-type wrapper around the system identifier literal.
+struct SystemID {
+ String system_literal;
+};
+
+// External identifier: a mandatory system ID with an optional public ID.
+struct ExternalID {
+ Optional<PublicID> public_id;
+ SystemID system_id;
+};
+
+// Externally-defined entity: its external ID plus an optional notation name.
+// NOTE(review): `notation` presumably holds the NDATA name of an unparsed entity — confirm against the parser.
+struct EntityDefinition {
+ ExternalID id;
+ Optional<Name> notation;
+};
+
+// Entity declaration whose definition is either a literal string value or an external EntityDefinition.
+// NOTE(review): "GE" presumably stands for "general entity" — confirm.
+struct GEDeclaration {
+ Name name;
+ Variant<String, EntityDefinition> definition;
+};
+
+// Entity declaration whose definition is either a literal string value or an ExternalID.
+// NOTE(review): "PE" presumably stands for "parameter entity" — confirm.
+struct PEDeclaration {
+ Name name;
+ Variant<String, ExternalID> definition;
+};
+
+using EntityDeclaration = Variant<GEDeclaration, PEDeclaration>;
+
+// Notation declaration: a name identified by either a full ExternalID or a bare PublicID.
+struct NotationDeclaration {
+ Name name;
+ Variant<ExternalID, PublicID> notation;
+};
+
+using MarkupDeclaration = Variant<ElementDeclaration, AttributeListDeclaration, EntityDeclaration, NotationDeclaration>;
+}
diff --git a/Userland/Libraries/LibXML/DOM/Node.cpp b/Userland/Libraries/LibXML/DOM/Node.cpp
new file mode 100644
index 0000000000..df7ec1d297
--- /dev/null
+++ b/Userland/Libraries/LibXML/DOM/Node.cpp
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2022, Ali Mohammad Pur <mpfard@serenityos.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include <AK/HashMap.h>
+#include <LibXML/DOM/Node.h>
+
+namespace XML {
+
+// Deep structural equality: both nodes must hold the same alternative and match on its contents.
+// Elements compare name, attributes (irrespective of iteration order) and children (in order); `parent` is not compared.
+bool Node::operator==(Node const& other) const
+{
+    return content.visit(
+        [&other](Text const& lhs) -> bool {
+            auto const* rhs = other.content.get_pointer<Text>();
+            return rhs && lhs.builder.string_view() == rhs->builder.string_view();
+        },
+        [&other](Comment const& lhs) -> bool {
+            auto const* rhs = other.content.get_pointer<Comment>();
+            return rhs && lhs.text == rhs->text;
+        },
+        [&other](Element const& lhs) -> bool {
+            auto const* rhs = other.content.get_pointer<Element>();
+            if (!rhs || lhs.name != rhs->name)
+                return false;
+
+            // Same attribute count plus every left-hand attribute found on the right implies equal attribute maps.
+            if (lhs.attributes.size() != rhs->attributes.size())
+                return false;
+            for (auto const& attribute : lhs.attributes) {
+                auto match = rhs->attributes.find(attribute.key);
+                if (match == rhs->attributes.end() || match->value != attribute.value)
+                    return false;
+            }
+
+            auto const child_count = lhs.children.size();
+            if (child_count != rhs->children.size())
+                return false;
+            for (size_t index = 0; index < child_count; ++index) {
+                if (lhs.children[index] != rhs->children[index])
+                    return false;
+            }
+            return true;
+        });
+}
+
+}
diff --git a/Userland/Libraries/LibXML/DOM/Node.h b/Userland/Libraries/LibXML/DOM/Node.h
new file mode 100644
index 0000000000..1394538a95
--- /dev/null
+++ b/Userland/Libraries/LibXML/DOM/Node.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2022, Ali Mohammad Pur <mpfard@serenityos.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#pragma once
+
+#include <AK/NonnullOwnPtrVector.h>
+#include <AK/String.h>
+#include <AK/Variant.h>
+#include <AK/Vector.h>
+#include <LibXML/FundamentalTypes.h>
+
+namespace XML {
+
+// A single parsed attribute: its name and its value with references already resolved.
+struct Attribute {
+ Name name;
+ String value;
+};
+
+// A node of the parsed tree: text, a comment, or an element.
+struct Node {
+ struct Text {
+ StringBuilder builder; // The parser appends consecutive text runs to the same builder.
+ };
+ struct Comment {
+ String text;
+ };
+ struct Element {
+ Name name;
+ HashMap<Name, String> attributes;
+ NonnullOwnPtrVector<Node> children; // Children are owned by their parent element.
+ };
+
+ // Deep comparison of `content`; `parent` does not take part.
+ bool operator==(Node const&) const;
+
+ Variant<Text, Comment, Element> content;
+ // Non-owning. Only assigned by Parser::enter_node() for non-root nodes it enters; nullptr otherwise.
+ Node* parent { nullptr };
+};
+}
diff --git a/Userland/Libraries/LibXML/Forward.h b/Userland/Libraries/LibXML/Forward.h
new file mode 100644
index 0000000000..7cc6e764de
--- /dev/null
+++ b/Userland/Libraries/LibXML/Forward.h
@@ -0,0 +1,15 @@
+/*
+ * Copyright (c) 2022, Ali Mohammad Pur <mpfard@serenityos.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#pragma once
+
+// Forward declarations of the LibXML types, for headers that only need the names.
+namespace XML {
+class Parser;
+class Document;
+struct Node;
+struct Attribute;
+struct Listener;
+}
diff --git a/Userland/Libraries/LibXML/FundamentalTypes.h b/Userland/Libraries/LibXML/FundamentalTypes.h
new file mode 100644
index 0000000000..e1900f091a
--- /dev/null
+++ b/Userland/Libraries/LibXML/FundamentalTypes.h
@@ -0,0 +1,16 @@
+/*
+ * Copyright (c) 2022, Ali Mohammad Pur <mpfard@serenityos.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#pragma once
+
+#include <AK/String.h>
+
+namespace XML {
+
+// FIXME: Maybe extend this to something more sophisticated?
+using Name = String;
+
+}
diff --git a/Userland/Libraries/LibXML/Parser/Parser.cpp b/Userland/Libraries/LibXML/Parser/Parser.cpp
new file mode 100644
index 0000000000..0940d76fab
--- /dev/null
+++ b/Userland/Libraries/LibXML/Parser/Parser.cpp
@@ -0,0 +1,1780 @@
+/*
+ * Copyright (c) 2022, Ali Mohammad Pur <mpfard@serenityos.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include <LibXML/DOM/Document.h>
+#include <LibXML/Parser/Parser.h>
+
+// Inclusive [start, end] range of code points. The consteval constructor means values can only be built at
+// compile time; they are used below as non-type template arguments of ranges_for_search.
+struct Range {
+ consteval Range(u32 start, u32 end)
+ : start(start)
+ , end(end)
+ {
+ }
+
+ u32 start;
+ u32 end;
+};
+
+// Compile-time set of ranges usable as a predicate: operator() / contains() report whether a value
+// lies inside any of the ranges (bounds inclusive).
+template<auto... ranges>
+struct ranges_for_search {
+ auto contains(u32 value) const
+ {
+ // Fold expression: true as soon as any range covers `value`.
+ return ((value >= ranges.start && value <= ranges.end) || ...);
+ }
+
+ bool operator()(u32 value) const
+ {
+ return contains(value);
+ }
+
+ // Returns a new set made of this set's ranges followed by `ranges_to_include`.
+ template<auto... ranges_to_include>
+ consteval auto with() const
+ {
+ return ranges_for_search<ranges..., ranges_to_include...>();
+ }
+
+ // Same as with(), but takes the extra ranges from another ranges_for_search instance.
+ template<auto... ranges_to_include>
+ consteval auto unify(ranges_for_search<ranges_to_include...> const&) const
+ {
+ return ranges_for_search<ranges..., ranges_to_include...>();
+ }
+};
+
+// Compile-time copy of the first Count - 1 entries of a Count-sized array.
+// NOTE(review): the dropped last entry is presumably the NUL terminator of a string literal — confirm at call sites.
+template<size_t Count, typename Element>
+struct StringSet {
+ consteval StringSet(Element const (&entries)[Count])
+ {
+ for (size_t i = 0; i < Count - 1; ++i)
+ elements[i] = entries[i];
+ }
+
+ consteval auto operator[](size_t i) const { return elements[i]; }
+
+ Element elements[Count - 1];
+};
+
+// Turns a StringSet into a ranges_for_search containing one single-value range per stored element.
+template<StringSet chars>
+consteval static auto set_to_search()
+{
+ // The immediately-invoked templated lambda expands the index sequence into one Range per element.
+ return ([&]<auto... Ix>(IndexSequence<Ix...>) {
+ return ranges_for_search<Range(chars[Ix], chars[Ix])...>();
+ }(MakeIndexSequence<array_size(chars.elements)>()));
+}
+
+namespace XML {
+
+size_t Parser::s_debug_indent_level { 0 };
+
+// Attaches `node` to the tree: as a child of the currently entered element, or as the document root
+// (which also becomes the entered node) when nothing has been entered yet.
+void Parser::append_node(NonnullOwnPtr<Node> node)
+{
+    if (!m_entered_node) {
+        m_root_node = move(node);
+        m_entered_node = m_root_node.ptr();
+        return;
+    }
+
+    auto& parent_element = m_entered_node->content.get<Node::Element>();
+    parent_element.children.append(move(node));
+}
+
+// Delivers a run of character data. With a listener attached the text is only forwarded to it.
+// Otherwise it is merged into the entered element's trailing text child, or added as a new text node.
+void Parser::append_text(String text)
+{
+    if (m_listener) {
+        m_listener->text(text);
+        return;
+    }
+
+    auto make_text_node = [&text] {
+        Node::Text fresh;
+        fresh.builder.append(text);
+        return make<Node>(move(fresh));
+    };
+
+    // Nothing entered yet: the text node itself becomes the root.
+    if (!m_entered_node) {
+        m_root_node = make_text_node();
+        return;
+    }
+
+    m_entered_node->content.visit(
+        [&](Node::Element& element) {
+            // Coalesce with a trailing text child so adjacent runs form a single node.
+            auto* trailing_text = element.children.is_empty()
+                ? nullptr
+                : element.children.last().content.get_pointer<Node::Text>();
+            if (trailing_text) {
+                trailing_text->builder.append(text);
+                return;
+            }
+            element.children.append(make_text_node());
+        },
+        [&](auto&) {
+            // Can't enter a text or comment node.
+            VERIFY_NOT_REACHED();
+        });
+}
+
+// Delivers a comment: forwarded to the listener when one is attached, otherwise appended as a
+// Comment child of the entered element.
+void Parser::append_comment(String text)
+{
+    if (m_listener) {
+        m_listener->comment(text);
+        return;
+    }
+
+    // Comments seen before any element is entered (e.g. in the prolog) have nowhere to go and are dropped.
+    if (!m_entered_node)
+        return;
+
+    m_entered_node->content.visit(
+        [&text](Node::Element& element) {
+            auto comment_node = make<Node>(Node::Comment { move(text) });
+            element.children.append(move(comment_node));
+        },
+        [](auto&) {
+            // Can't enter a text or comment node.
+            VERIFY_NOT_REACHED();
+        });
+}
+
+// Makes `node` the current insertion point, notifying the listener (if any) that an element started.
+// Every node except the root gets its parent set to the previously entered node.
+void Parser::enter_node(Node& node)
+{
+    if (m_listener) {
+        auto& entered_element = node.content.get<Node::Element>();
+        m_listener->element_start(entered_element.name, entered_element.attributes);
+    }
+
+    bool const is_root = &node == m_root_node.ptr();
+    if (!is_root)
+        node.parent = m_entered_node;
+    m_entered_node = &node;
+}
+
+// Closes the currently entered element: notifies the listener (if any) and moves the insertion
+// point back to the element's parent.
+void Parser::leave_node()
+{
+    auto* leaving = m_entered_node;
+    if (m_listener)
+        m_listener->element_end(leaving->content.get<Node::Element>().name);
+
+    m_entered_node = leaving->parent;
+}
+
+// Parses the whole source into a Document. On failure, returns the first entry of m_parse_errors
+// when there is one, otherwise the error produced by parse_internal().
+ErrorOr<Document, ParseError> Parser::parse()
+{
+    auto outcome = parse_internal();
+    if (outcome.is_error()) {
+        if (!m_parse_errors.is_empty())
+            return m_parse_errors.take_first();
+        return outcome.release_error();
+    }
+
+    return Document {
+        m_root_node.release_nonnull(),
+        move(m_doctype),
+        move(m_processing_instructions),
+        m_version,
+    };
+}
+
+// Event-style parse: reports document start/end, elements, text, comments and errors to `listener`
+// and returns the parse result without handing out a Document.
+ErrorOr<void, ParseError> Parser::parse_with_listener(Listener& listener)
+{
+ m_listener = &listener;
+ // Detach the listener again on every exit path.
+ ScopeGuard unset_listener { [this] { m_listener = nullptr; } };
+ m_listener->document_start();
+ auto result = parse_internal();
+ if (result.is_error())
+ m_listener->error(result.error());
+ // document_end() is reported even when parsing failed.
+ m_listener->document_end();
+ // Drop whatever element nodes were built along the way.
+ m_root_node.clear();
+ return result;
+}
+
+// 2.3.3. S, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-S
+// Consumes a run of XML whitespace. With Required::Yes an empty run is a parse error; otherwise an empty run is accepted.
+ErrorOr<void, ParseError> Parser::skip_whitespace(Required required)
+{
+ auto rollback = rollback_point();
+ auto rule = enter_rule();
+
+ // S ::= (#x20 | #x9 | #xD | #xA)+
+ auto matched = m_lexer.consume_while(is_any_of("\x20\x09\x0d\x0a"));
+ if (required == Required::Yes && matched.is_empty())
+ return parse_error(m_lexer.tell(), "Expected whitespace");
+
+ rollback.disarm();
+ return {};
+}
+
+// 2.2.a. RestrictedChar, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-RestrictedChar
+constexpr static auto s_restricted_characters = ranges_for_search<Range(0x1, 0x8), Range(0xb, 0xc), Range(0xe, 0x1f), Range(0x7f, 0x84), Range(0x86, 0x9f)>();
+
+// 2.1.1. Document, https://www.w3.org/TR/2006/REC-xml11-20060816/#sec-well-formed
+// Top-level driver: parses the prolog, the root element and any trailing Misc items, then rejects
+// restricted characters in the consumed input and any unconsumed input.
+ErrorOr<void, ParseError> Parser::parse_internal()
+{
+ auto rule = enter_rule();
+
+ // document ::= ( prolog element Misc* ) - ( Char* RestrictedChar Char* )
+ TRY(parse_prolog());
+ TRY(parse_element());
+ while (true) {
+ if (auto result = parse_misc(); result.is_error())
+ break;
+ }
+
+ // The RestrictedChar exclusion is checked after the fact, over everything consumed so far.
+ auto matched_source = m_source.substring_view(0, m_lexer.tell());
+ if (auto it = find_if(matched_source.begin(), matched_source.end(), s_restricted_characters); !it.is_end()) {
+ return parse_error(
+ it.index(),
+ String::formatted("Invalid character #{:x} used in document", *it));
+ }
+
+ if (!m_lexer.is_eof())
+ return parse_error(m_lexer.tell(), "Garbage after document");
+
+ return {};
+}
+
+// Tries to consume the literal `expected`. A mismatch is reported as a parse error only when
+// m_options.treat_errors_as_fatal is set; otherwise the function still returns success.
+ErrorOr<void, ParseError> Parser::expect(StringView expected)
+{
+ auto rollback = rollback_point();
+
+ if (!m_lexer.consume_specific(expected)) {
+ if (m_options.treat_errors_as_fatal)
+ return parse_error(m_lexer.tell(), String::formatted("Expected '{}'", expected));
+ }
+
+ rollback.disarm();
+ return {};
+}
+
+// Expects the next character to satisfy `predicate` and returns a view of what was consumed.
+// `description` is only used in the error message. A mismatch is an error only when
+// m_options.treat_errors_as_fatal is set.
+template<typename Pred>
+requires(IsCallableWithArguments<Pred, char>) ErrorOr<StringView, ParseError> Parser::expect(Pred predicate, StringView description)
+{
+ auto rollback = rollback_point();
+ auto start = m_lexer.tell();
+ if (!m_lexer.next_is(predicate)) {
+ if (m_options.treat_errors_as_fatal)
+ return parse_error(m_lexer.tell(), String::formatted("Expected {}", description));
+ }
+
+ // NOTE(review): this runs even after a non-fatal mismatch, so the lexer still advances in that case — confirm this is intended.
+ m_lexer.ignore();
+ rollback.disarm();
+ return m_source.substring_view(start, m_lexer.tell() - start);
+}
+
+// Consumes the longest run of characters satisfying `predicate` and returns a view of it.
+// An empty run is an error only when m_options.treat_errors_as_fatal is set; otherwise an empty view is returned.
+template<typename Pred>
+requires(IsCallableWithArguments<Pred, char>) ErrorOr<StringView, ParseError> Parser::expect_many(Pred predicate, StringView description)
+{
+ auto rollback = rollback_point();
+ auto start = m_lexer.tell();
+ while (m_lexer.next_is(predicate)) {
+ // Explicit end-of-input guard in addition to the predicate check.
+ if (m_lexer.is_eof())
+ break;
+ m_lexer.ignore();
+ }
+
+ if (m_lexer.tell() == start) {
+ if (m_options.treat_errors_as_fatal) {
+ return parse_error(m_lexer.tell(), String::formatted("Expected {}", description));
+ }
+ }
+
+ rollback.disarm();
+ return m_source.substring_view(start, m_lexer.tell() - start);
+}
+
+// 2.8.22. Prolog, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-prolog
+// Parses the optional XML declaration, any Misc items and an optional doctype declaration.
+// As written this never returns an error: every part is optional and failures of sub-rules are swallowed.
+ErrorOr<void, ParseError> Parser::parse_prolog()
+{
+ auto rollback = rollback_point();
+ auto rule = enter_rule();
+
+ // prolog ::= XMLDecl Misc* (doctypedecl Misc*)?
+ // The following is valid in XML 1.0.
+ // prolog ::= XMLDecl? Misc* (doctypedecl Misc*)?
+ // A missing or unparseable XML declaration makes the document be treated as XML 1.0.
+ if (auto result = parse_xml_decl(); result.is_error()) {
+ m_version = Version::Version10;
+ m_in_compatibility_mode = true;
+ }
+ auto accept = accept_rule();
+
+ while (true) {
+ if (auto result = parse_misc(); result.is_error())
+ break;
+ }
+
+ if (auto result = parse_doctype_decl(); !result.is_error()) {
+ while (true) {
+ if (auto result = parse_misc(); result.is_error())
+ break;
+ }
+ }
+
+ rollback.disarm();
+ return {};
+}
+
+// 2.8.23. XMLDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-XMLDecl
+// Parses the XML declaration. VersionInfo is mandatory; the encoding and standalone declarations are optional.
+ErrorOr<void, ParseError> Parser::parse_xml_decl()
+{
+ auto rollback = rollback_point();
+ auto rule = enter_rule();
+
+ // XMLDecl::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
+
+ TRY(expect("<?xml"));
+ auto accept = accept_rule();
+
+ TRY(parse_version_info());
+ // Both declarations are optional, so their failures are deliberately ignored.
+ (void)parse_encoding_decl();
+ (void)parse_standalone_document_decl();
+ TRY(skip_whitespace());
+ TRY(expect("?>"));
+
+ rollback.disarm();
+ return {};
+}
+
+// 2.8.24. VersionInfo, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-VersionInfo
+// Parses the VersionInfo part of the XML declaration and records the declared version in m_version.
+// "1.0" selects Version10 and enables compatibility mode; anything else selects Version11, and is
+// additionally a parse error when it is not "1.1" and m_options.treat_errors_as_fatal is set.
+ErrorOr<void, ParseError> Parser::parse_version_info()
+{
+    auto rollback = rollback_point();
+    auto rule = enter_rule();
+
+    // VersionInfo ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"')
+    TRY(skip_whitespace(Required::Yes));
+    TRY(expect("version"));
+    auto accept = accept_rule();
+
+    TRY(parse_eq());
+    // Check for an opening quote, then step back so consume_quoted_string() sees it.
+    TRY(expect(is_any_of("'\""), "one of ' or \""));
+    m_lexer.retreat();
+
+    auto version_string = m_lexer.consume_quoted_string();
+    if (version_string == "1.0") {
+        // FIXME: Compatibility mode, figure out which rules are different in XML 1.0.
+        m_version = Version::Version10;
+        m_in_compatibility_mode = true;
+    } else {
+        if (version_string != "1.1" && m_options.treat_errors_as_fatal)
+            return parse_error(m_lexer.tell(), String::formatted("Expected '1.1', found '{}'", version_string));
+        // Must stay inside this branch: assigning Version11 unconditionally would overwrite the 1.0 result above.
+        m_version = Version::Version11;
+    }
+
+    rollback.disarm();
+    return {};
+}
+
+// 2.8.25. Eq, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Eq
+// Parses an '=' sign optionally surrounded by whitespace.
+ErrorOr<void, ParseError> Parser::parse_eq()
+{
+ auto rollback = rollback_point();
+ auto rule = enter_rule();
+
+ // Eq ::= S? '=' S?
+ auto accept = accept_rule();
+ TRY(skip_whitespace());
+ TRY(expect("="));
+ TRY(skip_whitespace());
+ rollback.disarm();
+ return {};
+}
+
+// 4.3.3.80. EncodingDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-EncodingDecl
+// Parses the encoding declaration and stores the quoted name in m_encoding; the name is not validated or acted upon here.
+ErrorOr<void, ParseError> Parser::parse_encoding_decl()
+{
+ auto rollback = rollback_point();
+ auto rule = enter_rule();
+
+ // EncodingDecl ::= S 'encoding' Eq ('"' EncName '"' | "'" EncName "'" )
+ TRY(skip_whitespace(Required::Yes));
+ TRY(expect("encoding"));
+ auto accept = accept_rule();
+
+ TRY(parse_eq());
+ // Check for an opening quote, then step back so consume_quoted_string() sees it.
+ TRY(expect(is_any_of("'\""), "one of ' or \""));
+ m_lexer.retreat();
+
+ // FIXME: Actually do something with this encoding.
+ m_encoding = m_lexer.consume_quoted_string();
+
+ rollback.disarm();
+ return {};
+}
+
+// 2.9.32 SDDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#sec-rmd
+// Parses the standalone document declaration and records the result in m_standalone.
+// Returns a ParseError when the declaration is absent or malformed, or when the quoted value is
+// neither "yes" nor "no".
+ErrorOr<void, ParseError> Parser::parse_standalone_document_decl()
+{
+    auto rollback = rollback_point();
+    auto rule = enter_rule();
+
+    // SDDecl ::= S 'standalone' Eq (("'" ('yes' | 'no') "'") | ('"' ('yes' | 'no') '"'))
+    TRY(skip_whitespace(Required::Yes));
+    TRY(expect("standalone"));
+    auto accept = accept_rule();
+
+    // The grammar requires Eq between the keyword and the quoted value; without it `standalone="yes"` can never match.
+    TRY(parse_eq());
+    // Check for an opening quote, then step back so consume_quoted_string() sees it.
+    TRY(expect(is_any_of("'\""), "one of ' or \""));
+    m_lexer.retreat();
+
+    auto value = m_lexer.consume_quoted_string();
+    if (!value.is_one_of("yes", "no"))
+        return parse_error(m_lexer.tell() - value.length(), "Expected one of 'yes' or 'no'");
+
+    m_standalone = value == "yes";
+
+    rollback.disarm();
+    return {};
+}
+
+// 2.8.27. Misc, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Misc
+// Parses exactly one Misc item: a comment, a processing instruction, or a non-empty whitespace run.
+ErrorOr<void, ParseError> Parser::parse_misc()
+{
+    auto rollback = rollback_point();
+    auto rule = enter_rule();
+
+    // Misc ::= Comment | PI | S
+    // Alternatives are attempted in grammar order; the first one that succeeds wins.
+    bool matched = !parse_comment().is_error();
+    if (!matched)
+        matched = !parse_processing_instruction().is_error();
+    if (!matched)
+        matched = !skip_whitespace(Required::Yes).is_error();
+
+    if (!matched)
+        return parse_error(m_lexer.tell(), "Expected a match for 'Misc', but found none");
+
+    rollback.disarm();
+    return {};
+}
+
+// 2.5.15 Comment, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Comment
+// Parses a comment and, when m_options.preserve_comments is set, hands its text to append_comment().
+ErrorOr<void, ParseError> Parser::parse_comment()
+{
+ auto rollback = rollback_point();
+ auto rule = enter_rule();
+
+ // Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
+ TRY(expect("<!--"));
+ auto accept = accept_rule();
+
+ bool last_seen_a_dash = false;
+ // FIXME: This should disallow surrogate blocks
+ // Consume until the second of two consecutive dashes; a single dash is part of the text.
+ auto text = m_lexer.consume_while([&](auto ch) {
+ if (ch != '-') {
+ last_seen_a_dash = false;
+ return true;
+ }
+
+ if (last_seen_a_dash)
+ return false;
+
+ last_seen_a_dash = true;
+ return true;
+ });
+
+ // The first dash of the terminator was consumed as text above; give it back to the lexer and trim it off.
+ if (last_seen_a_dash) {
+ m_lexer.retreat();
+ text = text.substring_view(0, text.length() - 1);
+ }
+
+ TRY(expect("-->"));
+
+ if (m_options.preserve_comments)
+ append_comment(text);
+
+ rollback.disarm();
+ return {};
+}
+
+// 2.6.16 PI, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PI
+// Parses a processing instruction and stores its data in m_processing_instructions under the target name.
+ErrorOr<void, ParseError> Parser::parse_processing_instruction()
+{
+ auto rollback = rollback_point();
+ auto rule = enter_rule();
+
+ // PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
+ TRY(expect("<?"));
+ auto accept = accept_rule();
+
+ auto target = TRY(parse_processing_instruction_target());
+ String data;
+ // Data is only present when whitespace follows the target; otherwise it stays empty.
+ if (auto result = skip_whitespace(Required::Yes); !result.is_error())
+ data = m_lexer.consume_until("?>");
+ TRY(expect("?>"));
+
+ m_processing_instructions.set(target, data);
+ rollback.disarm();
+ return {};
+}
+
+// 2.6.17. PITarget, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PITarget
+// Parses the target name of a processing instruction. The reserved name "xml" (in any letter case) is
+// rejected only when m_options.treat_errors_as_fatal is set.
+ErrorOr<Name, ParseError> Parser::parse_processing_instruction_target()
+{
+ auto rollback = rollback_point();
+ auto rule = enter_rule();
+
+ // PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l'))
+ auto target = TRY(parse_name());
+ auto accept = accept_rule();
+
+ if (target.equals_ignoring_case("xml") && m_options.treat_errors_as_fatal) {
+ return parse_error(
+ m_lexer.tell() - target.length(),
+ "Use of the reserved 'xml' name for processing instruction target name is disallowed");
+ }
+
+ rollback.disarm();
+ return target;
+}
+
+// NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
+constexpr static auto s_name_start_characters = ranges_for_search<Range(':', ':'), Range('A', 'Z'), Range('_', '_'), Range('a', 'z'), Range(0xc0, 0xd6), Range(0xd8, 0xf6), Range(0xf8, 0x2ff), Range(0x370, 0x37d), Range(0x37f, 0x1fff), Range(0x200c, 0x200d), Range(0x2070, 0x218f), Range(0x2c00, 0x2fef), Range(0x3001, 0xd7ff), Range(0xf900, 0xfdcf), Range(0xfdf0, 0xfffd), Range(0x10000, 0xeffff)> {};
+
+// NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
+constexpr static auto s_name_characters = s_name_start_characters.with<Range('-', '-'), Range('.', '.'), Range('0', '9'), Range(0xb7, 0xb7), Range(0x0300, 0x036f), Range(0x203f, 0x2040)>();
+
+// 2.3.5. Name, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Name
+// Parses a Name: one NameStartChar followed by any number of NameChars, returned as an owned string.
+ErrorOr<Name, ParseError> Parser::parse_name()
+{
+ auto rollback = rollback_point();
+ auto rule = enter_rule();
+
+ // Name ::= NameStartChar (NameChar)*
+ auto start = TRY(expect(s_name_start_characters, "a NameStartChar"));
+ auto accept = accept_rule();
+
+ auto rest = m_lexer.consume_while(s_name_characters);
+ StringBuilder builder;
+ builder.append(start);
+ builder.append(rest);
+
+ rollback.disarm();
+ return builder.to_string();
+}
+
+// 2.8.28. doctypedecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-doctypedecl
+// Parses a doctype declaration into m_doctype. When an ExternalID is present and a resolver callback is
+// configured, the external subset is fetched and parsed; its declarations precede those of the internal subset.
+ErrorOr<void, ParseError> Parser::parse_doctype_decl()
+{
+ auto rollback = rollback_point();
+ auto rule = enter_rule();
+ Doctype doctype;
+
+ // doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>'
+ TRY(expect("<!DOCTYPE"));
+ auto accept = accept_rule();
+
+ TRY(skip_whitespace(Required::Yes));
+ doctype.type = TRY(parse_name());
+ if (auto result = skip_whitespace(Required::Yes); !result.is_error()) {
+ auto id_start = m_lexer.tell();
+ if (auto id_result = parse_external_id(); !id_result.is_error()) {
+ doctype.external_id = id_result.release_value();
+ if (m_options.resolve_external_resource) {
+ auto resource_result = m_options.resolve_external_resource(doctype.external_id->system_id, doctype.external_id->public_id);
+ if (resource_result.is_error()) {
+ return parse_error(
+ id_start,
+ String::formatted("Failed to resolve external subset '{}': {}", doctype.external_id->system_id.system_literal, resource_result.error()));
+ }
+ // Temporarily swap source and lexer to parse the resolved resource; both are restored when this scope ends.
+ StringView resolved_source = resource_result.value();
+ TemporaryChange source { m_source, resolved_source };
+ TemporaryChange lexer { m_lexer, GenericLexer(m_source) };
+ auto declarations = TRY(parse_external_subset());
+ if (!m_lexer.is_eof()) {
+ return parse_error(
+ m_lexer.tell(),
+ String::formatted("Failed to resolve external subset '{}': garbage after declarations", doctype.external_id->system_id.system_literal));
+ }
+ doctype.markup_declarations.extend(move(declarations));
+ }
+ }
+ }
+ TRY(skip_whitespace(Required::No));
+ if (m_lexer.consume_specific('[')) {
+ auto internal_subset = TRY(parse_internal_subset());
+ TRY(expect("]"));
+ TRY(skip_whitespace());
+ doctype.markup_declarations.extend(internal_subset);
+ }
+
+ TRY(expect(">"));
+
+ rollback.disarm();
+ // Only publish the doctype once the whole declaration parsed successfully.
+ m_doctype = move(doctype);
+ return {};
+}
+
+// 3.39. element, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-element
+// Parses one element: either an empty-element tag, or a start tag, content and a matching end tag.
+// The resulting node is attached to the tree via append_node().
+ErrorOr<void, ParseError> Parser::parse_element()
+{
+ auto rollback = rollback_point();
+ auto rule = enter_rule();
+
+ // element ::= EmptyElemTag
+ // | STag content ETag
+ if (auto result = parse_empty_element_tag(); !result.is_error()) {
+ append_node(result.release_value());
+ rollback.disarm();
+ return {};
+ }
+
+ auto start_tag = TRY(parse_start_tag());
+ // Grab references before ownership of the node moves into the tree.
+ auto& node = *start_tag;
+ auto& tag = node.content.get<Node::Element>();
+ append_node(move(start_tag));
+ enter_node(node);
+ // Leave the node on every exit path, including early error returns below.
+ ScopeGuard quit {
+ [&] {
+ leave_node();
+ }
+ };
+
+ TRY(parse_content());
+
+ auto tag_location = m_lexer.tell();
+ auto closing_name = TRY(parse_end_tag());
+
+ // Well-formedness constraint: The Name in an element's end-tag MUST match the element type in the start-tag.
+ if (m_options.treat_errors_as_fatal && closing_name != tag.name)
+ return parse_error(tag_location, "Invalid closing tag");
+
+ rollback.disarm();
+ return {};
+}
+
+// 3.1.44. EmptyElemTag, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-EmptyElemTag
+// Parses a self-closing tag and returns a childless Element node carrying its name and attributes.
+ErrorOr<NonnullOwnPtr<Node>, ParseError> Parser::parse_empty_element_tag()
+{
+ auto rollback = rollback_point();
+ auto rule = enter_rule();
+
+ // EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
+ TRY(expect("<"));
+ auto accept = accept_rule();
+
+ auto name = TRY(parse_name());
+ HashMap<Name, String> attributes;
+
+ // Each attribute must be preceded by whitespace; stop at the first thing that is not one.
+ while (true) {
+ if (auto result = skip_whitespace(Required::Yes); result.is_error())
+ break;
+
+ if (auto result = parse_attribute(); !result.is_error()) {
+ auto attribute = result.release_value();
+ // A repeated attribute name overwrites the earlier value.
+ attributes.set(move(attribute.name), move(attribute.value));
+ } else {
+ break;
+ }
+ }
+
+ TRY(skip_whitespace());
+ TRY(expect("/>"));
+
+ rollback.disarm();
+ return make<Node>(Node::Element { move(name), move(attributes), {} });
+}
+
+// 3.1.41. Attribute, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Attribute
+// Parses a single name="value" pair and returns it as an Attribute.
+ErrorOr<Attribute, ParseError> Parser::parse_attribute()
+{
+ auto rollback = rollback_point();
+ auto rule = enter_rule();
+
+ // Attribute ::= Name Eq AttValue
+ auto name = TRY(parse_name());
+ auto accept = accept_rule();
+
+ TRY(parse_eq());
+ auto value = TRY(parse_attribute_value());
+
+ rollback.disarm();
+ return Attribute {
+ move(name),
+ move(value),
+ };
+}
+
+// 2.3.10. AttValue, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-AttValue
+// Parses a quoted attribute value and returns its text with references expanded.
+ErrorOr<String, ParseError> Parser::parse_attribute_value()
+{
+ auto rollback = rollback_point();
+ auto rule = enter_rule();
+
+ // AttValue ::= '"' ([^<&"] | Reference)* '"'
+ // | "'" ([^<&'] | Reference)* "'"
+ auto quote = TRY(expect(is_any_of("'\""), "one of ' or \""));
+ auto accept = accept_rule();
+
+ // The closing quote must be the same character as the opening one.
+ auto text = TRY(parse_attribute_value_inner(quote));
+ TRY(expect(quote));
+
+ rollback.disarm();
+ return text;
+}
+
+// Accumulates attribute value text up to (not including) the first character in `disallow` or end of input.
+// Character references are appended directly, entity references are resolved for attribute-value placement,
+// and a raw '<' is rejected.
+ErrorOr<String, ParseError> Parser::parse_attribute_value_inner(StringView disallow)
+{
+    StringBuilder value;
+    while (!m_lexer.next_is(is_any_of(disallow)) && !m_lexer.is_eof()) {
+        if (m_lexer.next_is('<')) {
+            // Not allowed, return a nice error to make it easier to debug.
+            return parse_error(m_lexer.tell(), "Unescaped '<' not allowed in attribute values");
+        }
+
+        // Plain character: copy it through and move on.
+        if (!m_lexer.next_is('&')) {
+            value.append(m_lexer.consume());
+            continue;
+        }
+
+        auto reference = TRY(parse_reference());
+        if (auto* char_reference = reference.get_pointer<String>())
+            value.append(*char_reference);
+        else
+            value.append(TRY(resolve_reference(reference.get<EntityReference>(), ReferencePlacement::AttributeValue)));
+    }
+    return value.to_string();
+}
+
+// Char ::= [#x1-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
+constexpr static auto s_characters = ranges_for_search<Range(0x1, 0xd7ff), Range(0xe000, 0xfffd), Range(0x10000, 0x10ffff)>();
+
+// 4.1.67. Reference, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Reference
+// Parses either an entity reference or a character reference.
+// Returns the referenced entity's name as an EntityReference, or the referenced character encoded as a String.
+// A character reference whose digits do not convert to a u32 (overflow or empty run), or that falls
+// outside the Char production, yields "Invalid character reference".
+ErrorOr<Variant<Parser::EntityReference, String>, ParseError> Parser::parse_reference()
+{
+    auto rollback = rollback_point();
+    auto rule = enter_rule();
+    // Reference ::= EntityRef | CharRef
+
+    // 4.1.68. EntityRef
+    // EntityRef ::= '&' Name ';'
+
+    // 4.1.66. CharRef
+    // CharRef ::= '&#' [0-9]+ ';'
+    //           | '&#x' [0-9a-fA-F]+ ';'
+
+    auto reference_start = m_lexer.tell();
+    TRY(expect("&"));
+    auto accept = accept_rule();
+
+    auto name_result = parse_name();
+    if (name_result.is_error()) {
+        // Not a Name, so this has to be a character reference.
+        TRY(expect("#"));
+        // Both conversions return an empty Optional on failure, so keep the Optional and check it
+        // below instead of dereferencing it blindly.
+        Optional<u32> code_point;
+        if (m_lexer.consume_specific('x')) {
+            auto hex = TRY(expect_many(
+                ranges_for_search<Range('0', '9'), Range('a', 'f'), Range('A', 'F')>(),
+                "any of [0-9a-fA-F]"));
+            code_point = AK::StringUtils::convert_to_uint_from_hex<u32>(hex);
+        } else {
+            auto decimal = TRY(expect_many(
+                ranges_for_search<Range('0', '9')>(),
+                "any of [0-9]"));
+            code_point = decimal.to_uint<u32>();
+        }
+
+        if (!code_point.has_value() || !s_characters.contains(*code_point))
+            return parse_error(reference_start, "Invalid character reference");
+
+        TRY(expect(";"));
+
+        StringBuilder builder;
+        builder.append_code_point(*code_point);
+
+        rollback.disarm();
+        return builder.to_string();
+    }
+
+    auto name = name_result.release_value();
+    TRY(expect(";"));
+
+    rollback.disarm();
+    return EntityReference { move(name) };
+}
+
+ // 3.1.40 STag, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-STag
+ // Parses an element start tag and returns a new Element node built from the tag name and
+ // the attributes that were read.
+ErrorOr<NonnullOwnPtr<Node>, ParseError> Parser::parse_start_tag()
+{
+    auto rollback = rollback_point();
+    auto rule = enter_rule();
+
+    // STag ::= '<' Name (S Attribute)* S? '>'
+    TRY(expect("<"));
+    auto accept = accept_rule();
+
+    auto tag_name = TRY(parse_name());
+    HashMap<Name, String> attribute_map;
+
+    // (S Attribute)*: stop at the first missing separator or unparsable attribute.
+    for (;;) {
+        if (skip_whitespace(Required::Yes).is_error())
+            break;
+
+        auto attribute_result = parse_attribute();
+        if (attribute_result.is_error())
+            break;
+
+        auto parsed_attribute = attribute_result.release_value();
+        attribute_map.set(move(parsed_attribute.name), move(parsed_attribute.value));
+    }
+
+    TRY(skip_whitespace());
+    TRY(expect(">"));
+
+    rollback.disarm();
+    return make<Node>(Node::Element { move(tag_name), move(attribute_map), {} });
+}
+
+ // 3.1.42 ETag, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-ETag
+ // Parses an element end tag and returns the name it closes.
+ErrorOr<Name, ParseError> Parser::parse_end_tag()
+{
+    auto rollback = rollback_point();
+    auto rule = enter_rule();
+
+    // ETag ::= '</' Name S? '>'
+    TRY(expect("</"));
+    auto accept = accept_rule();
+
+    auto closed_name = TRY(parse_name());
+
+    // S? '>'
+    TRY(skip_whitespace());
+    TRY(expect(">"));
+
+    rollback.disarm();
+    return closed_name;
+}
+
+ // 3.1.42 content, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-content
+ // Parses element content: character data interleaved with child elements, references,
+ // CDATA sections, processing instructions and comments. Text is reported through
+ // append_text(); the function stops at the first position where none of the alternatives
+ // matches. Only a failing entity resolution is propagated as an error.
+ErrorOr<void, ParseError> Parser::parse_content()
+{
+    auto rollback = rollback_point();
+    auto rule = enter_rule();
+
+    // CharData? - appended when present, silently skipped otherwise.
+    auto append_char_data_if_present = [&] {
+        if (auto char_data = parse_char_data(); !char_data.is_error())
+            append_text(char_data.release_value());
+    };
+
+    // content ::= CharData? ((element | Reference | CDSect | PI | Comment) CharData?)*
+    append_char_data_if_present();
+
+    while (true) {
+        // The alternatives are tried in grammar order; the first one that parses wins.
+        bool matched_item = true;
+        if (!parse_element().is_error()) {
+            // The element was handled by parse_element().
+        } else if (auto reference_result = parse_reference(); !reference_result.is_error()) {
+            auto reference = reference_result.release_value();
+            if (auto char_reference = reference.get_pointer<String>())
+                append_text(*char_reference);
+            else
+                TRY(resolve_reference(reference.get<EntityReference>(), ReferencePlacement::Content));
+        } else if (auto cdata_result = parse_cdata_section(); !cdata_result.is_error()) {
+            if (m_options.preserve_cdata)
+                append_text(cdata_result.release_value());
+        } else if (!parse_processing_instruction().is_error()) {
+            // Processing instructions produce no text here.
+        } else if (!parse_comment().is_error()) {
+            // Comments produce no text here.
+        } else {
+            matched_item = false;
+        }
+
+        if (!matched_item)
+            break;
+
+        append_char_data_if_present();
+    }
+
+    rollback.disarm();
+    return {};
+}
+
+ // 2.4.14 CharData, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-CharData
+ // Consumes character data up to (not including) the next '<' or '&'. Character data must
+ // not contain the CDATA end marker "]]>": when the marker is met, the lexer is left
+ // positioned at its first ']' and only the text in front of it is returned.
+ // The returned view may be empty.
+ErrorOr<StringView, ParseError> Parser::parse_char_data()
+{
+    auto rollback = rollback_point();
+    auto rule = enter_rule();
+
+    // CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
+    // Number of characters of "]]>" matched at the end of the consumed text.
+    auto cend_state = 0; // 1: ], 2: ], 3: >
+    auto text = m_lexer.consume_while([&](auto ch) {
+        // Stop right after a complete "]]>" as well as at markup/reference starts.
+        if (cend_state == 3 || ch == '<' || ch == '&')
+            return false;
+        switch (cend_state) {
+        case 0:
+        case 1:
+            if (ch == ']')
+                cend_state++;
+            else
+                cend_state = 0;
+            return true;
+        case 2:
+            // The '>' is consumed too, so that exactly the three marker characters sit at
+            // the end of `text` and can be given back below.
+            if (ch == '>')
+                cend_state = 3;
+            // In "]]]>" the last two characters seen are still "]]", so a further ']'
+            // keeps the matcher in state 2 instead of resetting it.
+            else if (ch != ']')
+                cend_state = 0;
+            return true;
+        default:
+            VERIFY_NOT_REACHED();
+        }
+    });
+    if (cend_state == 3) {
+        // Give "]]>" back to the lexer and drop it from the returned text.
+        m_lexer.retreat(3);
+        text = text.substring_view(0, text.length() - 3);
+    }
+
+    rollback.disarm();
+    return text;
+}
+
+ // 2.8.28b intSubset, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-intSubset
+ // Parses the internal DTD subset and returns the markup declarations found in it.
+ // Replacement text delivered by a declaration separator is parsed in place as an
+ // extSubsetDecl and its declarations are added to the result.
+ErrorOr<Vector<MarkupDeclaration>, ParseError> Parser::parse_internal_subset()
+{
+    auto rollback = rollback_point();
+    auto rule = enter_rule();
+    Vector<MarkupDeclaration> declarations;
+
+    // intSubset ::= (markupdecl | DeclSep)*
+    while (true) {
+        if (auto declaration_result = parse_markup_declaration(); !declaration_result.is_error()) {
+            // PIs and comments parse successfully but yield no declaration.
+            if (auto declaration = declaration_result.release_value(); declaration.has_value())
+                declarations.append(declaration.release_value());
+            continue;
+        }
+
+        auto separator_result = parse_declaration_separator();
+        if (separator_result.is_error())
+            break;
+
+        // The markup declarations may be made up in whole or in part of the replacement text of parameter entities.
+        // The replacement text of a parameter entity reference in a DeclSep MUST match the production extSubsetDecl.
+        auto replacement_text = separator_result.release_value();
+        if (!replacement_text.has_value())
+            continue;
+
+        // Temporarily point the parser at the replacement text; both changes are undone
+        // when the guards go out of scope at the end of this iteration.
+        TemporaryChange<StringView> source { m_source, replacement_text.value() };
+        TemporaryChange lexer { m_lexer, GenericLexer { m_source } };
+
+        auto nested_declarations = TRY(parse_external_subset_declaration());
+        declarations.extend(move(nested_declarations));
+    }
+
+    rollback.disarm();
+    return declarations;
+}
+
+ // 2.8.29 markupdecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-markupdecl
+ // Tries each markupdecl alternative in grammar order. Returns the parsed declaration, an
+ // empty Optional when a processing instruction or comment was consumed, or a ParseError
+ // when none of the alternatives matches.
+ErrorOr<Optional<MarkupDeclaration>, ParseError> Parser::parse_markup_declaration()
+{
+    auto rollback = rollback_point();
+    auto rule = enter_rule();
+
+    // markupdecl ::= elementdecl | AttlistDecl | EntityDecl | NotationDecl | PI | Comment
+    Optional<MarkupDeclaration> declaration;
+    bool matched = true;
+    if (auto element = parse_element_declaration(); !element.is_error()) {
+        declaration = MarkupDeclaration { element.release_value() };
+    } else if (auto attribute_list = parse_attribute_list_declaration(); !attribute_list.is_error()) {
+        declaration = MarkupDeclaration { attribute_list.release_value() };
+    } else if (auto entity = parse_entity_declaration(); !entity.is_error()) {
+        declaration = MarkupDeclaration { entity.release_value() };
+    } else if (auto notation = parse_notation_declaration(); !notation.is_error()) {
+        declaration = MarkupDeclaration { notation.release_value() };
+    } else if (!parse_processing_instruction().is_error()) {
+        // Consumed, but there is nothing to record.
+    } else if (!parse_comment().is_error()) {
+        // Consumed, but there is nothing to record.
+    } else {
+        matched = false;
+    }
+
+    if (!matched)
+        return parse_error(m_lexer.tell(), "Expected one of elementdecl, attlistdecl, entitydecl, notationdecl, PI or comment");
+
+    rollback.disarm();
+    return declaration;
+}
+
+ // 2.8.28a DeclSep, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-DeclSep
+ // Parses one declaration separator. A parameter entity reference currently yields an empty
+ // replacement text (it is not resolved yet); whitespace yields an empty Optional.
+ErrorOr<Optional<String>, ParseError> Parser::parse_declaration_separator()
+{
+    auto rollback = rollback_point();
+    auto rule = enter_rule();
+
+    // DeclSep ::= PEReference | S
+    if (!parse_parameter_entity_reference().is_error()) {
+        rollback.disarm();
+        // FIXME: Resolve this PEReference.
+        return "";
+    }
+
+    if (!skip_whitespace(Required::Yes).is_error()) {
+        rollback.disarm();
+        return Optional<String> {};
+    }
+
+    return parse_error(m_lexer.tell(), "Expected either whitespace, or a PEReference");
+}
+
+ // 4.1.69 PEReference, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PEReference
+ // Parses "%name;" and returns the referenced parameter entity's name.
+ErrorOr<Name, ParseError> Parser::parse_parameter_entity_reference()
+{
+    auto rollback = rollback_point();
+    auto rule = enter_rule();
+
+    // PEReference ::= '%' Name ';'
+    TRY(expect("%"));
+    auto accept = accept_rule();
+
+    auto entity_name = TRY(parse_name());
+    TRY(expect(";"));
+
+    rollback.disarm();
+    return entity_name;
+}
+
+ // 3.2.46 elementdecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-elementdecl
+ // Parses an "<!ELEMENT name contentspec>" declaration and returns the element name
+ // together with its content specification.
+ErrorOr<ElementDeclaration, ParseError> Parser::parse_element_declaration()
+{
+    auto rollback = rollback_point();
+    auto rule = enter_rule();
+
+    // FIXME: Apparently both name _and_ contentspec here are allowed to be PEReferences,
+    //        but the grammar does not allow that, figure this out.
+    // elementdecl ::= '<!ELEMENT' S Name S contentspec S? '>'
+    TRY(expect("<!ELEMENT"));
+    auto accept = accept_rule();
+
+    TRY(skip_whitespace(Required::Yes));
+    auto name = TRY(parse_name());
+    TRY(skip_whitespace(Required::Yes));
+    auto spec = TRY(parse_content_spec());
+    // S? - the EMPTY, ANY and Mixed branches of parse_content_spec() leave trailing
+    // whitespace unconsumed, so "<!ELEMENT br EMPTY >" needs this skip to be accepted.
+    TRY(skip_whitespace());
+    TRY(expect(">"));
+
+    rollback.disarm();
+    return ElementDeclaration {
+        move(name),
+        move(spec),
+    };
+}
+
+ // 3.3.52 AttlistDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-AttlistDecl
+ // Parses an "<!ATTLIST element attdef*>" declaration and returns the element type name
+ // together with every attribute definition that could be parsed.
+ErrorOr<AttributeListDeclaration, ParseError> Parser::parse_attribute_list_declaration()
+{
+    auto rollback = rollback_point();
+    auto rule = enter_rule();
+    AttributeListDeclaration attribute_list;
+
+    // AttlistDecl ::= '<!ATTLIST' S Name AttDef* S? '>'
+    TRY(expect("<!ATTLIST"));
+    auto accept = accept_rule();
+
+    TRY(skip_whitespace(Required::Yes));
+    attribute_list.type = TRY(parse_name());
+
+    // AttDef*: collect definitions until one fails to parse.
+    while (true) {
+        auto definition = parse_attribute_definition();
+        if (definition.is_error())
+            break;
+        attribute_list.attributes.append(definition.release_value());
+    }
+
+    TRY(skip_whitespace());
+    TRY(expect(">"));
+
+    rollback.disarm();
+    return attribute_list;
+}
+
+ // 3.3.53 AttDef, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-AttDef -- parses one attribute definition (name, type and default) of an <!ATTLIST> declaration.
+ErrorOr<AttributeListDeclaration::Definition, ParseError> Parser::parse_attribute_definition()
+{
+ auto rollback = rollback_point();
+ auto rule = enter_rule();
+ Optional<AttributeListDeclaration::Type> type; // filled in by exactly one AttType branch below
+ Optional<AttributeListDeclaration::Default> default_; // filled in by exactly one DefaultDecl branch below
+
+ // AttDef ::= S Name S AttType S DefaultDecl
+ TRY(skip_whitespace(Required::Yes));
+ auto name = TRY(parse_name());
+ auto accept = accept_rule();
+
+ TRY(skip_whitespace(Required::Yes));
+
+ // AttType ::= StringType | TokenizedType | EnumeratedType
+ // StringType ::= 'CDATA'
+ // TokenizedType ::= 'ID'
+ // | 'IDREF'
+ // | 'IDREFS'
+ // | 'ENTITY'
+ // | 'ENTITIES'
+ // | 'NMTOKEN'
+ // | 'NMTOKENS'
+ // EnumeratedType ::= NotationType | Enumeration
+ // NotationType ::= 'NOTATION' S '(' S? Name (S? '|' S? Name)* S? ')'
+ // Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')'
+ if (m_lexer.consume_specific("CDATA")) {
+ type = AttributeListDeclaration::StringType::CData;
+ } else if (m_lexer.consume_specific("IDREFS")) { // order matters: IDREFS before IDREF before ID, because each shorter keyword is a prefix of the longer one
+ type = AttributeListDeclaration::TokenizedType::IDRefs;
+ } else if (m_lexer.consume_specific("IDREF")) {
+ type = AttributeListDeclaration::TokenizedType::IDRef;
+ } else if (m_lexer.consume_specific("ID")) {
+ type = AttributeListDeclaration::TokenizedType::ID;
+ } else if (m_lexer.consume_specific("ENTITIES")) {
+ type = AttributeListDeclaration::TokenizedType::Entities;
+ } else if (m_lexer.consume_specific("ENTITY")) {
+ type = AttributeListDeclaration::TokenizedType::Entity;
+ } else if (m_lexer.consume_specific("NMTOKENS")) { // must precede NMTOKEN, which is a prefix of it
+ type = AttributeListDeclaration::TokenizedType::NMTokens;
+ } else if (m_lexer.consume_specific("NMTOKEN")) {
+ type = AttributeListDeclaration::TokenizedType::NMToken;
+ } else if (m_lexer.consume_specific("NOTATION")) { // NotationType: a parenthesised, '|'-separated list of Names
+ HashTable<Name> names;
+ TRY(skip_whitespace(Required::Yes));
+ TRY(expect("("));
+ TRY(skip_whitespace());
+ names.set(TRY(parse_name()));
+ while (true) {
+ TRY(skip_whitespace());
+ if (auto result = expect("|"); result.is_error()) // no further '|': the list is complete
+ break;
+ TRY(skip_whitespace());
+ names.set(TRY(parse_name()));
+ }
+ TRY(skip_whitespace());
+ TRY(expect(")"));
+ type = AttributeListDeclaration::NotationType { move(names) };
+ } else { // no keyword matched, so this has to be an Enumeration: a parenthesised, '|'-separated list of Nmtokens
+ HashTable<String> names;
+ TRY(expect("("));
+ TRY(skip_whitespace());
+ names.set(TRY(parse_nm_token()));
+ while (true) {
+ TRY(skip_whitespace());
+ if (auto result = expect("|"); result.is_error()) // no further '|': the list is complete
+ break;
+ TRY(skip_whitespace());
+ names.set(TRY(parse_nm_token()));
+ }
+ TRY(skip_whitespace());
+ TRY(expect(")"));
+ type = AttributeListDeclaration::Enumeration { move(names) };
+ }
+
+ TRY(skip_whitespace(Required::Yes));
+
+ // DefaultDecl ::= '#REQUIRED' | '#IMPLIED'
+ // | (('#FIXED' S)? AttValue)
+ if (m_lexer.consume_specific("#REQUIRED")) {
+ default_ = AttributeListDeclaration::Required {};
+ } else if (m_lexer.consume_specific("#IMPLIED")) {
+ default_ = AttributeListDeclaration::Implied {};
+ } else { // an AttValue, optionally preceded by '#FIXED' S
+ bool fixed = false;
+ if (m_lexer.consume_specific("#FIXED")) {
+ TRY(skip_whitespace(Required::Yes));
+ fixed = true;
+ }
+ auto value = TRY(parse_attribute_value());
+ if (fixed)
+ default_ = AttributeListDeclaration::Fixed { move(value) };
+ else
+ default_ = AttributeListDeclaration::DefaultValue { move(value) };
+ }
+
+ rollback.disarm();
+ return AttributeListDeclaration::Definition {
+ move(name),
+ type.release_value(), // every path that reaches this point has assigned both Optionals
+ default_.release_value(),
+ };
+}
+
+ // 2.3.7 Nmtoken, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Nmtoken
+ // Parses a name token, i.e. a run of name characters, and returns a view of it.
+ErrorOr<StringView, ParseError> Parser::parse_nm_token()
+{
+    auto rollback = rollback_point();
+    auto rule = enter_rule();
+
+    // Nmtoken ::= (NameChar)+
+    auto name_token = TRY(expect_many(s_name_characters, "a NameChar"));
+
+    rollback.disarm();
+    return name_token;
+}
+
+ // 4.7.82 NotationDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#Notations
+ // Parses a "<!NOTATION name (ExternalID | PublicID)>" declaration and returns the notation
+ // name together with its external or public identifier.
+ErrorOr<NotationDeclaration, ParseError> Parser::parse_notation_declaration()
+{
+    auto rollback = rollback_point();
+    auto rule = enter_rule();
+    Variant<ExternalID, PublicID, Empty> notation;
+
+    // NotationDecl ::= '<!NOTATION' S Name S (ExternalID | PublicID) S? '>'
+    TRY(expect("<!NOTATION"));
+    auto accept = accept_rule();
+
+    TRY(skip_whitespace(Required::Yes));
+    auto name = TRY(parse_name());
+    TRY(skip_whitespace(Required::Yes));
+
+    // An ExternalID is tried first; a bare PublicID is the fallback.
+    if (auto result = parse_external_id(); !result.is_error())
+        notation = result.release_value();
+    else
+        notation = TRY(parse_public_id());
+
+    // S? - neither identifier parser consumes trailing whitespace, so without this skip
+    // a declaration such as '<!NOTATION n SYSTEM "x" >' would be rejected.
+    TRY(skip_whitespace());
+    TRY(expect(">"));
+
+    rollback.disarm();
+    return NotationDeclaration {
+        move(name),
+        move(notation).downcast<ExternalID, PublicID>(),
+    };
+}
+
+ // 3.2.46 contentspec, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-contentspec -- parses the content model of an <!ELEMENT> declaration (EMPTY, ANY, Mixed or children).
+ErrorOr<ElementDeclaration::ContentSpec, ParseError> Parser::parse_content_spec()
+{
+ auto rollback = rollback_point();
+ auto rule = enter_rule();
+ Optional<ElementDeclaration::ContentSpec> content_spec; // assigned by exactly one of the branches below
+
+ // contentspec ::= 'EMPTY' | 'ANY' | Mixed | children
+ if (m_lexer.consume_specific("EMPTY")) {
+ content_spec = ElementDeclaration::Empty {};
+ } else if (m_lexer.consume_specific("ANY")) {
+ content_spec = ElementDeclaration::Any {};
+ } else {
+ TRY(expect("(")); // both Mixed and children start with '('
+ TRY(skip_whitespace());
+ if (m_lexer.consume_specific("#PCDATA")) { // '#PCDATA' right after '(' selects the Mixed production
+ HashTable<Name> names;
+ // Mixed ::= '(' S? '#PCDATA' (S? '|' S? Name)* S? ')*'
+ // | '(' S? '#PCDATA' S? ')'
+ TRY(skip_whitespace());
+ if (m_lexer.consume_specific(")*")) { // ")*" is tested before ")" because ")" is a prefix of it
+ content_spec = ElementDeclaration::Mixed { .types = {}, .many = true };
+ } else if (m_lexer.consume_specific(')')) {
+ content_spec = ElementDeclaration::Mixed { .types = {}, .many = false };
+ } else {
+ while (true) {
+ TRY(skip_whitespace());
+ if (!m_lexer.consume_specific('|')) // no further '|': the name list is complete
+ break;
+ TRY(skip_whitespace());
+ if (auto result = parse_name(); !result.is_error())
+ names.set(result.release_value());
+ else
+ return parse_error(m_lexer.tell(), "Expected a Name");
+ }
+ TRY(skip_whitespace());
+ TRY(expect(")*")); // a Mixed list with names must end in ")*"
+ content_spec = ElementDeclaration::Mixed { .types = move(names), .many = true };
+ }
+ } else {
+ while (!m_lexer.next_is('(')) // not Mixed: step back to the '(' consumed above, so that parse_choice/parse_sequence can match it themselves
+ m_lexer.retreat();
+ // children ::= (choice | seq) ('?' | '*' | '+')?
+ // cp ::= (Name | choice | seq) ('?' | '*' | '+')?
+ // choice ::= '(' S? cp ( S? '|' S? cp )+ S? ')'
+ // seq ::= '(' S? cp ( S? ',' S? cp )* S? ')'
+ Function<ErrorOr<ElementDeclaration::Children::Choice, ParseError>()> parse_choice; // declared up front because choice, seq and cp refer to each other recursively
+ Function<ErrorOr<ElementDeclaration::Children::Sequence, ParseError>()> parse_sequence;
+
+ auto parse_cp_init = [&]() -> ErrorOr<Variant<Name, ElementDeclaration::Children::Choice, ElementDeclaration::Children::Sequence>, ParseError> { // the "Name | choice | seq" part of cp, without the trailing qualifier
+ if (auto result = parse_name(); !result.is_error())
+ return result.release_value();
+ if (auto result = parse_choice(); !result.is_error())
+ return result.release_value();
+ return TRY(parse_sequence());
+ };
+ auto parse_qualifier = [&]() -> ElementDeclaration::Children::Qualifier { // consumes an optional '?', '*' or '+'; yields ExactlyOnce when none is present
+ ElementDeclaration::Children::Qualifier qualifier { ElementDeclaration::Children::Qualifier::ExactlyOnce };
+ if (m_lexer.consume_specific('?'))
+ qualifier = ElementDeclaration::Children::Qualifier::Optional;
+ else if (m_lexer.consume_specific('*'))
+ qualifier = ElementDeclaration::Children::Qualifier::Any;
+ else if (m_lexer.consume_specific('+'))
+ qualifier = ElementDeclaration::Children::Qualifier::OneOrMore;
+ return qualifier;
+ };
+ auto parse_cp = [&]() -> ErrorOr<ElementDeclaration::Children::Entry, ParseError> { // one content particle: sub-entry followed by its qualifier
+ auto sub_entry = TRY(parse_cp_init());
+ auto qualifier = parse_qualifier();
+ return ElementDeclaration::Children::Entry {
+ move(sub_entry),
+ qualifier,
+ };
+ };
+ parse_choice = [&]() -> ErrorOr<ElementDeclaration::Children::Choice, ParseError> {
+ auto rollback = rollback_point();
+ auto rule = enter_rule();
+
+ TRY(expect("("));
+ auto accept = accept_rule();
+
+ TRY(skip_whitespace());
+ Vector<ElementDeclaration::Children::Entry> choices;
+ choices.append(TRY(parse_cp()));
+ while (true) {
+ TRY(skip_whitespace());
+ if (!m_lexer.consume_specific('|'))
+ break;
+ TRY(skip_whitespace());
+ choices.append(TRY(parse_cp()));
+ }
+
+ TRY(expect(")"));
+
+ if (choices.size() < 2) // the grammar requires at least one '|' alternative ("+")
+ return parse_error(m_lexer.tell(), "Expected more than one choice");
+
+ TRY(skip_whitespace()); // NOTE(review): whitespace is skipped between ')' and the qualifier, which the grammar above does not show -- confirm this is intended
+ auto qualifier = parse_qualifier();
+
+ rollback.disarm();
+ return ElementDeclaration::Children::Choice {
+ move(choices),
+ qualifier,
+ };
+ };
+ parse_sequence = [&]() -> ErrorOr<ElementDeclaration::Children::Sequence, ParseError> {
+ auto rollback = rollback_point();
+ auto rule = enter_rule();
+
+ TRY(expect("("));
+ auto accept = accept_rule();
+
+ TRY(skip_whitespace());
+ Vector<ElementDeclaration::Children::Entry> entries;
+ entries.append(TRY(parse_cp()));
+ while (true) {
+ TRY(skip_whitespace());
+ if (!m_lexer.consume_specific(','))
+ break;
+ TRY(skip_whitespace());
+ entries.append(TRY(parse_cp()));
+ }
+
+ TRY(expect(")"));
+
+ TRY(skip_whitespace()); // NOTE(review): same whitespace skip before the qualifier as in parse_choice -- confirm this is intended
+ auto qualifier = parse_qualifier();
+
+ rollback.disarm();
+ return ElementDeclaration::Children::Sequence {
+ move(entries),
+ qualifier,
+ };
+ };
+ if (auto result = parse_choice(); !result.is_error()) { // a choice is tried first; a sequence is the fallback
+ auto qualifier = parse_qualifier(); // NOTE(review): parse_choice already consumed a qualifier after ')', so this second call only sees what follows it -- confirm
+ content_spec = ElementDeclaration::Children {
+ result.release_value(),
+ qualifier,
+ };
+ } else {
+ auto sequence = TRY(parse_sequence());
+ auto qualifier = parse_qualifier(); // NOTE(review): parse_sequence already consumed a qualifier after ')', see the note on the choice branch
+ content_spec = ElementDeclaration::Children {
+ move(sequence),
+ qualifier,
+ };
+ }
+ }
+ }
+
+ rollback.disarm();
+ return content_spec.release_value();
+}
+
+ // 2.8.31 extSubsetDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-extSubsetDecl
+ // Parses a run of markup declarations and declaration separators and returns the
+ // declarations. Conditional sections are not supported yet, and whatever a declaration
+ // separator returns is discarded here.
+ErrorOr<Vector<MarkupDeclaration>, ParseError> Parser::parse_external_subset_declaration()
+{
+    auto rollback = rollback_point();
+    auto rule = enter_rule();
+    Vector<MarkupDeclaration> declarations;
+
+    // extSubsetDecl ::= ( markupdecl | conditionalSect | DeclSep )*
+    while (true) {
+        auto declaration_result = parse_markup_declaration();
+        if (!declaration_result.is_error()) {
+            // PIs and comments parse successfully but yield no declaration.
+            auto declaration = declaration_result.release_value();
+            if (declaration.has_value())
+                declarations.append(declaration.release_value());
+            continue;
+        }
+
+        // FIXME: conditionalSect
+
+        if (parse_declaration_separator().is_error())
+            break;
+    }
+
+    rollback.disarm();
+    return declarations;
+}
+
+ // 4.2.70 EntityDecl, https://www.w3.org/TR/xml/#NT-EntityDecl
+ // Parses an entity declaration: a general entity declaration is tried first, a parameter
+ // entity declaration is the fallback (and the source of the error when both fail).
+ErrorOr<EntityDeclaration, ParseError> Parser::parse_entity_declaration()
+{
+    // EntityDecl ::= GEDecl | PEDecl
+    auto general_entity = parse_general_entity_declaration();
+    if (general_entity.is_error())
+        return parse_parameter_entity_declaration();
+
+    return general_entity;
+}
+
+ // 4.2.71 GEDecl, https://www.w3.org/TR/xml/#NT-GEDecl
+ // Parses "<!ENTITY name definition>". The definition is either a literal entity value or
+ // an external identifier that may be followed by an NDATA notation name.
+ErrorOr<EntityDeclaration, ParseError> Parser::parse_general_entity_declaration()
+{
+    auto rollback = rollback_point();
+    auto rule = enter_rule();
+    Variant<String, EntityDefinition, Empty> definition;
+
+    // GEDecl ::= '<!ENTITY' S Name S EntityDef S? '>'
+    TRY(expect("<!ENTITY"));
+    auto accept = accept_rule();
+
+    TRY(skip_whitespace(Required::Yes));
+    auto entity_name = TRY(parse_name());
+    TRY(skip_whitespace(Required::Yes));
+
+    // EntityDef ::= EntityValue | (ExternalID NDataDecl?)
+    auto entity_value = parse_entity_value();
+    if (entity_value.is_error()) {
+        auto external_id = TRY(parse_external_id());
+
+        // NDataDecl? - stays empty when no notation follows.
+        Optional<Name> notation;
+        if (auto notation_result = parse_notation_data_declaration(); !notation_result.is_error())
+            notation = notation_result.release_value();
+
+        definition = EntityDefinition {
+            move(external_id),
+            move(notation),
+        };
+    } else {
+        definition = entity_value.release_value();
+    }
+
+    TRY(skip_whitespace());
+    TRY(expect(">"));
+
+    rollback.disarm();
+    return GEDeclaration {
+        move(entity_name),
+        move(definition).downcast<String, EntityDefinition>(),
+    };
+}
+
+ // 4.2.72 PEDecl, https://www.w3.org/TR/xml/#NT-PEDecl
+ // Parses "<!ENTITY % name definition>". The definition is either a literal entity value
+ // or an external identifier.
+ErrorOr<EntityDeclaration, ParseError> Parser::parse_parameter_entity_declaration()
+{
+    auto rollback = rollback_point();
+    auto rule = enter_rule();
+
+    Variant<String, ExternalID, Empty> definition;
+    // PEDecl ::= '<!ENTITY' S '%' S Name S PEDef S? '>'
+    TRY(expect("<!ENTITY"));
+    auto accept = accept_rule();
+
+    TRY(skip_whitespace(Required::Yes));
+    TRY(expect("%"));
+    TRY(skip_whitespace(Required::Yes));
+    auto entity_name = TRY(parse_name());
+    TRY(skip_whitespace(Required::Yes));
+
+    // PEDef ::= EntityValue | ExternalID
+    auto entity_value = parse_entity_value();
+    if (entity_value.is_error())
+        definition = TRY(parse_external_id());
+    else
+        definition = entity_value.release_value();
+
+    TRY(skip_whitespace());
+    TRY(expect(">"));
+
+    rollback.disarm();
+    return PEDeclaration {
+        move(entity_name),
+        move(definition).downcast<String, ExternalID>(),
+    };
+}
+
+ // 4.7.83 PublicID, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PublicID
+ // Parses the keyword PUBLIC followed by a quoted public identifier literal.
+ErrorOr<PublicID, ParseError> Parser::parse_public_id()
+{
+    auto rollback = rollback_point();
+    auto rule = enter_rule();
+
+    // PublicID ::= 'PUBLIC' S PubidLiteral
+    TRY(expect("PUBLIC"));
+    auto accept = accept_rule();
+
+    TRY(skip_whitespace(Required::Yes));
+    auto literal = TRY(parse_public_id_literal());
+
+    rollback.disarm();
+    return PublicID {
+        literal,
+    };
+}
+
+constexpr static auto s_public_id_characters = set_to_search<StringSet("\x20\x0d\x0a-'()+,./:=?;!*#@$_%")>().unify(ranges_for_search<Range('a', 'z'), Range('A', 'Z'), Range('0', '9')>()); // Characters accepted inside a public identifier literal: space, CR, LF, ASCII letters and digits, and the punctuation listed in the set.
+
+ // 2.3.12, PubidLiteral, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PubidLiteral
+ // Parses a quoted public identifier and returns a view of the text between the quotes.
+ // Inside a single-quoted literal the apostrophe is not accepted as content.
+ErrorOr<StringView, ParseError> Parser::parse_public_id_literal()
+{
+    auto rollback = rollback_point();
+    auto rule = enter_rule();
+
+    // PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
+    auto quote = TRY(expect(is_any_of("'\""), "any of ' or \""));
+    auto accept = accept_rule();
+
+    auto literal = TRY(expect_many(
+        [single_quoted = (quote[0] == '\'')](auto ch) {
+            // The apostrophe is a PubidChar, but it terminates a single-quoted literal.
+            if (single_quoted && ch == '\'')
+                return false;
+            return s_public_id_characters.contains(ch);
+        },
+        "a PubidChar"));
+    TRY(expect(quote));
+
+    rollback.disarm();
+    return literal;
+}
+
+ // 2.3.11 SystemLiteral, https://www.w3.org/TR/xml/#NT-SystemLiteral
+ // Parses a quoted system identifier and returns a view of the text between the quotes.
+ErrorOr<StringView, ParseError> Parser::parse_system_id_literal()
+{
+    auto rollback = rollback_point();
+    auto rule = enter_rule();
+
+    // SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
+    auto quote = TRY(expect(is_any_of("'\""), "any of ' or \""));
+    auto accept = accept_rule();
+
+    // Everything up to the matching closing quote belongs to the literal.
+    auto literal = TRY(expect_many(is_not_any_of(quote), "not a quote"));
+    TRY(expect(quote));
+
+    rollback.disarm();
+    return literal;
+}
+
+ // 4.2.75 ExternalID, https://www.w3.org/TR/xml/#NT-ExternalID
+ // Parses an external identifier. The SYSTEM form carries only a system literal; the
+ // PUBLIC form carries a public literal followed by a system literal.
+ErrorOr<ExternalID, ParseError> Parser::parse_external_id()
+{
+    auto rollback = rollback_point();
+    auto rule = enter_rule();
+
+    // ExternalID ::= 'SYSTEM' S SystemLiteral
+    //              | 'PUBLIC' S PubidLiteral S SystemLiteral
+    Optional<PublicID> public_id;
+    SystemID system_id;
+
+    if (!m_lexer.consume_specific("SYSTEM")) {
+        // Not the SYSTEM form, so the PUBLIC form is required.
+        TRY(expect("PUBLIC"));
+        auto accept = accept_rule();
+
+        TRY(skip_whitespace(Required::Yes));
+        public_id = PublicID { TRY(parse_public_id_literal()) };
+        TRY(skip_whitespace(Required::Yes));
+        system_id = SystemID { TRY(parse_system_id_literal()) };
+    } else {
+        auto accept = accept_rule();
+        TRY(skip_whitespace(Required::Yes));
+        system_id = SystemID { TRY(parse_system_id_literal()) };
+    }
+
+    rollback.disarm();
+    return ExternalID {
+        move(public_id),
+        move(system_id),
+    };
+}
+
+ // 4.2.2.76 NDataDecl, https://www.w3.org/TR/xml/#NT-NDataDecl
+ // Parses the " NDATA name" suffix of an external entity definition and returns the
+ // notation name.
+ErrorOr<Name, ParseError> Parser::parse_notation_data_declaration()
+{
+    auto rollback = rollback_point();
+    auto rule = enter_rule();
+
+    // NDataDecl ::= S 'NDATA' S Name
+    TRY(skip_whitespace(Required::Yes));
+    auto accept = accept_rule();
+
+    TRY(expect("NDATA"));
+    TRY(skip_whitespace(Required::Yes));
+    auto notation_name = TRY(parse_name());
+
+    rollback.disarm();
+    return notation_name;
+}
+
+ // 2.3.9 EntityValue, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-EntityValue
+ // Parses a quoted entity value and returns the text between the quotes. Parameter entity
+ // references and references are checked for well-formedness but copied verbatim (they are
+ // not expanded here).
+ErrorOr<String, ParseError> Parser::parse_entity_value()
+{
+    auto rollback = rollback_point();
+    auto rule = enter_rule();
+    StringBuilder value_builder;
+
+    // EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"'
+    //               | "'" ([^%&'] | PEReference | Reference)* "'"
+    auto quote = TRY(expect(is_any_of("'\""), "any of ' or \""));
+    auto accept = accept_rule();
+
+    while (!m_lexer.is_eof() && !m_lexer.next_is(quote)) {
+        auto const reference_start = m_lexer.tell();
+        if (m_lexer.next_is('%')) {
+            TRY(parse_parameter_entity_reference());
+        } else if (m_lexer.next_is('&')) {
+            TRY(parse_reference());
+        } else {
+            // An ordinary character is taken over as is.
+            value_builder.append(m_lexer.consume());
+            continue;
+        }
+
+        // Keep the source text of the reference that was just validated.
+        value_builder.append(m_source.substring_view(reference_start, m_lexer.tell() - reference_start));
+    }
+    TRY(expect(quote));
+
+    rollback.disarm();
+    return value_builder.to_string();
+}
+
+ // 2.7.18 CDSect, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-CDSect
+ // Parses a CDATA section and returns a view of the text between "<![CDATA[" and "]]>".
+ // Fails when the end of the input is reached before the end marker.
+ErrorOr<StringView, ParseError> Parser::parse_cdata_section()
+{
+    auto rollback = rollback_point();
+    auto rule = enter_rule();
+
+    // CDSect ::= CDStart CData CDEnd
+    // CDStart ::= '<![CDATA['
+    // CData ::= (Char* - (Char* ']]>' Char*))
+    // CDEnd ::= ']]>'
+    TRY(expect("<![CDATA["));
+    auto accept = accept_rule();
+
+    // Skip ahead to the end marker, or to the end of the input when there is none.
+    auto const content_start = m_lexer.tell();
+    while (!m_lexer.next_is("]]>") && !m_lexer.is_eof())
+        m_lexer.ignore();
+    auto const content_end = m_lexer.tell();
+
+    TRY(expect("]]>"));
+
+    rollback.disarm();
+    return m_source.substring_view(content_start, content_end - content_start);
+}
+
+ // 2.8.30 extSubset, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-extSubset
+ // Parses an external DTD subset: an optional text declaration followed by the subset's
+ // declarations, which are returned.
+ErrorOr<Vector<MarkupDeclaration>, ParseError> Parser::parse_external_subset()
+{
+    auto rollback = rollback_point();
+    auto rule = enter_rule();
+
+    // extSubset ::= TextDecl? extSubsetDecl
+    // The text declaration is optional, so a failure to parse one is deliberately ignored.
+    (void)parse_text_declaration();
+    auto declarations = TRY(parse_external_subset_declaration());
+
+    rollback.disarm();
+    return declarations;
+}
+
+ // 4.3.1.77 TextDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-TextDecl
+ // Parses the text declaration of an external entity. The version info is optional, the
+ // encoding declaration is mandatory. Nothing is returned besides success or failure.
+ErrorOr<void, ParseError> Parser::parse_text_declaration()
+{
+    auto rollback = rollback_point();
+    auto rule = enter_rule();
+
+    // TextDecl ::= '<?xml' VersionInfo? EncodingDecl S? '?>'
+    TRY(expect("<?xml"));
+    auto accept = accept_rule();
+
+    // VersionInfo? - a failure to parse one is deliberately ignored.
+    (void)parse_version_info();
+
+    TRY(parse_encoding_decl());
+
+    // S? '?>'
+    TRY(skip_whitespace());
+    TRY(expect("?>"));
+
+    rollback.disarm();
+    return {};
+}
+
+ // Resolves a general entity reference and parses its replacement text in the context
+ // given by `placement`:
+ // - AttributeValue: the replacement text is parsed as attribute value content and the
+ //   result is returned; references to external entities are rejected.
+ // - Content: the replacement text is parsed as element content and an empty string is
+ //   returned.
+ // General entity declarations of the doctype are searched first. When none matches, the
+ // predefined entities amp, lt, gt, apos and quot are returned directly; any other name is
+ // an error. A reference to an entity whose resolution is still in progress is reported as
+ // a recursive definition.
+ErrorOr<String, ParseError> Parser::resolve_reference(EntityReference const& reference, ReferencePlacement placement)
+{
+    // Names of the entities whose resolution is currently in progress.
+    static HashTable<Name> entities_in_progress {};
+    if (entities_in_progress.contains(reference.name))
+        return parse_error(m_lexer.tell(), String::formatted("Invalid recursive definition for '{}'", reference.name));
+
+    entities_in_progress.set(reference.name);
+    ScopeGuard forget_entity {
+        [&] {
+            entities_in_progress.remove(reference.name);
+        }
+    };
+
+    Optional<String> replacement_text;
+    if (m_doctype.has_value()) {
+        // FIXME: Split these up and resolve them ahead of time.
+        for (auto& declaration : m_doctype->markup_declarations) {
+            // Only general entity declarations with a matching name are of interest.
+            auto entity = declaration.get_pointer<EntityDeclaration>();
+            auto general_entity = entity ? entity->get_pointer<GEDeclaration>() : nullptr;
+            if (!general_entity || general_entity->name != reference.name)
+                continue;
+
+            TRY(general_entity->definition.visit(
+                [&](String const& internal_value) -> ErrorOr<void, ParseError> {
+                    replacement_text = internal_value;
+                    return {};
+                },
+                [&](EntityDefinition const& external) -> ErrorOr<void, ParseError> {
+                    if (placement == ReferencePlacement::AttributeValue)
+                        return parse_error(m_lexer.tell(), String::formatted("Attribute references external entity '{}'", reference.name));
+
+                    if (external.notation.has_value())
+                        return parse_error(0u, String::formatted("Entity reference to unparsed entity '{}'", reference.name));
+
+                    if (!m_options.resolve_external_resource)
+                        return parse_error(0u, String::formatted("Failed to resolve external entity '{}'", reference.name));
+
+                    auto resource = m_options.resolve_external_resource(external.id.system_id, external.id.public_id);
+                    if (resource.is_error())
+                        return parse_error(0u, String::formatted("Failed to resolve external entity '{}': {}", reference.name, resource.error()));
+
+                    replacement_text = resource.release_value();
+                    return {};
+                }));
+
+            // The first matching declaration wins.
+            break;
+        }
+    }
+
+    if (!replacement_text.has_value()) {
+        // No declaration was found: fall back to the predefined entities.
+        if (reference.name == "amp")
+            return "&";
+        if (reference.name == "lt")
+            return "<";
+        if (reference.name == "gt")
+            return ">";
+        if (reference.name == "apos")
+            return "'";
+        if (reference.name == "quot")
+            return "\"";
+        return parse_error(0u, String::formatted("Reference to undeclared entity '{}'", reference.name));
+    }
+
+    // Temporarily point the parser at the replacement text; both changes are undone when
+    // the guards go out of scope.
+    StringView resolved_source = *replacement_text;
+    TemporaryChange source { m_source, resolved_source };
+    TemporaryChange lexer { m_lexer, GenericLexer(m_source) };
+
+    if (placement == ReferencePlacement::AttributeValue)
+        return TRY(parse_attribute_value_inner(""));
+
+    if (placement == ReferencePlacement::Content) {
+        TRY(parse_content());
+        return "";
+    }
+
+    VERIFY_NOT_REACHED();
+}
+
+}
diff --git a/Userland/Libraries/LibXML/Parser/Parser.h b/Userland/Libraries/LibXML/Parser/Parser.h
new file mode 100644
index 0000000000..f2579034fa
--- /dev/null
+++ b/Userland/Libraries/LibXML/Parser/Parser.h
@@ -0,0 +1,223 @@
+/*
+ * Copyright (c) 2022, Ali Mohammad Pur <mpfard@serenityos.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#pragma once
+
+#include <AK/Debug.h>
+#include <AK/Function.h>
+#include <AK/GenericLexer.h>
+#include <AK/HashMap.h>
+#include <AK/OwnPtr.h>
+#include <AK/SourceLocation.h>
+#include <AK/String.h>
+#include <AK/TemporaryChange.h>
+#include <LibXML/DOM/Document.h>
+#include <LibXML/DOM/DocumentTypeDeclaration.h>
+#include <LibXML/DOM/Node.h>
+#include <LibXML/Forward.h>
+
+namespace XML {
+
+struct ParseError { // Error value carried by every ErrorOr<..., ParseError> in this header: a position plus a message.
+ size_t offset; // Position the error refers to; printed as "at offset N" by the Formatter specialization in this file.
+ String error; // Human-readable description of what went wrong.
+};
+
+struct Listener { // Callback interface accepted by Parser::parse_with_listener(); every hook has an empty default body.
+ virtual ~Listener() { }
+
+ virtual void document_start() { } // NOTE(review): the call sites of these hooks are in Parser.cpp, not visible in this header.
+ virtual void document_end() { }
+ virtual void element_start(Name const&, HashMap<Name, String> const&) { } // Presumably the element name and its attribute map -- confirm against the caller.
+ virtual void element_end(Name const&) { } // Presumably the name of the element being closed -- confirm against the caller.
+ virtual void text(String const&) { }
+ virtual void comment(String const&) { }
+ virtual void error(ParseError const&) { } // Receives a ParseError; when it is invoked is not visible in this header.
+};
+
+class Parser { // XML parser over a StringView: parse() yields a Document, parse_with_listener() reports to a Listener.
+public:
+ struct Options { // Caller-supplied configuration; how the three bool flags are consumed is not visible in this header.
+ bool preserve_cdata { true };
+ bool preserve_comments { false };
+ bool treat_errors_as_fatal { true };
+ Function<ErrorOr<String>(SystemID const&, Optional<PublicID> const&)> resolve_external_resource {}; // resolve_reference() calls this with an external entity's system and public id; if unset, such a reference is a parse error.
+ };
+
+ Parser(StringView source, Options options) // Both m_source and m_lexer start out over `source`; `options` is moved into m_options.
+ : m_source(source)
+ , m_lexer(source)
+ , m_options(move(options))
+ {
+ }
+
+ explicit Parser(StringView source) // Same as above, with default-constructed Options.
+ : m_source(source)
+ , m_lexer(source)
+ {
+ }
+
+ ErrorOr<Document, ParseError> parse(); // Yields a Document on success or a ParseError on failure.
+ ErrorOr<void, ParseError> parse_with_listener(Listener&);
+
+ Vector<ParseError> const& parse_error_causes() const { return m_parse_errors; } // Errors recorded by parse_error() while the active rule had `accept` set.
+
+private:
+ struct EntityReference { // A reference to an entity by name, before resolve_reference() turns it into text.
+ Name name;
+ };
+
+ ErrorOr<void, ParseError> parse_internal();
+ void append_node(NonnullOwnPtr<Node>);
+ void append_text(String);
+ void append_comment(String);
+ void enter_node(Node&);
+ void leave_node();
+
+ enum class ReferencePlacement { // Tells resolve_reference() where the reference occurred.
+ AttributeValue, // External entities are rejected; replacement text is re-parsed with parse_attribute_value_inner("").
+ Content, // Replacement text is re-parsed with parse_content() and an empty string is returned.
+ };
+ ErrorOr<String, ParseError> resolve_reference(EntityReference const&, ReferencePlacement);
+ ErrorOr<String, ParseError> resolve_parameter_entity_reference(EntityReference const&);
+
+ enum class Required { // Argument type of skip_whitespace(), which defaults it to No.
+ No,
+ Yes,
+ };
+ ErrorOr<void, ParseError> skip_whitespace(Required = Required::No);
+
+ ErrorOr<void, ParseError> parse_prolog(); // The parse_* members from here on are only declared in this header; names presumably mirror XML grammar productions -- confirm in Parser.cpp.
+ ErrorOr<void, ParseError> parse_element();
+ ErrorOr<void, ParseError> parse_misc();
+ ErrorOr<void, ParseError> parse_xml_decl();
+ ErrorOr<void, ParseError> parse_doctype_decl();
+ ErrorOr<void, ParseError> parse_version_info();
+ ErrorOr<void, ParseError> parse_encoding_decl();
+ ErrorOr<void, ParseError> parse_standalone_document_decl();
+ ErrorOr<void, ParseError> parse_eq();
+ ErrorOr<void, ParseError> parse_comment();
+ ErrorOr<void, ParseError> parse_processing_instruction();
+ ErrorOr<Name, ParseError> parse_processing_instruction_target();
+ ErrorOr<Name, ParseError> parse_name();
+ ErrorOr<NonnullOwnPtr<Node>, ParseError> parse_empty_element_tag();
+ ErrorOr<NonnullOwnPtr<Node>, ParseError> parse_start_tag();
+ ErrorOr<Name, ParseError> parse_end_tag();
+ ErrorOr<void, ParseError> parse_content();
+ ErrorOr<Attribute, ParseError> parse_attribute();
+ ErrorOr<String, ParseError> parse_attribute_value();
+ ErrorOr<Variant<EntityReference, String>, ParseError> parse_reference();
+ ErrorOr<StringView, ParseError> parse_char_data();
+ ErrorOr<Vector<MarkupDeclaration>, ParseError> parse_internal_subset();
+ ErrorOr<Optional<MarkupDeclaration>, ParseError> parse_markup_declaration();
+ ErrorOr<Optional<String>, ParseError> parse_declaration_separator();
+ ErrorOr<Vector<MarkupDeclaration>, ParseError> parse_external_subset_declaration();
+ ErrorOr<ElementDeclaration, ParseError> parse_element_declaration();
+ ErrorOr<AttributeListDeclaration, ParseError> parse_attribute_list_declaration();
+ ErrorOr<EntityDeclaration, ParseError> parse_entity_declaration();
+ ErrorOr<NotationDeclaration, ParseError> parse_notation_declaration();
+ ErrorOr<Name, ParseError> parse_parameter_entity_reference();
+ ErrorOr<ElementDeclaration::ContentSpec, ParseError> parse_content_spec();
+ ErrorOr<AttributeListDeclaration::Definition, ParseError> parse_attribute_definition();
+ ErrorOr<StringView, ParseError> parse_nm_token();
+ ErrorOr<EntityDeclaration, ParseError> parse_general_entity_declaration();
+ ErrorOr<EntityDeclaration, ParseError> parse_parameter_entity_declaration();
+ ErrorOr<PublicID, ParseError> parse_public_id();
+ ErrorOr<SystemID, ParseError> parse_system_id();
+ ErrorOr<ExternalID, ParseError> parse_external_id();
+ ErrorOr<String, ParseError> parse_entity_value();
+ ErrorOr<Name, ParseError> parse_notation_data_declaration();
+ ErrorOr<StringView, ParseError> parse_public_id_literal();
+ ErrorOr<StringView, ParseError> parse_system_id_literal();
+ ErrorOr<StringView, ParseError> parse_cdata_section();
+ ErrorOr<String, ParseError> parse_attribute_value_inner(StringView disallow);
+ ErrorOr<Vector<MarkupDeclaration>, ParseError> parse_external_subset();
+ ErrorOr<void, ParseError> parse_text_declaration();
+
+ ErrorOr<void, ParseError> expect(StringView);
+ template<typename Pred>
+ requires(IsCallableWithArguments<Pred, char>) ErrorOr<StringView, ParseError> expect(Pred, StringView description);
+ template<typename Pred>
+ requires(IsCallableWithArguments<Pred, char>) ErrorOr<StringView, ParseError> expect_many(Pred, StringView description);
+
+ static size_t s_debug_indent_level; // Nesting depth used to indent XML_PARSER_DEBUG log lines; adjusted by enter_rule() and its guard.
+ [[nodiscard]] auto rollback_point(SourceLocation location = SourceLocation::current()) // Returns a guard whose callback rewinds m_lexer to the position captured here and logs a FAIL line (at most 16 chars of remaining input) under XML_PARSER_DEBUG.
+ {
+ return ArmedScopeGuard { // NOTE(review): presumably callers disarm this guard once the rule succeeds -- confirm in Parser.cpp.
+ [this, position = m_lexer.tell(), location] {
+ m_lexer.retreat(m_lexer.tell() - position); // Step back over everything consumed since the guard was created.
+ (void)location; // Presumably keeps the capture "used" when the debug log below compiles away.
+ dbgln_if(XML_PARSER_DEBUG, "{:->{}}FAIL @ {} -- \x1b[31m{}\x1b[0m", " ", s_debug_indent_level * 2, location, m_lexer.remaining().substring_view(0, min(16, m_lexer.tell_remaining())).replace("\n", "\\n", true));
+ }
+ };
+ }
+
+ [[nodiscard]] auto accept_rule() // Sets m_current_rule.accept to true through a TemporaryChange (presumably restored when the returned object dies).
+ {
+ return TemporaryChange { m_current_rule.accept, true };
+ }
+ [[nodiscard]] auto enter_rule(SourceLocation location = SourceLocation::current()) // Marks entry into a rule named after the calling function; the returned guard restores the previous rule.
+ {
+ dbgln_if(XML_PARSER_DEBUG, "{:->{}}Enter {}", " ", s_debug_indent_level * 2, location);
+ ++s_debug_indent_level;
+ auto rule = m_current_rule; // Saved so the guard below can put it back.
+ m_current_rule = { location.function_name(), false }; // A freshly entered rule starts with accept == false.
+ return ScopeGuard {
+ [location, rule, this] {
+ m_current_rule = rule;
+ --s_debug_indent_level;
+ (void)location; // Presumably keeps the capture "used" when the debug log below compiles away.
+ dbgln_if(XML_PARSER_DEBUG, "{:->{}}Leave {}", " ", s_debug_indent_level * 2, location);
+ }
+ };
+ }
+
+ template<typename... Ts>
+ ParseError parse_error(Ts&&... args) // Builds a ParseError from `args` and returns it; additionally records a rule-prefixed copy when the current rule is accepted.
+ {
+ auto error = ParseError { forward<Ts>(args)... };
+ if (m_current_rule.accept) {
+ auto rule_name = m_current_rule.rule.value_or("<?>"); // "<?>" stands in when no rule name has been set.
+ if (rule_name.starts_with("parse_"))
+ rule_name = rule_name.substring_view(6); // 6 == length of "parse_", so only the rule part of the function name remains.
+ m_parse_errors.append({
+ error.offset,
+ String::formatted("{}: {}", rule_name, error.error),
+ });
+ }
+ return error; // The returned error keeps the original, unprefixed message.
+ }
+
+ StringView m_source; // Text currently being parsed; resolve_reference() temporarily swaps it (and m_lexer) to an entity's replacement text.
+ GenericLexer m_lexer;
+ Options m_options;
+ Listener* m_listener { nullptr }; // Presumably set by parse_with_listener() -- confirm in Parser.cpp.
+
+ OwnPtr<Node> m_root_node;
+ Node* m_entered_node { nullptr };
+ Version m_version { Version::Version11 };
+ bool m_in_compatibility_mode { false };
+ String m_encoding;
+ bool m_standalone { false };
+ HashMap<Name, String> m_processing_instructions;
+ struct AcceptedRule { // Bookkeeping written by enter_rule()/accept_rule() and read by parse_error().
+ Optional<String> rule {};
+ bool accept { false };
+ } m_current_rule {};
+
+ Vector<ParseError> m_parse_errors; // Filled by parse_error(); exposed through parse_error_causes().
+
+ Optional<Doctype> m_doctype;
+};
+}
+
+template<>
+struct AK::Formatter<XML::ParseError> : public AK::Formatter<FormatString> { // Formatter specialization for XML::ParseError, delegating to the FormatString formatter.
+ ErrorOr<void> format(FormatBuilder& builder, XML::ParseError const& error)
+ {
+ return Formatter<FormatString>::format(builder, "{} at offset {}", error.error, error.offset); // Renders as "<message> at offset <offset>".
+ }
+};