/* * Copyright (c) 2022, Ali Mohammad Pur * * SPDX-License-Identifier: BSD-2-Clause */ #include #include struct Range { consteval Range(u32 start, u32 end) : start(start) , end(end) { } u32 start; u32 end; }; template struct ranges_for_search { auto contains(u32 value) const { return ((value >= ranges.start && value <= ranges.end) || ...); } bool operator()(u32 value) const { return contains(value); } template consteval auto with() const { return ranges_for_search(); } template consteval auto unify(ranges_for_search const&) const { return ranges_for_search(); } }; template struct StringSet { consteval StringSet(Element const (&entries)[Count]) { for (size_t i = 0; i < Count - 1; ++i) elements[i] = entries[i]; } consteval auto operator[](size_t i) const { return elements[i]; } Element elements[Count - 1]; }; template consteval static auto set_to_search() { return ([&](IndexSequence) { return ranges_for_search(); }(MakeIndexSequence())); } namespace XML { size_t Parser::s_debug_indent_level { 0 }; void Parser::append_node(NonnullOwnPtr node) { if (m_entered_node) { m_entered_node->content.get().children.append(move(node)); } else { m_root_node = move(node); m_entered_node = m_root_node.ptr(); } } void Parser::append_text(StringView text) { if (m_listener) { m_listener->text(text); return; } if (!m_entered_node) { Node::Text node; node.builder.append(text); m_root_node = make(move(node)); return; } m_entered_node->content.visit( [&](Node::Element& node) { if (!node.children.is_empty()) { auto* text_node = node.children.last().content.get_pointer(); if (text_node) { text_node->builder.append(text); return; } } Node::Text text_node; text_node.builder.append(text); node.children.append(make(move(text_node))); }, [&](auto&) { // Can't enter a text or comment node. VERIFY_NOT_REACHED(); }); } void Parser::append_comment(StringView text) { if (m_listener) { m_listener->comment(text); return; } // If there's no node to attach this to, drop it on the floor. 
// This can happen to comments in the prolog. if (!m_entered_node) return; m_entered_node->content.visit( [&](Node::Element& node) { node.children.append(make(Node::Comment { text })); }, [&](auto&) { // Can't enter a text or comment node. VERIFY_NOT_REACHED(); }); } void Parser::enter_node(Node& node) { if (m_listener) { auto& element = node.content.get(); m_listener->element_start(element.name, element.attributes); } if (&node != m_root_node.ptr()) node.parent = m_entered_node; m_entered_node = &node; } void Parser::leave_node() { if (m_listener) { auto& element = m_entered_node->content.get(); m_listener->element_end(element.name); } m_entered_node = m_entered_node->parent; } ErrorOr Parser::parse() { if (auto result = parse_internal(); result.is_error()) { if (m_parse_errors.is_empty()) return result.release_error(); return m_parse_errors.take_first(); } return Document { m_root_node.release_nonnull(), move(m_doctype), move(m_processing_instructions), m_version, }; } ErrorOr Parser::parse_with_listener(Listener& listener) { m_listener = &listener; ScopeGuard unset_listener { [this] { m_listener = nullptr; } }; m_listener->set_source(m_source); m_listener->document_start(); auto result = parse_internal(); if (result.is_error()) m_listener->error(result.error()); m_listener->document_end(); m_root_node.clear(); return result; } // 2.3.3. S, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-S ErrorOr Parser::skip_whitespace(Required required) { auto rollback = rollback_point(); auto rule = enter_rule(); // S ::= (#x20 | #x9 | #xD | #xA)+ auto matched = m_lexer.consume_while(is_any_of("\x20\x09\x0d\x0a"sv)); if (required == Required::Yes && matched.is_empty()) return parse_error(m_lexer.tell(), "Expected whitespace"); rollback.disarm(); return {}; } // 2.2.a. RestrictedChar, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-RestrictedChar constexpr static auto s_restricted_characters = ranges_for_search(); // 2.1.1. 
Document, https://www.w3.org/TR/2006/REC-xml11-20060816/#sec-well-formed ErrorOr Parser::parse_internal() { auto rule = enter_rule(); // document ::= ( prolog element Misc* ) - ( Char* RestrictedChar Char* ) TRY(parse_prolog()); TRY(parse_element()); while (true) { if (auto result = parse_misc(); result.is_error()) break; } auto matched_source = m_source.substring_view(0, m_lexer.tell()); if (auto it = find_if(matched_source.begin(), matched_source.end(), s_restricted_characters); !it.is_end()) { return parse_error( it.index(), DeprecatedString::formatted("Invalid character #{:x} used in document", *it)); } if (!m_lexer.is_eof()) return parse_error(m_lexer.tell(), "Garbage after document"); return {}; } ErrorOr Parser::expect(StringView expected) { auto rollback = rollback_point(); if (!m_lexer.consume_specific(expected)) { if (m_options.treat_errors_as_fatal) return parse_error(m_lexer.tell(), DeprecatedString::formatted("Expected '{}'", expected)); } rollback.disarm(); return {}; } template requires(IsCallableWithArguments) ErrorOr Parser::expect(Pred predicate, StringView description) { auto rollback = rollback_point(); auto start = m_lexer.tell(); if (!m_lexer.next_is(predicate)) { if (m_options.treat_errors_as_fatal) return parse_error(m_lexer.tell(), DeprecatedString::formatted("Expected {}", description)); } m_lexer.ignore(); rollback.disarm(); return m_source.substring_view(start, m_lexer.tell() - start); } template requires(IsCallableWithArguments) ErrorOr Parser::expect_many(Pred predicate, StringView description) { auto rollback = rollback_point(); auto start = m_lexer.tell(); while (m_lexer.next_is(predicate)) { if (m_lexer.is_eof()) break; m_lexer.ignore(); } if (m_lexer.tell() == start) { if (m_options.treat_errors_as_fatal) { return parse_error(m_lexer.tell(), DeprecatedString::formatted("Expected {}", description)); } } rollback.disarm(); return m_source.substring_view(start, m_lexer.tell() - start); } // 2.8.22. 
Prolog, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-prolog ErrorOr Parser::parse_prolog() { auto rollback = rollback_point(); auto rule = enter_rule(); // prolog ::= XMLDecl Misc* (doctypedecl Misc*)? // The following is valid in XML 1.0. // prolog ::= XMLDecl? Misc* (doctypedecl Misc*)? if (auto result = parse_xml_decl(); result.is_error()) { m_version = Version::Version10; m_in_compatibility_mode = true; } auto accept = accept_rule(); while (true) { if (auto result = parse_misc(); result.is_error()) break; } if (auto result = parse_doctype_decl(); !result.is_error()) { while (true) { if (auto result = parse_misc(); result.is_error()) break; } } rollback.disarm(); return {}; } // 2.8.23. XMLDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-XMLDecl ErrorOr Parser::parse_xml_decl() { auto rollback = rollback_point(); auto rule = enter_rule(); // XMLDecl::= '' TRY(expect(""sv)); rollback.disarm(); return {}; } // 2.8.24. VersionInfo, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-VersionInfo ErrorOr Parser::parse_version_info() { auto rollback = rollback_point(); auto rule = enter_rule(); // VersionInfo ::= S 'version' Eq ("'" VersionNum "'" | '"' VersionNum '"') TRY(skip_whitespace(Required::Yes)); TRY(expect("version"sv)); auto accept = accept_rule(); TRY(parse_eq()); TRY(expect(is_any_of("'\""sv), "one of ' or \""sv)); m_lexer.retreat(); auto version_string = m_lexer.consume_quoted_string(); if (version_string == "1.0") { // FIXME: Compatibility mode, figure out which rules are different in XML 1.0. m_version = Version::Version10; m_in_compatibility_mode = true; } else { if (version_string != "1.1" && m_options.treat_errors_as_fatal) return parse_error(m_lexer.tell(), DeprecatedString::formatted("Expected '1.1', found '{}'", version_string)); } m_version = Version::Version11; rollback.disarm(); return {}; } // 2.8.25. 
Eq, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Eq ErrorOr Parser::parse_eq() { auto rollback = rollback_point(); auto rule = enter_rule(); // Eq ::= S? '=' S? auto accept = accept_rule(); TRY(skip_whitespace()); TRY(expect("="sv)); TRY(skip_whitespace()); rollback.disarm(); return {}; } // 4.3.3.80. EncodingDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-EncodingDecl ErrorOr Parser::parse_encoding_decl() { auto rollback = rollback_point(); auto rule = enter_rule(); // EncodingDecl ::= S 'encoding' Eq ('"' EncName '"' | "'" EncName "'" ) TRY(skip_whitespace(Required::Yes)); TRY(expect("encoding"sv)); auto accept = accept_rule(); TRY(parse_eq()); TRY(expect(is_any_of("'\""sv), "one of ' or \""sv)); m_lexer.retreat(); // FIXME: Actually do something with this encoding. m_encoding = m_lexer.consume_quoted_string(); rollback.disarm(); return {}; } // 2.9.32 SDDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#sec-rmd ErrorOr Parser::parse_standalone_document_decl() { auto rollback = rollback_point(); auto rule = enter_rule(); // SDDecl ::= S 'standalone' Eq (("'" ('yes' | 'no') "'") | ('"' ('yes' | 'no') '"')) TRY(skip_whitespace(Required::Yes)); TRY(expect("standalone"sv)); auto accept = accept_rule(); TRY(expect(is_any_of("'\""sv), "one of ' or \""sv)); m_lexer.retreat(); auto value = m_lexer.consume_quoted_string(); if (!value.is_one_of("yes", "no")) return parse_error(m_lexer.tell() - value.length(), "Expected one of 'yes' or 'no'"); m_standalone = value == "yes"; rollback.disarm(); return {}; } // 2.8.27. 
Misc, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Misc ErrorOr Parser::parse_misc() { auto rollback = rollback_point(); auto rule = enter_rule(); // Misc ::= Comment | PI | S if (auto result = parse_comment(); !result.is_error()) { rollback.disarm(); return {}; } if (auto result = parse_processing_instruction(); !result.is_error()) { rollback.disarm(); return {}; } if (auto result = skip_whitespace(Required::Yes); !result.is_error()) { rollback.disarm(); return {}; } return parse_error(m_lexer.tell(), "Expected a match for 'Misc', but found none"); } // 2.5.15 Comment, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Comment ErrorOr Parser::parse_comment() { auto rollback = rollback_point(); auto rule = enter_rule(); // Comment ::= '' TRY(expect(""sv)); if (m_options.preserve_comments) append_comment(text); rollback.disarm(); return {}; } // 2.6.16 PI, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PI ErrorOr Parser::parse_processing_instruction() { auto rollback = rollback_point(); auto rule = enter_rule(); // PI ::= '' Char*)))? '?>' TRY(expect(""); TRY(expect("?>"sv)); m_processing_instructions.set(target, data); rollback.disarm(); return {}; } // 2.6.17. 
PITarget, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PITarget ErrorOr Parser::parse_processing_instruction_target() { auto rollback = rollback_point(); auto rule = enter_rule(); // PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l')) auto target = TRY(parse_name()); auto accept = accept_rule(); if (target.equals_ignoring_case("xml"sv) && m_options.treat_errors_as_fatal) { return parse_error( m_lexer.tell() - target.length(), "Use of the reserved 'xml' name for processing instruction target name is disallowed"); } rollback.disarm(); return target; } // NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF] constexpr static auto s_name_start_characters = ranges_for_search {}; // NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040] constexpr static auto s_name_characters = s_name_start_characters.with(); // 2.3.5. Name, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Name ErrorOr Parser::parse_name() { auto rollback = rollback_point(); auto rule = enter_rule(); // Name ::= NameStartChar (NameChar)* auto start = TRY(expect(s_name_start_characters, "a NameStartChar"sv)); auto accept = accept_rule(); auto rest = m_lexer.consume_while(s_name_characters); StringBuilder builder; builder.append(start); builder.append(rest); rollback.disarm(); return builder.to_deprecated_string(); } // 2.8.28. 
doctypedecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-doctypedecl ErrorOr Parser::parse_doctype_decl() { auto rollback = rollback_point(); auto rule = enter_rule(); Doctype doctype; // doctypedecl ::= '' TRY(expect("system_id, doctype.external_id->public_id); if (resource_result.is_error()) { return parse_error( id_start, DeprecatedString::formatted("Failed to resolve external subset '{}': {}", doctype.external_id->system_id.system_literal, resource_result.error())); } StringView resolved_source = resource_result.value(); TemporaryChange source { m_source, resolved_source }; TemporaryChange lexer { m_lexer, GenericLexer(m_source) }; auto declarations = TRY(parse_external_subset()); if (!m_lexer.is_eof()) { return parse_error( m_lexer.tell(), DeprecatedString::formatted("Failed to resolve external subset '{}': garbage after declarations", doctype.external_id->system_id.system_literal)); } doctype.markup_declarations.extend(move(declarations)); } } } TRY(skip_whitespace(Required::No)); if (m_lexer.consume_specific('[')) { auto internal_subset = TRY(parse_internal_subset()); TRY(expect("]"sv)); TRY(skip_whitespace()); doctype.markup_declarations.extend(internal_subset); } TRY(expect(">"sv)); rollback.disarm(); m_doctype = move(doctype); return {}; } // 3.39. 
element, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-element ErrorOr Parser::parse_element() { auto rollback = rollback_point(); auto rule = enter_rule(); // element ::= EmptyElemTag // | STag content ETag if (auto result = parse_empty_element_tag(); !result.is_error()) { append_node(result.release_value()); rollback.disarm(); return {}; } auto start_tag = TRY(parse_start_tag()); auto& node = *start_tag; auto& tag = node.content.get(); append_node(move(start_tag)); enter_node(node); ScopeGuard quit { [&] { leave_node(); } }; TRY(parse_content()); auto tag_location = m_lexer.tell(); auto closing_name = TRY(parse_end_tag()); // Well-formedness constraint: The Name in an element's end-tag MUST match the element type in the start-tag. if (m_options.treat_errors_as_fatal && closing_name != tag.name) return parse_error(tag_location, "Invalid closing tag"); rollback.disarm(); return {}; } // 3.1.44. EmptyElemTag, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-EmptyElemTag ErrorOr, ParseError> Parser::parse_empty_element_tag() { auto rollback = rollback_point(); auto rule = enter_rule(); // EmptyElemTag ::= '<' Name (S Attribute)* S? '/>' TRY(expect("<"sv)); auto accept = accept_rule(); auto name = TRY(parse_name()); HashMap attributes; while (true) { if (auto result = skip_whitespace(Required::Yes); result.is_error()) break; if (auto result = parse_attribute(); !result.is_error()) { auto attribute = result.release_value(); attributes.set(move(attribute.name), move(attribute.value)); } else { break; } } TRY(skip_whitespace()); TRY(expect("/>"sv)); rollback.disarm(); return make(Node::Element { move(name), move(attributes), {} }); } // 3.1.41. 
Attribute, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Attribute ErrorOr Parser::parse_attribute() { auto rollback = rollback_point(); auto rule = enter_rule(); // Attribute ::= Name Eq AttValue auto name = TRY(parse_name()); auto accept = accept_rule(); TRY(parse_eq()); auto value = TRY(parse_attribute_value()); rollback.disarm(); return Attribute { move(name), move(value), }; } // 2.3.10. AttValue, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-AttValue ErrorOr Parser::parse_attribute_value() { auto rollback = rollback_point(); auto rule = enter_rule(); // AttValue ::= '"' ([^<&"] | Reference)* '"' // | "'" ([^<&'] | Reference)* "'" auto quote = TRY(expect(is_any_of("'\""sv), "one of ' or \""sv)); auto accept = accept_rule(); auto text = TRY(parse_attribute_value_inner(quote)); TRY(expect(quote)); rollback.disarm(); return text; } ErrorOr Parser::parse_attribute_value_inner(StringView disallow) { StringBuilder builder; while (true) { if (m_lexer.next_is(is_any_of(disallow)) || m_lexer.is_eof()) break; if (m_lexer.next_is('<')) { // Not allowed, return a nice error to make it easier to debug. return parse_error(m_lexer.tell(), "Unescaped '<' not allowed in attribute values"); } if (m_lexer.next_is('&')) { auto reference = TRY(parse_reference()); if (auto* char_reference = reference.get_pointer()) builder.append(*char_reference); else builder.append(TRY(resolve_reference(reference.get(), ReferencePlacement::AttributeValue))); } else { builder.append(m_lexer.consume()); } } return builder.to_deprecated_string(); } // Char ::= [#x1-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] constexpr static auto s_characters = ranges_for_search(); // 4.1.67. Reference, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Reference ErrorOr, ParseError> Parser::parse_reference() { auto rollback = rollback_point(); auto rule = enter_rule(); // Reference ::= EntityRef | CharRef // 4.1.68. EntityRef // EntityRef ::= '&' Name ';' // 4.1.66. 
CharRef // CharRef ::= '&#' [0-9]+ ';' // | '&#x' [0-9a-fA-F]+ ';' auto reference_start = m_lexer.tell(); TRY(expect("&"sv)); auto accept = accept_rule(); auto name_result = parse_name(); if (name_result.is_error()) { TRY(expect("#"sv)); Optional code_point; if (m_lexer.consume_specific('x')) { auto hex = TRY(expect_many( ranges_for_search(), "any of [0-9a-fA-F]"sv)); code_point = AK::StringUtils::convert_to_uint_from_hex(hex); } else { auto decimal = TRY(expect_many( ranges_for_search(), "any of [0-9]"sv)); code_point = decimal.to_uint(); } if (!code_point.has_value() || !s_characters.contains(*code_point)) return parse_error(reference_start, "Invalid character reference"); TRY(expect(";"sv)); StringBuilder builder; builder.append_code_point(*code_point); rollback.disarm(); return builder.to_deprecated_string(); } auto name = name_result.release_value(); TRY(expect(";"sv)); rollback.disarm(); return EntityReference { move(name) }; } // 3.1.40 STag, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-STag ErrorOr, ParseError> Parser::parse_start_tag() { auto rollback = rollback_point(); auto rule = enter_rule(); // STag ::= '<' Name (S Attribute)* S? 
'>' TRY(expect("<"sv)); auto accept = accept_rule(); auto name = TRY(parse_name()); HashMap attributes; while (true) { if (auto result = skip_whitespace(Required::Yes); result.is_error()) break; if (auto result = parse_attribute(); !result.is_error()) { auto attribute = result.release_value(); attributes.set(move(attribute.name), move(attribute.value)); } else { break; } } TRY(skip_whitespace()); TRY(expect(">"sv)); rollback.disarm(); return make(Node::Element { move(name), move(attributes), {} }); } // 3.1.42 ETag, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-ETag ErrorOr Parser::parse_end_tag() { auto rollback = rollback_point(); auto rule = enter_rule(); // ETag ::= '' TRY(expect(""sv)); rollback.disarm(); return name; } // 3.1.42 content, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-content ErrorOr Parser::parse_content() { auto rollback = rollback_point(); auto rule = enter_rule(); // content ::= CharData? ((element | Reference | CDSect | PI | Comment) CharData?)* if (auto result = parse_char_data(); !result.is_error()) append_text(result.release_value()); while (true) { if (auto result = parse_element(); !result.is_error()) goto try_char_data; if (auto result = parse_reference(); !result.is_error()) { auto reference = result.release_value(); if (auto char_reference = reference.get_pointer()) append_text(*char_reference); else TRY(resolve_reference(reference.get(), ReferencePlacement::Content)); goto try_char_data; } if (auto result = parse_cdata_section(); !result.is_error()) { if (m_options.preserve_cdata) append_text(result.release_value()); goto try_char_data; } if (auto result = parse_processing_instruction(); !result.is_error()) goto try_char_data; if (auto result = parse_comment(); !result.is_error()) goto try_char_data; break; try_char_data:; if (auto result = parse_char_data(); !result.is_error()) append_text(result.release_value()); } rollback.disarm(); return {}; } // 2.4.14 CharData, 
https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-CharData ErrorOr Parser::parse_char_data() { auto rollback = rollback_point(); auto rule = enter_rule(); // CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*) auto cend_state = 0; // 1: ], 2: ], 3: > auto text = m_lexer.consume_while([&](auto ch) { if (ch == '<' || ch == '&' || cend_state == 3) return false; switch (cend_state) { case 0: case 1: if (ch == ']') cend_state++; else cend_state = 0; return true; case 2: if (ch == '>') { cend_state++; return true; } cend_state = 0; return true; default: VERIFY_NOT_REACHED(); } }); if (cend_state == 3) { m_lexer.retreat(3); text = text.substring_view(0, text.length() - 3); } rollback.disarm(); return text; } // 2.8.28b intSubset, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-intSubset ErrorOr, ParseError> Parser::parse_internal_subset() { auto rollback = rollback_point(); auto rule = enter_rule(); Vector declarations; // intSubset ::= (markupdecl | DeclSep)* while (true) { if (auto result = parse_markup_declaration(); !result.is_error()) { auto maybe_declaration = result.release_value(); if (maybe_declaration.has_value()) declarations.append(maybe_declaration.release_value()); continue; } if (auto result = parse_declaration_separator(); !result.is_error()) { // The markup declarations may be made up in whole or in part of the replacement text of parameter entities. // The replacement text of a parameter entity reference in a DeclSep MUST match the production extSubsetDecl. 
auto maybe_replacement_text = result.release_value(); if (maybe_replacement_text.has_value()) { TemporaryChange source { m_source, maybe_replacement_text.value() }; TemporaryChange lexer { m_lexer, GenericLexer { m_source } }; auto contained_declarations = TRY(parse_external_subset_declaration()); declarations.extend(move(contained_declarations)); } continue; } break; } rollback.disarm(); return declarations; } // 2.8.29 markupdecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-markupdecl ErrorOr, ParseError> Parser::parse_markup_declaration() { auto rollback = rollback_point(); auto rule = enter_rule(); // markupdecl ::= elementdecl | AttlistDecl | EntityDecl | NotationDecl | PI | Comment if (auto result = parse_element_declaration(); !result.is_error()) { rollback.disarm(); return MarkupDeclaration { result.release_value() }; } if (auto result = parse_attribute_list_declaration(); !result.is_error()) { rollback.disarm(); return MarkupDeclaration { result.release_value() }; } if (auto result = parse_entity_declaration(); !result.is_error()) { rollback.disarm(); return MarkupDeclaration { result.release_value() }; } if (auto result = parse_notation_declaration(); !result.is_error()) { rollback.disarm(); return MarkupDeclaration { result.release_value() }; } if (auto result = parse_processing_instruction(); !result.is_error()) { rollback.disarm(); return Optional {}; } if (auto result = parse_comment(); !result.is_error()) { rollback.disarm(); return Optional {}; } return parse_error(m_lexer.tell(), "Expected one of elementdecl, attlistdecl, entitydecl, notationdecl, PI or comment"); } // 2.8.28a DeclSep, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-DeclSep ErrorOr, ParseError> Parser::parse_declaration_separator() { auto rollback = rollback_point(); auto rule = enter_rule(); // DeclSep ::= PEReference | S if (auto name = parse_parameter_entity_reference(); !name.is_error()) { rollback.disarm(); // FIXME: Resolve this PEReference. 
return ""; } if (auto result = skip_whitespace(Required::Yes); !result.is_error()) { rollback.disarm(); return Optional {}; } return parse_error(m_lexer.tell(), "Expected either whitespace, or a PEReference"); } // 4.1.69 PEReference, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PEReference ErrorOr Parser::parse_parameter_entity_reference() { auto rollback = rollback_point(); auto rule = enter_rule(); // PEReference ::= '%' Name ';' TRY(expect("%"sv)); auto accept = accept_rule(); auto name = TRY(parse_name()); TRY(expect(";"sv)); rollback.disarm(); return name; } // 3.2.46 elementdecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-elementdecl ErrorOr Parser::parse_element_declaration() { auto rollback = rollback_point(); auto rule = enter_rule(); // FIXME: Apparently both name _and_ contentspec here are allowed to be PEReferences, // but the grammar does not allow that, figure this out. // elementdecl ::= '' TRY(expect(""sv)); rollback.disarm(); return ElementDeclaration { move(name), move(spec), }; } // 3.3.52 AttlistDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-AttlistDecl ErrorOr Parser::parse_attribute_list_declaration() { auto rollback = rollback_point(); auto rule = enter_rule(); AttributeListDeclaration declaration; // AttlistDecl ::= '' TRY(expect(""sv)); rollback.disarm(); return declaration; } // 3.3.53 AttDef, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-AttDef ErrorOr Parser::parse_attribute_definition() { auto rollback = rollback_point(); auto rule = enter_rule(); Optional type; Optional default_; // AttDef ::= S Name S AttType S DefaultDecl TRY(skip_whitespace(Required::Yes)); auto name = TRY(parse_name()); auto accept = accept_rule(); TRY(skip_whitespace(Required::Yes)); // AttType ::= StringType | TokenizedType | EnumeratedType // StringType ::= 'CDATA' // TokenizedType ::= 'ID' // | 'IDREF' // | 'IDREFS' // | 'ENTITY' // | 'ENTITIES' // | 'NMTOKEN' // | 'NMTOKENS' // EnumeratedType ::= NotationType | Enumeration // 
NotationType ::= 'NOTATION' S '(' S? Name (S? '|' S? Name)* S? ')' // Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')' if (m_lexer.consume_specific("CDATA")) { type = AttributeListDeclaration::StringType::CData; } else if (m_lexer.consume_specific("IDREFS")) { type = AttributeListDeclaration::TokenizedType::IDRefs; } else if (m_lexer.consume_specific("IDREF")) { type = AttributeListDeclaration::TokenizedType::IDRef; } else if (m_lexer.consume_specific("ID")) { type = AttributeListDeclaration::TokenizedType::ID; } else if (m_lexer.consume_specific("ENTITIES")) { type = AttributeListDeclaration::TokenizedType::Entities; } else if (m_lexer.consume_specific("ENTITY")) { type = AttributeListDeclaration::TokenizedType::Entity; } else if (m_lexer.consume_specific("NMTOKENS")) { type = AttributeListDeclaration::TokenizedType::NMTokens; } else if (m_lexer.consume_specific("NMTOKEN")) { type = AttributeListDeclaration::TokenizedType::NMToken; } else if (m_lexer.consume_specific("NOTATION")) { HashTable names; TRY(skip_whitespace(Required::Yes)); TRY(expect("("sv)); TRY(skip_whitespace()); names.set(TRY(parse_name())); while (true) { TRY(skip_whitespace()); if (auto result = expect("|"sv); result.is_error()) break; TRY(skip_whitespace()); names.set(TRY(parse_name())); } TRY(skip_whitespace()); TRY(expect(")"sv)); type = AttributeListDeclaration::NotationType { move(names) }; } else { HashTable names; TRY(expect("("sv)); TRY(skip_whitespace()); names.set(TRY(parse_nm_token())); while (true) { TRY(skip_whitespace()); if (auto result = expect("|"sv); result.is_error()) break; TRY(skip_whitespace()); names.set(TRY(parse_nm_token())); } TRY(skip_whitespace()); TRY(expect(")"sv)); type = AttributeListDeclaration::Enumeration { move(names) }; } TRY(skip_whitespace(Required::Yes)); // DefaultDecl ::= '#REQUIRED' | '#IMPLIED' // | (('#FIXED' S)? 
AttValue) if (m_lexer.consume_specific("#REQUIRED")) { default_ = AttributeListDeclaration::Required {}; } else if (m_lexer.consume_specific("#IMPLIED")) { default_ = AttributeListDeclaration::Implied {}; } else { bool fixed = false; if (m_lexer.consume_specific("#FIXED")) { TRY(skip_whitespace(Required::Yes)); fixed = true; } auto value = TRY(parse_attribute_value()); if (fixed) default_ = AttributeListDeclaration::Fixed { move(value) }; else default_ = AttributeListDeclaration::DefaultValue { move(value) }; } rollback.disarm(); return AttributeListDeclaration::Definition { move(name), type.release_value(), default_.release_value(), }; } // 2.3.7 Nmtoken, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-Nmtoken ErrorOr Parser::parse_nm_token() { auto rollback = rollback_point(); auto rule = enter_rule(); // Nmtoken ::= (NameChar)+ auto token = TRY(expect_many(s_name_characters, "a NameChar"sv)); rollback.disarm(); return token; } // 4.7.82 NotationDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#Notations ErrorOr Parser::parse_notation_declaration() { auto rollback = rollback_point(); auto rule = enter_rule(); Variant notation; // NotationDecl ::= '' TRY(expect(""sv)); rollback.disarm(); return NotationDeclaration { move(name), move(notation).downcast(), }; } // 3.2.46 contentspec, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-contentspec ErrorOr Parser::parse_content_spec() { auto rollback = rollback_point(); auto rule = enter_rule(); Optional content_spec; // contentspec ::= 'EMPTY' | 'ANY' | Mixed | children if (m_lexer.consume_specific("EMPTY")) { content_spec = ElementDeclaration::Empty {}; } else if (m_lexer.consume_specific("ANY")) { content_spec = ElementDeclaration::Any {}; } else { TRY(expect("("sv)); TRY(skip_whitespace()); if (m_lexer.consume_specific("#PCDATA")) { HashTable names; // Mixed ::= '(' S? '#PCDATA' (S? '|' S? Name)* S? ')*' // | '(' S? '#PCDATA' S? 
')' TRY(skip_whitespace()); if (m_lexer.consume_specific(")*")) { content_spec = ElementDeclaration::Mixed { .types = {}, .many = true }; } else if (m_lexer.consume_specific(')')) { content_spec = ElementDeclaration::Mixed { .types = {}, .many = false }; } else { while (true) { TRY(skip_whitespace()); if (!m_lexer.consume_specific('|')) break; TRY(skip_whitespace()); if (auto result = parse_name(); !result.is_error()) names.set(result.release_value()); else return parse_error(m_lexer.tell(), "Expected a Name"); } TRY(skip_whitespace()); TRY(expect(")*"sv)); content_spec = ElementDeclaration::Mixed { .types = move(names), .many = true }; } } else { while (!m_lexer.next_is('(')) m_lexer.retreat(); // children ::= (choice | seq) ('?' | '*' | '+')? // cp ::= (Name | choice | seq) ('?' | '*' | '+')? // choice ::= '(' S? cp ( S? '|' S? cp )+ S? ')' // seq ::= '(' S? cp ( S? ',' S? cp )* S? ')' Function()> parse_choice; Function()> parse_sequence; auto parse_cp_init = [&]() -> ErrorOr, ParseError> { if (auto result = parse_name(); !result.is_error()) return result.release_value(); if (auto result = parse_choice(); !result.is_error()) return result.release_value(); return TRY(parse_sequence()); }; auto parse_qualifier = [&]() -> ElementDeclaration::Children::Qualifier { ElementDeclaration::Children::Qualifier qualifier { ElementDeclaration::Children::Qualifier::ExactlyOnce }; if (m_lexer.consume_specific('?')) qualifier = ElementDeclaration::Children::Qualifier::Optional; else if (m_lexer.consume_specific('*')) qualifier = ElementDeclaration::Children::Qualifier::Any; else if (m_lexer.consume_specific('+')) qualifier = ElementDeclaration::Children::Qualifier::OneOrMore; return qualifier; }; auto parse_cp = [&]() -> ErrorOr { auto sub_entry = TRY(parse_cp_init()); auto qualifier = parse_qualifier(); return ElementDeclaration::Children::Entry { move(sub_entry), qualifier, }; }; parse_choice = [&]() -> ErrorOr { auto rollback = rollback_point(); auto rule = enter_rule(); 
TRY(expect("("sv)); auto accept = accept_rule(); TRY(skip_whitespace()); Vector choices; choices.append(TRY(parse_cp())); while (true) { TRY(skip_whitespace()); if (!m_lexer.consume_specific('|')) break; TRY(skip_whitespace()); choices.append(TRY(parse_cp())); } TRY(expect(")"sv)); if (choices.size() < 2) return parse_error(m_lexer.tell(), "Expected more than one choice"); TRY(skip_whitespace()); auto qualifier = parse_qualifier(); rollback.disarm(); return ElementDeclaration::Children::Choice { move(choices), qualifier, }; }; parse_sequence = [&]() -> ErrorOr { auto rollback = rollback_point(); auto rule = enter_rule(); TRY(expect("("sv)); auto accept = accept_rule(); TRY(skip_whitespace()); Vector entries; entries.append(TRY(parse_cp())); while (true) { TRY(skip_whitespace()); if (!m_lexer.consume_specific(',')) break; TRY(skip_whitespace()); entries.append(TRY(parse_cp())); } TRY(expect(")"sv)); TRY(skip_whitespace()); auto qualifier = parse_qualifier(); rollback.disarm(); return ElementDeclaration::Children::Sequence { move(entries), qualifier, }; }; if (auto result = parse_choice(); !result.is_error()) { auto qualifier = parse_qualifier(); content_spec = ElementDeclaration::Children { result.release_value(), qualifier, }; } else { auto sequence = TRY(parse_sequence()); auto qualifier = parse_qualifier(); content_spec = ElementDeclaration::Children { move(sequence), qualifier, }; } } } rollback.disarm(); return content_spec.release_value(); } // 2.8.31 extSubsetDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-extSubsetDecl ErrorOr, ParseError> Parser::parse_external_subset_declaration() { auto rollback = rollback_point(); auto rule = enter_rule(); Vector declarations; // extSubsetDecl ::= ( markupdecl | conditionalSect | DeclSep )* while (true) { if (auto result = parse_markup_declaration(); !result.is_error()) { if (result.value().has_value()) declarations.append(result.release_value().release_value()); continue; } // FIXME: conditionalSect if (auto 
result = parse_declaration_separator(); !result.is_error()) continue; break; } rollback.disarm(); return declarations; } // 4.2.70 EntityDecl, https://www.w3.org/TR/xml/#NT-EntityDecl ErrorOr Parser::parse_entity_declaration() { // EntityDecl ::= GEDecl | PEDecl if (auto result = parse_general_entity_declaration(); !result.is_error()) return result; return parse_parameter_entity_declaration(); } // 4.2.71 GEDecl, https://www.w3.org/TR/xml/#NT-GEDecl ErrorOr Parser::parse_general_entity_declaration() { auto rollback = rollback_point(); auto rule = enter_rule(); Variant definition; // GEDecl ::= '' TRY(expect(" notation; if (auto notation_result = parse_notation_data_declaration(); !notation_result.is_error()) notation = notation_result.release_value(); definition = EntityDefinition { move(external_id), move(notation), }; } TRY(skip_whitespace()); TRY(expect(">"sv)); rollback.disarm(); return GEDeclaration { move(name), move(definition).downcast(), }; } // 4.2.72 PEDecl, https://www.w3.org/TR/xml/#NT-PEDecl ErrorOr Parser::parse_parameter_entity_declaration() { auto rollback = rollback_point(); auto rule = enter_rule(); Variant definition; // PEDecl ::= '' TRY(expect(""sv)); rollback.disarm(); return PEDeclaration { move(name), move(definition).downcast(), }; } // 4.7.83 PublicID, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PublicID ErrorOr Parser::parse_public_id() { auto rollback = rollback_point(); auto rule = enter_rule(); // PublicID ::= 'PUBLIC' S PubidLiteral TRY(expect("PUBLIC"sv)); auto accept = accept_rule(); TRY(skip_whitespace(Required::Yes)); auto text = TRY(parse_public_id_literal()); rollback.disarm(); return PublicID { text, }; } constexpr static auto s_public_id_characters = set_to_search().unify(ranges_for_search()); // 2.3.12, PubidLiteral, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-PubidLiteral ErrorOr Parser::parse_public_id_literal() { auto rollback = rollback_point(); auto rule = enter_rule(); // PubidLiteral ::= '"' PubidChar* '"' 
| "'" (PubidChar - "'")* "'" auto quote = TRY(expect(is_any_of("'\""sv), "any of ' or \""sv)); auto accept = accept_rule(); auto id = TRY(expect_many( [q = quote[0]](auto x) { return (q == '\'' ? x != '\'' : true) && s_public_id_characters.contains(x); }, "a PubidChar"sv)); TRY(expect(quote)); rollback.disarm(); return id; } // 2.3.11 SystemLiteral, https://www.w3.org/TR/xml/#NT-SystemLiteral ErrorOr Parser::parse_system_id_literal() { auto rollback = rollback_point(); auto rule = enter_rule(); // SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'") auto quote = TRY(expect(is_any_of("'\""sv), "any of ' or \""sv)); auto accept = accept_rule(); auto id = TRY(expect_many(is_not_any_of(quote), "not a quote"sv)); TRY(expect(quote)); rollback.disarm(); return id; } // 4.2.75 ExternalID, https://www.w3.org/TR/xml/#NT-ExternalID ErrorOr Parser::parse_external_id() { auto rollback = rollback_point(); auto rule = enter_rule(); // ExternalID ::= 'SYSTEM' S SystemLiteral // | 'PUBLIC' S PubidLiteral S SystemLiteral Optional public_id; SystemID system_id; if (m_lexer.consume_specific("SYSTEM")) { auto accept = accept_rule(); TRY(skip_whitespace(Required::Yes)); system_id = SystemID { TRY(parse_system_id_literal()) }; } else { TRY(expect("PUBLIC"sv)); auto accept = accept_rule(); TRY(skip_whitespace(Required::Yes)); public_id = PublicID { TRY(parse_public_id_literal()) }; TRY(skip_whitespace(Required::Yes)); system_id = SystemID { TRY(parse_system_id_literal()) }; } rollback.disarm(); return ExternalID { move(public_id), move(system_id), }; } // 4.2.2.76 NDataDecl, https://www.w3.org/TR/xml/#NT-NDataDecl ErrorOr Parser::parse_notation_data_declaration() { auto rollback = rollback_point(); auto rule = enter_rule(); // NDataDecl ::= S 'NDATA' S Name TRY(skip_whitespace(Required::Yes)); auto accept = accept_rule(); TRY(expect("NDATA"sv)); TRY(skip_whitespace(Required::Yes)); auto name = TRY(parse_name()); rollback.disarm(); return name; } // 2.3.9 EntityValue, 
// https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-EntityValue
//
// Parses a quoted EntityValue literal and returns the text found between the quotes.
// Parameter-entity references and general references inside the literal are validated by
// parsing them, but what gets stored is the raw source text they occupied.
//
// NOTE(review): this part of the file appears to have passed through a markup stripper: every
// `<...>` span is missing. Here that affects the template arguments of the `ErrorOr` return
// type; restore them from version control rather than by hand.
ErrorOr Parser::parse_entity_value()
{
    auto rollback = rollback_point();
    auto rule = enter_rule();

    StringBuilder builder;

    // EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"'
    //               | "'" ([^%&'] | PEReference | Reference)* "'"
    // The opening quote is remembered so that only the same character can close the literal.
    auto quote = TRY(expect(is_any_of("'\""sv), "any of ' or \""sv));
    auto accept = accept_rule();

    while (true) {
        // Running out of input ends the scan; the closing-quote expect() below is then
        // attempted with nothing left to consume.
        if (m_lexer.is_eof())
            break;

        // The matching quote terminates the literal (it is consumed after the loop).
        if (m_lexer.next_is(quote))
            break;

        if (m_lexer.next_is('%')) {
            // PEReference: the parse result is discarded; the source slice it covered is
            // appended verbatim.
            auto start = m_lexer.tell();
            TRY(parse_parameter_entity_reference());
            builder.append(m_source.substring_view(start, m_lexer.tell() - start));
            continue;
        }

        if (m_lexer.next_is('&')) {
            // Reference: handled the same way as PEReference above.
            auto start = m_lexer.tell();
            TRY(parse_reference());
            builder.append(m_source.substring_view(start, m_lexer.tell() - start));
            continue;
        }

        // Any other character is literal content.
        builder.append(m_lexer.consume());
    }
    TRY(expect(quote));

    // Only reached on success; every TRY failure above returns before this line.
    rollback.disarm();
    return builder.to_deprecated_string();
}

// 2.7.18 CDSect, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-CDSect
//
// Parses a CDATA section and returns a view into m_source covering the section's content.
//
// NOTE(review): the body below is damaged. `section_start` is used in the return statement
// but never declared in the surviving text, and the braces following `expect("")` contain a
// `break` with no enclosing loop header. Presumably everything from the opening-delimiter
// literal up to the `]]>` test of the scanning loop was removed by markup stripping; the
// template arguments of `ErrorOr` are missing as well. Restore from version control.
ErrorOr Parser::parse_cdata_section()
{
    auto rollback = rollback_point();
    auto rule = enter_rule();

    // CDSect  ::= CDStart CData CDEnd
    // CDStart ::= '<![CDATA['
    // CData   ::= (Char* - (Char* ']]>' Char*))
    // CDEnd   ::= ']]>'
    TRY(expect("")) {
        // Skips one character at a time, giving up at end of input.
        if (m_lexer.is_eof())
            break;
        m_lexer.ignore();
    }
    // Position just before the closing delimiter, i.e. the end of the content.
    auto section_end = m_lexer.tell();
    TRY(expect("]]>"sv));

    rollback.disarm();
    return m_source.substring_view(section_start, section_end - section_start);
}

// 2.8.30 extSubset, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-extSubset
//
// Parses an external DTD subset: an optional text declaration followed by the subset's
// declarations, which are returned.
//
// NOTE(review): the return type reads `ErrorOr, ParseError>`; its first template argument
// and the opening '<' have been lost (markup stripping).
ErrorOr, ParseError> Parser::parse_external_subset()
{
    auto rollback = rollback_point();
    auto rule = enter_rule();

    // extSubset ::= TextDecl? extSubsetDecl
    // TextDecl is optional, so a failure to parse it is deliberately ignored.
    (void)parse_text_declaration();
    auto result = TRY(parse_external_subset_declaration());

    rollback.disarm();
    return result;
}

// 4.3.1.77 TextDecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-TextDecl
//
// Parses the text declaration that may open an external parsed entity.
//
// NOTE(review): the body below is damaged. As written it only expects an empty string and
// never handles VersionInfo, EncodingDecl or the '?>' terminator that the production
// requires. Presumably the code between the opening-delimiter expect() and the closing one
// was removed by markup stripping, along with the template arguments of `ErrorOr`.
// Restore from version control.
ErrorOr Parser::parse_text_declaration()
{
    auto rollback = rollback_point();
    auto rule = enter_rule();

    // TextDecl ::= '<?xml' VersionInfo? EncodingDecl S? '?>'
    TRY(expect(""sv));

    rollback.disarm();
    return {};
}

// Resolves a general entity reference.
//
// The DOCTYPE's markup declarations are searched for a general-entity declaration whose name
// equals `reference.name`. If none is found, the names amp, lt, gt, apos and quot yield their
// literal characters and any other name is an error. A resolved replacement text is then
// re-parsed in place of the current input according to `placement`:
//   - AttributeValue: returns the result of parse_attribute_value_inner().
//   - Content: runs parse_content() on the replacement text and returns an empty string.
//
// NOTE(review): template arguments are missing throughout this function (`ErrorOr`,
// `HashTable`, `Optional`, both `get_pointer()` calls, the lambdas' `ErrorOr` return types);
// they appear to have been removed by markup stripping.
ErrorOr Parser::resolve_reference(EntityReference const& reference, ReferencePlacement placement)
{
    // Names whose resolution is currently in progress; meeting one again means the entity
    // refers to itself, directly or indirectly.
    // NOTE(review): as a function-local static this set is shared by every Parser instance,
    // and nothing in this block synchronizes access to it.
    static HashTable reference_lookup {};
    if (reference_lookup.contains(reference.name))
        return parse_error(m_lexer.tell(), DeprecatedString::formatted("Invalid recursive definition for '{}'", reference.name));

    reference_lookup.set(reference.name);
    // The guard's callback takes the name out of the set again (presumably when this scope is
    // left, on every return path).
    ScopeGuard remove_lookup {
        [&] {
            reference_lookup.remove(reference.name);
        }
    };

    Optional resolved;
    if (m_doctype.has_value()) {
        // FIXME: Split these up and resolve them ahead of time.
        for (auto& declaration : m_doctype->markup_declarations) {
            // Skip declarations for which either get_pointer() yields null or whose name
            // differs from the one being resolved.
            auto entity = declaration.get_pointer();
            if (!entity)
                continue;
            auto ge_declaration = entity->get_pointer();
            if (!ge_declaration)
                continue;
            if (ge_declaration->name != reference.name)
                continue;
            TRY(ge_declaration->definition.visit(
                // Definition stored as a string: it is the replacement text.
                [&](DeprecatedString const& definition) -> ErrorOr {
                    resolved = definition;
                    return {};
                },
                // Definition stored as an EntityDefinition: fetched through the resolver
                // callback in m_options, after the checks below.
                // NOTE(review): three of the errors here pass offset 0u, while the first one
                // passes m_lexer.tell(); confirm whether that difference is intended.
                [&](EntityDefinition const& definition) -> ErrorOr {
                    if (placement == ReferencePlacement::AttributeValue)
                        return parse_error(m_lexer.tell(), DeprecatedString::formatted("Attribute references external entity '{}'", reference.name));

                    if (definition.notation.has_value())
                        return parse_error(0u, DeprecatedString::formatted("Entity reference to unparsed entity '{}'", reference.name));

                    // No resolver callback configured.
                    if (!m_options.resolve_external_resource)
                        return parse_error(0u, DeprecatedString::formatted("Failed to resolve external entity '{}'", reference.name));

                    auto result = m_options.resolve_external_resource(definition.id.system_id, definition.id.public_id);
                    if (result.is_error())
                        return parse_error(0u, DeprecatedString::formatted("Failed to resolve external entity '{}': {}", reference.name, result.error()));

                    resolved = result.release_value();
                    return {};
                }));
            // Only the first declaration with a matching name is considered.
            break;
        }
    }

    if (!resolved.has_value()) {
        // Fallback for names without a matching declaration.
        if (reference.name == "amp")
            return "&";
        if (reference.name == "lt")
            return "<";
        if (reference.name == "gt")
            return ">";
        if (reference.name == "apos")
            return "'";
        if (reference.name == "quot")
            return "\"";
        return parse_error(0u, DeprecatedString::formatted("Reference to undeclared entity '{}'", reference.name));
    }

    // Re-parse the replacement text as if it were the input. `lexer` is declared after
    // `source` so that GenericLexer(m_source) is built from the swapped-in text (assuming
    // TemporaryChange assigns on construction and restores the old value on scope exit).
    StringView resolved_source = *resolved;
    TemporaryChange source { m_source, resolved_source };
    TemporaryChange lexer { m_lexer, GenericLexer(m_source) };
    switch (placement) {
    case ReferencePlacement::AttributeValue:
        return TRY(parse_attribute_value_inner(""sv));
    case ReferencePlacement::Content:
        // Nothing is returned as text here; presumably parse_content() delivers what it
        // parses through the parser's own state.
        TRY(parse_content());
        return "";
    default:
        VERIFY_NOT_REACHED();
    }
}
} // namespace XML