diff options
author | Timothy Flynn <trflynn89@pm.me> | 2021-10-15 09:57:07 -0400 |
---|---|---|
committer | Linus Groh <mail@linusgroh.de> | 2021-10-17 13:51:10 +0100 |
commit | e01dfaac9ad06016aa935b70e998987b65700eec (patch) | |
tree | 343f2a6b01d713c3196227bcb7d485aadfa07fa4 /Userland/Libraries/LibWeb/HTML/Parser | |
parent | 8d27292fac28af04587be5e5330db525ef89816b (diff) | |
download | serenity-e01dfaac9ad06016aa935b70e998987b65700eec.zip |
LibWeb: Implement Attribute closer to the spec and with an IDL file
Note that our Attribute class is what the spec refers to as just "Attr". The
main differences between the existing implementation and the spec are
just that the spec defines more fields.
Attributes can contain namespace URIs and prefixes. However, note that
these are not parsed in HTML documents unless the document content-type
is XML. So for now, these are initialized to null. Web pages are able to
set the namespace via JavaScript (setAttributeNS), so these fields may
be filled in when the corresponding APIs are implemented.
The main change to be aware of is that an attribute is a node. This has
implications for how attributes are stored in the Element class. Nodes
are non-copyable and non-movable because their copy and move constructors
are deleted by the EventTarget base class. This means attributes cannot be stored in
a Vector or HashMap as these containers assume copyability / movability.
So for now, the Vector holding attributes is changed to hold RefPtrs to
attributes instead. This might change when attribute storage is
implemented according to the spec (by way of NamedNodeMap).
Diffstat (limited to 'Userland/Libraries/LibWeb/HTML/Parser')
3 files changed, 26 insertions, 24 deletions
diff --git a/Userland/Libraries/LibWeb/HTML/Parser/HTMLEncodingDetection.cpp b/Userland/Libraries/LibWeb/HTML/Parser/HTMLEncodingDetection.cpp index 460e0a3872..f0c5c524b1 100644 --- a/Userland/Libraries/LibWeb/HTML/Parser/HTMLEncodingDetection.cpp +++ b/Userland/Libraries/LibWeb/HTML/Parser/HTMLEncodingDetection.cpp @@ -9,6 +9,8 @@ #include <AK/StringView.h> #include <AK/Utf8View.h> #include <LibTextCodec/Decoder.h> +#include <LibWeb/DOM/Attribute.h> +#include <LibWeb/DOM/Document.h> #include <LibWeb/HTML/Parser/HTMLEncodingDetection.h> #include <ctype.h> @@ -94,7 +96,7 @@ Optional<String> extract_character_encoding_from_meta_element(String const& stri return TextCodec::get_standardized_encoding(encoding); } -Optional<DOM::Attribute> prescan_get_attribute(const ByteBuffer& input, size_t& position) +RefPtr<DOM::Attribute> prescan_get_attribute(DOM::Document& document, const ByteBuffer& input, size_t& position) { if (!prescan_skip_whitespace_and_slashes(input, position)) return {}; @@ -109,7 +111,7 @@ Optional<DOM::Attribute> prescan_get_attribute(const ByteBuffer& input, size_t& } else if (input[position] == '\t' || input[position] == '\n' || input[position] == '\f' || input[position] == '\r' || input[position] == ' ') goto spaces; else if (input[position] == '/' || input[position] == '>') - return DOM::Attribute(attribute_name.to_string(), ""); + return DOM::Attribute::create(document, attribute_name.to_string(), ""); else attribute_name.append_as_lowercase(input[position]); ++position; @@ -121,7 +123,7 @@ spaces: if (!prescan_skip_whitespace_and_slashes(input, position)) return {}; if (input[position] != '=') - return DOM::Attribute(attribute_name.to_string(), ""); + return DOM::Attribute::create(document, attribute_name.to_string(), ""); ++position; value: @@ -134,13 +136,13 @@ value: ++position; for (; !prescan_should_abort(input, position); ++position) { if (input[position] == quote_character) - return DOM::Attribute(attribute_name.to_string(), 
attribute_value.to_string()); + return DOM::Attribute::create(document, attribute_name.to_string(), attribute_value.to_string()); else attribute_value.append_as_lowercase(input[position]); } return {}; } else if (input[position] == '>') - return DOM::Attribute(attribute_name.to_string(), ""); + return DOM::Attribute::create(document, attribute_name.to_string(), ""); else attribute_value.append_as_lowercase(input[position]); @@ -150,7 +152,7 @@ value: for (; !prescan_should_abort(input, position); ++position) { if (input[position] == '\t' || input[position] == '\n' || input[position] == '\f' || input[position] == '\r' || input[position] == ' ' || input[position] == '>') - return DOM::Attribute(attribute_name.to_string(), attribute_value.to_string()); + return DOM::Attribute::create(document, attribute_name.to_string(), attribute_value.to_string()); else attribute_value.append_as_lowercase(input[position]); } @@ -158,7 +160,7 @@ value: } // https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding -Optional<String> run_prescan_byte_stream_algorithm(const ByteBuffer& input) +Optional<String> run_prescan_byte_stream_algorithm(DOM::Document& document, const ByteBuffer& input) { // https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding @@ -194,24 +196,24 @@ Optional<String> run_prescan_byte_stream_algorithm(const ByteBuffer& input) Optional<String> charset {}; while (true) { - auto attribute = prescan_get_attribute(input, position); - if (!attribute.has_value()) + auto attribute = prescan_get_attribute(document, input, position); + if (!attribute) break; - if (attribute_list.contains_slow(attribute.value().name())) + if (attribute_list.contains_slow(attribute->name())) continue; - auto& attribute_name = attribute.value().name(); - attribute_list.append(attribute.value().name()); + auto& attribute_name = attribute->name(); + attribute_list.append(attribute->name()); if (attribute_name 
== "http-equiv") { - got_pragma = attribute.value().value() == "content-type"; + got_pragma = attribute->value() == "content-type"; } else if (attribute_name == "content") { - auto encoding = extract_character_encoding_from_meta_element(attribute.value().value()); + auto encoding = extract_character_encoding_from_meta_element(attribute->value()); if (encoding.has_value() && !charset.has_value()) { charset = encoding.value(); need_pragma = true; } } else if (attribute_name == "charset") { - auto maybe_charset = TextCodec::get_standardized_encoding(attribute.value().value()); + auto maybe_charset = TextCodec::get_standardized_encoding(attribute->value()); if (maybe_charset.has_value()) { charset = Optional<String> { maybe_charset }; need_pragma = { false }; @@ -231,7 +233,7 @@ Optional<String> run_prescan_byte_stream_algorithm(const ByteBuffer& input) && ((input[position + 1] == '/' && isalpha(input[position + 2])) || isalpha(input[position + 1]))) { position += 2; prescan_skip_whitespace_and_slashes(input, position); - while (prescan_get_attribute(input, position).has_value()) { }; + while (prescan_get_attribute(document, input, position)) { }; } else if (!prescan_should_abort(input, position + 1) && input[position] == '<' && (input[position + 1] == '!' || input[position + 1] == '/' || input[position + 1] == '?')) { position += 2; while (input[position] != '>') { @@ -247,7 +249,7 @@ Optional<String> run_prescan_byte_stream_algorithm(const ByteBuffer& input) } // https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding -String run_encoding_sniffing_algorithm(const ByteBuffer& input) +String run_encoding_sniffing_algorithm(DOM::Document& document, const ByteBuffer& input) { if (input.size() >= 2) { if (input[0] == 0xFE && input[1] == 0xFF) { @@ -265,7 +267,7 @@ String run_encoding_sniffing_algorithm(const ByteBuffer& input) // at any later step in this algorithm. 
// FIXME: If the transport layer specifies a character encoding, and it is supported. - auto optional_encoding = run_prescan_byte_stream_algorithm(input); + auto optional_encoding = run_prescan_byte_stream_algorithm(document, input); if (optional_encoding.has_value()) { return optional_encoding.value(); } diff --git a/Userland/Libraries/LibWeb/HTML/Parser/HTMLEncodingDetection.h b/Userland/Libraries/LibWeb/HTML/Parser/HTMLEncodingDetection.h index 4d9a1e9ab2..52784e8d7c 100644 --- a/Userland/Libraries/LibWeb/HTML/Parser/HTMLEncodingDetection.h +++ b/Userland/Libraries/LibWeb/HTML/Parser/HTMLEncodingDetection.h @@ -8,7 +8,7 @@ #include <AK/Optional.h> #include <AK/String.h> -#include <LibWeb/DOM/Attribute.h> +#include <LibWeb/Forward.h> namespace Web::HTML { @@ -16,8 +16,8 @@ bool prescan_should_abort(const ByteBuffer& input, const size_t& position); bool prescan_is_whitespace_or_slash(const u8& byte); bool prescan_skip_whitespace_and_slashes(const ByteBuffer& input, size_t& position); Optional<String> extract_character_encoding_from_meta_element(String const&); -Optional<DOM::Attribute> prescan_get_attribute(const ByteBuffer& input, size_t& position); -Optional<String> run_prescan_byte_stream_algorithm(const ByteBuffer& input); -String run_encoding_sniffing_algorithm(const ByteBuffer& input); +RefPtr<DOM::Attribute> prescan_get_attribute(DOM::Document&, const ByteBuffer& input, size_t& position); +Optional<String> run_prescan_byte_stream_algorithm(DOM::Document&, const ByteBuffer& input); +String run_encoding_sniffing_algorithm(DOM::Document&, const ByteBuffer& input); } diff --git a/Userland/Libraries/LibWeb/HTML/Parser/HTMLParser.cpp b/Userland/Libraries/LibWeb/HTML/Parser/HTMLParser.cpp index d9fc29f64b..1a7ada5a10 100644 --- a/Userland/Libraries/LibWeb/HTML/Parser/HTMLParser.cpp +++ b/Userland/Libraries/LibWeb/HTML/Parser/HTMLParser.cpp @@ -2810,7 +2810,7 @@ void HTMLParser::handle_in_frameset(HTMLToken& token) } if (token.is_end_of_file()) { - //FIXME: If the 
current node is not the root html element, then this is a parse error. + // FIXME: If the current node is not the root html element, then this is a parse error. stop_parsing(); return; @@ -3162,7 +3162,7 @@ NonnullOwnPtr<HTMLParser> HTMLParser::create_with_uncertain_encoding(DOM::Docume { if (document.has_encoding()) return make<HTMLParser>(document, input, document.encoding().value()); - auto encoding = run_encoding_sniffing_algorithm(input); + auto encoding = run_encoding_sniffing_algorithm(document, input); dbgln("The encoding sniffing algorithm returned encoding '{}'", encoding); return make<HTMLParser>(document, input, encoding); } |