summary | refs | log | tree | commit | diff
path: root/Userland/Libraries/LibWeb/HTML/Parser
diff options
context:
space:
mode:
author: Timothy Flynn <trflynn89@pm.me> 2021-10-15 09:57:07 -0400
committer: Linus Groh <mail@linusgroh.de> 2021-10-17 13:51:10 +0100
commit: e01dfaac9ad06016aa935b70e998987b65700eec (patch)
tree: 343f2a6b01d713c3196227bcb7d485aadfa07fa4 /Userland/Libraries/LibWeb/HTML/Parser
parent: 8d27292fac28af04587be5e5330db525ef89816b (diff)
download: serenity-e01dfaac9ad06016aa935b70e998987b65700eec.zip
LibWeb: Implement Attribute closer to the spec and with an IDL file
Note our Attribute class is what the spec refers to as just "Attr". The main differences between the existing implementation and the spec are just that the spec defines more fields.

Attributes can contain namespace URIs and prefixes. However, note that these are not parsed in HTML documents unless the document content-type is XML. So for now, these are initialized to null. Web pages are able to set the namespace via JavaScript (setAttributeNS), so these fields may be filled in when the corresponding APIs are implemented.

The main change to be aware of is that an attribute is a node. This has implications for how attributes are stored in the Element class. Nodes are non-copyable and non-movable because these constructors are deleted by the EventTarget base class. This means attributes cannot be stored in a Vector or HashMap, as these containers assume copyability / movability. So for now, the Vector holding attributes is changed to hold RefPtrs to attributes instead. This might change when attribute storage is implemented according to the spec (by way of NamedNodeMap).
Diffstat (limited to 'Userland/Libraries/LibWeb/HTML/Parser')
-rw-r--r-- Userland/Libraries/LibWeb/HTML/Parser/HTMLEncodingDetection.cpp 38
-rw-r--r-- Userland/Libraries/LibWeb/HTML/Parser/HTMLEncodingDetection.h 8
-rw-r--r-- Userland/Libraries/LibWeb/HTML/Parser/HTMLParser.cpp 4
3 files changed, 26 insertions, 24 deletions
diff --git a/Userland/Libraries/LibWeb/HTML/Parser/HTMLEncodingDetection.cpp b/Userland/Libraries/LibWeb/HTML/Parser/HTMLEncodingDetection.cpp
index 460e0a3872..f0c5c524b1 100644
--- a/Userland/Libraries/LibWeb/HTML/Parser/HTMLEncodingDetection.cpp
+++ b/Userland/Libraries/LibWeb/HTML/Parser/HTMLEncodingDetection.cpp
@@ -9,6 +9,8 @@
#include <AK/StringView.h>
#include <AK/Utf8View.h>
#include <LibTextCodec/Decoder.h>
+#include <LibWeb/DOM/Attribute.h>
+#include <LibWeb/DOM/Document.h>
#include <LibWeb/HTML/Parser/HTMLEncodingDetection.h>
#include <ctype.h>
@@ -94,7 +96,7 @@ Optional<String> extract_character_encoding_from_meta_element(String const& stri
return TextCodec::get_standardized_encoding(encoding);
}
-Optional<DOM::Attribute> prescan_get_attribute(const ByteBuffer& input, size_t& position)
+RefPtr<DOM::Attribute> prescan_get_attribute(DOM::Document& document, const ByteBuffer& input, size_t& position)
{
if (!prescan_skip_whitespace_and_slashes(input, position))
return {};
@@ -109,7 +111,7 @@ Optional<DOM::Attribute> prescan_get_attribute(const ByteBuffer& input, size_t&
} else if (input[position] == '\t' || input[position] == '\n' || input[position] == '\f' || input[position] == '\r' || input[position] == ' ')
goto spaces;
else if (input[position] == '/' || input[position] == '>')
- return DOM::Attribute(attribute_name.to_string(), "");
+ return DOM::Attribute::create(document, attribute_name.to_string(), "");
else
attribute_name.append_as_lowercase(input[position]);
++position;
@@ -121,7 +123,7 @@ spaces:
if (!prescan_skip_whitespace_and_slashes(input, position))
return {};
if (input[position] != '=')
- return DOM::Attribute(attribute_name.to_string(), "");
+ return DOM::Attribute::create(document, attribute_name.to_string(), "");
++position;
value:
@@ -134,13 +136,13 @@ value:
++position;
for (; !prescan_should_abort(input, position); ++position) {
if (input[position] == quote_character)
- return DOM::Attribute(attribute_name.to_string(), attribute_value.to_string());
+ return DOM::Attribute::create(document, attribute_name.to_string(), attribute_value.to_string());
else
attribute_value.append_as_lowercase(input[position]);
}
return {};
} else if (input[position] == '>')
- return DOM::Attribute(attribute_name.to_string(), "");
+ return DOM::Attribute::create(document, attribute_name.to_string(), "");
else
attribute_value.append_as_lowercase(input[position]);
@@ -150,7 +152,7 @@ value:
for (; !prescan_should_abort(input, position); ++position) {
if (input[position] == '\t' || input[position] == '\n' || input[position] == '\f' || input[position] == '\r' || input[position] == ' ' || input[position] == '>')
- return DOM::Attribute(attribute_name.to_string(), attribute_value.to_string());
+ return DOM::Attribute::create(document, attribute_name.to_string(), attribute_value.to_string());
else
attribute_value.append_as_lowercase(input[position]);
}
@@ -158,7 +160,7 @@ value:
}
// https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding
-Optional<String> run_prescan_byte_stream_algorithm(const ByteBuffer& input)
+Optional<String> run_prescan_byte_stream_algorithm(DOM::Document& document, const ByteBuffer& input)
{
// https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding
@@ -194,24 +196,24 @@ Optional<String> run_prescan_byte_stream_algorithm(const ByteBuffer& input)
Optional<String> charset {};
while (true) {
- auto attribute = prescan_get_attribute(input, position);
- if (!attribute.has_value())
+ auto attribute = prescan_get_attribute(document, input, position);
+ if (!attribute)
break;
- if (attribute_list.contains_slow(attribute.value().name()))
+ if (attribute_list.contains_slow(attribute->name()))
continue;
- auto& attribute_name = attribute.value().name();
- attribute_list.append(attribute.value().name());
+ auto& attribute_name = attribute->name();
+ attribute_list.append(attribute->name());
if (attribute_name == "http-equiv") {
- got_pragma = attribute.value().value() == "content-type";
+ got_pragma = attribute->value() == "content-type";
} else if (attribute_name == "content") {
- auto encoding = extract_character_encoding_from_meta_element(attribute.value().value());
+ auto encoding = extract_character_encoding_from_meta_element(attribute->value());
if (encoding.has_value() && !charset.has_value()) {
charset = encoding.value();
need_pragma = true;
}
} else if (attribute_name == "charset") {
- auto maybe_charset = TextCodec::get_standardized_encoding(attribute.value().value());
+ auto maybe_charset = TextCodec::get_standardized_encoding(attribute->value());
if (maybe_charset.has_value()) {
charset = Optional<String> { maybe_charset };
need_pragma = { false };
@@ -231,7 +233,7 @@ Optional<String> run_prescan_byte_stream_algorithm(const ByteBuffer& input)
&& ((input[position + 1] == '/' && isalpha(input[position + 2])) || isalpha(input[position + 1]))) {
position += 2;
prescan_skip_whitespace_and_slashes(input, position);
- while (prescan_get_attribute(input, position).has_value()) { };
+ while (prescan_get_attribute(document, input, position)) { };
} else if (!prescan_should_abort(input, position + 1) && input[position] == '<' && (input[position + 1] == '!' || input[position + 1] == '/' || input[position + 1] == '?')) {
position += 2;
while (input[position] != '>') {
@@ -247,7 +249,7 @@ Optional<String> run_prescan_byte_stream_algorithm(const ByteBuffer& input)
}
// https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
-String run_encoding_sniffing_algorithm(const ByteBuffer& input)
+String run_encoding_sniffing_algorithm(DOM::Document& document, const ByteBuffer& input)
{
if (input.size() >= 2) {
if (input[0] == 0xFE && input[1] == 0xFF) {
@@ -265,7 +267,7 @@ String run_encoding_sniffing_algorithm(const ByteBuffer& input)
// at any later step in this algorithm.
// FIXME: If the transport layer specifies a character encoding, and it is supported.
- auto optional_encoding = run_prescan_byte_stream_algorithm(input);
+ auto optional_encoding = run_prescan_byte_stream_algorithm(document, input);
if (optional_encoding.has_value()) {
return optional_encoding.value();
}
diff --git a/Userland/Libraries/LibWeb/HTML/Parser/HTMLEncodingDetection.h b/Userland/Libraries/LibWeb/HTML/Parser/HTMLEncodingDetection.h
index 4d9a1e9ab2..52784e8d7c 100644
--- a/Userland/Libraries/LibWeb/HTML/Parser/HTMLEncodingDetection.h
+++ b/Userland/Libraries/LibWeb/HTML/Parser/HTMLEncodingDetection.h
@@ -8,7 +8,7 @@
#include <AK/Optional.h>
#include <AK/String.h>
-#include <LibWeb/DOM/Attribute.h>
+#include <LibWeb/Forward.h>
namespace Web::HTML {
@@ -16,8 +16,8 @@ bool prescan_should_abort(const ByteBuffer& input, const size_t& position);
bool prescan_is_whitespace_or_slash(const u8& byte);
bool prescan_skip_whitespace_and_slashes(const ByteBuffer& input, size_t& position);
Optional<String> extract_character_encoding_from_meta_element(String const&);
-Optional<DOM::Attribute> prescan_get_attribute(const ByteBuffer& input, size_t& position);
-Optional<String> run_prescan_byte_stream_algorithm(const ByteBuffer& input);
-String run_encoding_sniffing_algorithm(const ByteBuffer& input);
+RefPtr<DOM::Attribute> prescan_get_attribute(DOM::Document&, const ByteBuffer& input, size_t& position);
+Optional<String> run_prescan_byte_stream_algorithm(DOM::Document&, const ByteBuffer& input);
+String run_encoding_sniffing_algorithm(DOM::Document&, const ByteBuffer& input);
}
diff --git a/Userland/Libraries/LibWeb/HTML/Parser/HTMLParser.cpp b/Userland/Libraries/LibWeb/HTML/Parser/HTMLParser.cpp
index d9fc29f64b..1a7ada5a10 100644
--- a/Userland/Libraries/LibWeb/HTML/Parser/HTMLParser.cpp
+++ b/Userland/Libraries/LibWeb/HTML/Parser/HTMLParser.cpp
@@ -2810,7 +2810,7 @@ void HTMLParser::handle_in_frameset(HTMLToken& token)
}
if (token.is_end_of_file()) {
- //FIXME: If the current node is not the root html element, then this is a parse error.
+ // FIXME: If the current node is not the root html element, then this is a parse error.
stop_parsing();
return;
@@ -3162,7 +3162,7 @@ NonnullOwnPtr<HTMLParser> HTMLParser::create_with_uncertain_encoding(DOM::Docume
{
if (document.has_encoding())
return make<HTMLParser>(document, input, document.encoding().value());
- auto encoding = run_encoding_sniffing_algorithm(input);
+ auto encoding = run_encoding_sniffing_algorithm(document, input);
dbgln("The encoding sniffing algorithm returned encoding '{}'", encoding);
return make<HTMLParser>(document, input, encoding);
}