LibWeb: Add extracting character encoding from a meta content attribute

Some Gmail emails contain this.
author: Luke <luke.wilde@live.co.uk> 2021-07-13 18:37:03 +0100
committer: Andreas Kling <kling@serenityos.org> 2021-07-13 20:23:44 +0200
commit: e9eae9d8801c603cbd3cbd8accb6d436dd004746 (patch)
tree: 4ebf49b284857e7018071488e2dcf0f63afbbef3 /Userland
parent: b919789db24cbeb323a694989ca784c43fff9acc (diff)
download: serenity-e9eae9d8801c603cbd3cbd8accb6d436dd004746.zip
2 files changed, 75 insertions, 9 deletions
diff --git a/Userland/Libraries/LibWeb/HTML/Parser/HTMLEncodingDetection.cpp b/Userland/Libraries/LibWeb/HTML/Parser/HTMLEncodingDetection.cpp
index 1afe85ba05..70a1fbab03 100644
--- a/Userland/Libraries/LibWeb/HTML/Parser/HTMLEncodingDetection.cpp
+++ b/Userland/Libraries/LibWeb/HTML/Parser/HTMLEncodingDetection.cpp
@@ -4,6 +4,8 @@
  * SPDX-License-Identifier: BSD-2-Clause
  */
 
+#include <AK/CharacterTypes.h>
+#include <AK/GenericLexer.h>
 #include <AK/StringView.h>
 #include <AK/Utf8View.h>
 #include <LibTextCodec/Decoder.h>
@@ -29,6 +31,69 @@ bool prescan_skip_whitespace_and_slashes(const ByteBuffer& input, size_t& positi
     return !prescan_should_abort(input, position);
 }
 
+// https://html.spec.whatwg.org/multipage/urls-and-fetching.html#algorithm-for-extracting-a-character-encoding-from-a-meta-element
+Optional<String> extract_character_encoding_from_meta_element(String const& string)
+{
+    // Checking for "charset" is case insensitive, as is getting an encoding.
+    // Therefore, stick to lowercase from the start for simplicity.
+    auto lowercase_string = string.to_lowercase();
+    GenericLexer lexer(lowercase_string);
+
+    for (;;) {
+        auto charset_index = lexer.remaining().find("charset");
+        if (!charset_index.has_value())
+            return {};
+
+        // 7 is the length of "charset".
+        lexer.ignore(charset_index.value() + 7);
+
+        lexer.ignore_while([](char c) {
+            // FIXME: Not the exact same ASCII whitespace. The spec does not include vertical tab (\v).
+            return is_ascii_space(c);
+        });
+
+        if (lexer.peek() != '=')
+            continue;
+
+        break;
+    }
+
+    // Ignore the '='.
+    lexer.ignore();
+
+    lexer.ignore_while([](char c) {
+        // FIXME: Not the exact same ASCII whitespace. The spec does not include vertical tab (\v).
+        return is_ascii_space(c);
+    });
+
+    if (lexer.is_eof())
+        return {};
+
+    if (lexer.consume_specific('"')) {
+        auto matching_double_quote = lexer.remaining().find("\"");
+        if (!matching_double_quote.has_value())
+            return {};
+
+        auto encoding = lexer.remaining().substring_view(0, matching_double_quote.value());
+        return TextCodec::get_standardized_encoding(encoding);
+    }
+
+    if (lexer.consume_specific('\'')) {
+        auto matching_single_quote = lexer.remaining().find("'");
+        if (!matching_single_quote.has_value())
+            return {};
+
+        auto encoding = lexer.remaining().substring_view(0, matching_single_quote.value());
+        return TextCodec::get_standardized_encoding(encoding);
+    }
+
+    auto encoding = lexer.consume_until([](char c) {
+        // FIXME: Not the exact same ASCII whitespace. The spec does not include vertical tab (\v).
+        return is_ascii_space(c) || c == ';';
+    });
+    return TextCodec::get_standardized_encoding(encoding);
+}
+
 Optional<Attribute> prescan_get_attribute(const ByteBuffer& input, size_t& position)
 {
     if (!prescan_skip_whitespace_and_slashes(input, position))
@@ -137,21 +202,21 @@ Optional<String> run_prescan_byte_stream_algorithm(const ByteBuffer& input)
                 auto& attribute_name = attribute.value().name();
                 attribute_list.append(attribute.value().name());
 
-                if (attribute_name == "http-equiv" && attribute.value().value() == "content-type")
-                    got_pragma = true;
-                else if (attribute_name == "charset") {
+                if (attribute_name == "http-equiv") {
+                    got_pragma = attribute.value().value() == "content-type";
+                } else if (attribute_name == "content") {
+                    auto encoding = extract_character_encoding_from_meta_element(attribute.value().value());
+                    if (encoding.has_value() && !charset.has_value()) {
+                        charset = encoding.value();
+                        need_pragma = true;
+                    }
+                } else if (attribute_name == "charset") {
                     auto maybe_charset = TextCodec::get_standardized_encoding(attribute.value().value());
                     if (maybe_charset.has_value()) {
                         charset = Optional<String> { maybe_charset };
                         need_pragma = { false };
                     }
                 }
-
-                // FIXME: For attribute name "content", do this:
-                //        Apply the "algorithm for extracting a character encoding from a meta
-                //        element", giving the attribute's value as the string to parse. If a
-                //        character encoding is returned, and if charset is still set to null,
-                //        let charset be the encoding returned, and set need pragma to true.
             }
 
             if (!need_pragma.has_value() || (need_pragma.value() && !got_pragma) || !charset.has_value())
diff --git a/Userland/Libraries/LibWeb/HTML/Parser/HTMLEncodingDetection.h b/Userland/Libraries/LibWeb/HTML/Parser/HTMLEncodingDetection.h
index b81d1f365c..eff8190d95 100644
--- a/Userland/Libraries/LibWeb/HTML/Parser/HTMLEncodingDetection.h
+++ b/Userland/Libraries/LibWeb/HTML/Parser/HTMLEncodingDetection.h
@@ -15,6 +15,7 @@ namespace Web::HTML {
 bool prescan_should_abort(const ByteBuffer& input, const size_t& position);
 bool prescan_is_whitespace_or_slash(const u8& byte);
 bool prescan_skip_whitespace_and_slashes(const ByteBuffer& input, size_t& position);
+Optional<String> extract_character_encoding_from_meta_element(String const&);
 Optional<Attribute> prescan_get_attribute(const ByteBuffer& input, size_t& position);
 Optional<String> run_prescan_byte_stream_algorithm(const ByteBuffer& input);
 String run_encoding_sniffing_algorithm(const ByteBuffer& input);
author	Luke <luke.wilde@live.co.uk>	2021-07-13 18:37:03 +0100
committer	Andreas Kling <kling@serenityos.org>	2021-07-13 20:23:44 +0200
commit	e9eae9d8801c603cbd3cbd8accb6d436dd004746 (patch)
tree	4ebf49b284857e7018071488e2dcf0f63afbbef3 /Userland
parent	b919789db24cbeb323a694989ca784c43fff9acc (diff)
download	serenity-e9eae9d8801c603cbd3cbd8accb6d436dd004746.zip