LibWeb: Plumb content encoding into the new HTML parser

We still don't handle non-ASCII input correctly, but at least now we'll convert e.g. ISO-8859-1 to UTF-8 before starting to tokenize. This patch also makes "view source" work with the new parser. :^)
author: Andreas Kling <kling@serenityos.org> 2020-05-28 12:35:19 +0200
committer: Andreas Kling <kling@serenityos.org> 2020-05-28 12:35:19 +0200
commit: 5e53c45113b5902009ff2b6b87e717b39ff609b2 (patch)
tree: 2bb4b36a500472aab66cfed2917593ab424c215b
parent: 772b51038e54a7a2890dfc7920c2004088ecf1c4 (diff)
download: serenity-5e53c45113b5902009ff2b6b87e717b39ff609b2.zip
6 files changed, 18 insertions, 9 deletions
diff --git a/Libraries/LibWeb/HtmlView.cpp b/Libraries/LibWeb/HtmlView.cpp
index e4e5317897..2fc4b8f048 100644
--- a/Libraries/LibWeb/HtmlView.cpp
+++ b/Libraries/LibWeb/HtmlView.cpp
@@ -444,7 +444,7 @@ RefPtr<Document> HtmlView::create_document_from_mime_type(const ByteBuffer& data
         return create_gemini_document(data, url);
     if (mime_type == "text/html") {
         if (m_use_new_parser) {
-            HTMLDocumentParser parser(data);
+            HTMLDocumentParser parser(data, encoding);
             parser.run(url);
             return parser.document();
         }
diff --git a/Libraries/LibWeb/Parser/HTMLDocumentParser.cpp b/Libraries/LibWeb/Parser/HTMLDocumentParser.cpp
index e735d876a5..03964f01ec 100644
--- a/Libraries/LibWeb/Parser/HTMLDocumentParser.cpp
+++ b/Libraries/LibWeb/Parser/HTMLDocumentParser.cpp
@@ -24,7 +24,7 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
-#define PARSER_DEBUG
+//#define PARSER_DEBUG
 
 #include <AK/Utf32View.h>
 #include <LibWeb/DOM/Comment.h>
@@ -51,8 +51,8 @@
 
 namespace Web {
 
-HTMLDocumentParser::HTMLDocumentParser(const StringView& input)
-    : m_tokenizer(input)
+HTMLDocumentParser::HTMLDocumentParser(const StringView& input, const String& encoding)
+    : m_tokenizer(input, encoding)
 {
 }
 
@@ -64,6 +64,7 @@ void HTMLDocumentParser::run(const URL& url)
 {
     m_document = adopt(*new Document);
     m_document->set_url(url);
+    m_document->set_source(m_tokenizer.source());
 
     for (;;) {
         auto optional_token = m_tokenizer.next_token();
diff --git a/Libraries/LibWeb/Parser/HTMLDocumentParser.h b/Libraries/LibWeb/Parser/HTMLDocumentParser.h
index 27efc95885..de4c817efb 100644
--- a/Libraries/LibWeb/Parser/HTMLDocumentParser.h
+++ b/Libraries/LibWeb/Parser/HTMLDocumentParser.h
@@ -61,7 +61,7 @@ namespace Web {
 
 class HTMLDocumentParser {
 public:
-    explicit HTMLDocumentParser(const StringView& input);
+    HTMLDocumentParser(const StringView& input, const String& encoding);
     ~HTMLDocumentParser();
 
     void run(const URL&);
diff --git a/Libraries/LibWeb/Parser/HTMLTokenizer.cpp b/Libraries/LibWeb/Parser/HTMLTokenizer.cpp
index 024d014a9b..d5b96b217e 100644
--- a/Libraries/LibWeb/Parser/HTMLTokenizer.cpp
+++ b/Libraries/LibWeb/Parser/HTMLTokenizer.cpp
@@ -24,6 +24,7 @@
  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
+#include <LibTextCodec/Decoder.h>
 #include <LibWeb/Parser/Entities.h>
 #include <LibWeb/Parser/HTMLToken.h>
 #include <LibWeb/Parser/HTMLTokenizer.h>
@@ -1711,9 +1712,12 @@ void HTMLTokenizer::create_new_token(HTMLToken::Type type)
     m_current_token.m_type = type;
 }
 
-HTMLTokenizer::HTMLTokenizer(const StringView& input)
-    : m_input(input)
+HTMLTokenizer::HTMLTokenizer(const StringView& input, const String& encoding)
 {
+    auto* decoder = TextCodec::decoder_for(encoding);
+    ASSERT(decoder);
+    m_decoded_input = decoder->to_utf8(input);
+    m_input = m_decoded_input;
 }
 
 void HTMLTokenizer::will_switch_to([[maybe_unused]] State new_state)
diff --git a/Libraries/LibWeb/Parser/HTMLTokenizer.h b/Libraries/LibWeb/Parser/HTMLTokenizer.h
index 42efde3400..e21bbfdcf2 100644
--- a/Libraries/LibWeb/Parser/HTMLTokenizer.h
+++ b/Libraries/LibWeb/Parser/HTMLTokenizer.h
@@ -118,7 +118,7 @@ namespace Web {
 
 class HTMLTokenizer {
 public:
-    explicit HTMLTokenizer(const StringView& input);
+    explicit HTMLTokenizer(const StringView& input, const String& encoding);
 
     enum class State {
 #define __ENUMERATE_TOKENIZER_STATE(state) state,
@@ -133,6 +133,8 @@ public:
     void set_blocked(bool b) { m_blocked = b; }
     bool is_blocked() const { return m_blocked; }
 
+    String source() const { return m_decoded_input; }
+
 private:
     Optional<u32> next_codepoint();
     Optional<u32> peek_codepoint(size_t offset) const;
@@ -163,6 +165,8 @@ private:
 
     Vector<u32> m_temporary_buffer;
 
+    String m_decoded_input;
+
     StringView m_input;
     size_t m_cursor { 0 };
 
diff --git a/Userland/ht.cpp b/Userland/ht.cpp
index b3354c2a7f..df52f6e59c 100644
--- a/Userland/ht.cpp
+++ b/Userland/ht.cpp
@@ -47,7 +47,7 @@ int main(int argc, char** argv)
         return 1;
     auto contents = file_or_error.value()->read_all();
 
-    Web::HTMLDocumentParser parser(contents);
+    Web::HTMLDocumentParser parser(contents, "utf-8");
     parser.run(URL::create_with_file_protocol(input_path));
 
     auto& document = parser.document();
author	Andreas Kling <kling@serenityos.org>	2020-05-28 12:35:19 +0200
committer	Andreas Kling <kling@serenityos.org>	2020-05-28 12:35:19 +0200
commit	5e53c45113b5902009ff2b6b87e717b39ff609b2 (patch)
tree	2bb4b36a500472aab66cfed2917593ab424c215b
parent	772b51038e54a7a2890dfc7920c2004088ecf1c4 (diff)
download	serenity-5e53c45113b5902009ff2b6b87e717b39ff609b2.zip