diff options
author | Andreas Kling <kling@serenityos.org> | 2020-05-28 12:35:19 +0200 |
---|---|---|
committer | Andreas Kling <kling@serenityos.org> | 2020-05-28 12:35:19 +0200 |
commit | 5e53c45113b5902009ff2b6b87e717b39ff609b2 (patch) | |
tree | 2bb4b36a500472aab66cfed2917593ab424c215b | |
parent | 772b51038e54a7a2890dfc7920c2004088ecf1c4 (diff) | |
download | serenity-5e53c45113b5902009ff2b6b87e717b39ff609b2.zip |
LibWeb: Plumb content encoding into the new HTML parser
We still don't handle non-ASCII input correctly, but at least now we'll
convert e.g. ISO-8859-1 to UTF-8 before starting to tokenize.
This patch also makes "view source" work with the new parser. :^)
-rw-r--r-- | Libraries/LibWeb/HtmlView.cpp | 2 | ||||
-rw-r--r-- | Libraries/LibWeb/Parser/HTMLDocumentParser.cpp | 7 | ||||
-rw-r--r-- | Libraries/LibWeb/Parser/HTMLDocumentParser.h | 2 | ||||
-rw-r--r-- | Libraries/LibWeb/Parser/HTMLTokenizer.cpp | 8 | ||||
-rw-r--r-- | Libraries/LibWeb/Parser/HTMLTokenizer.h | 6 | ||||
-rw-r--r-- | Userland/ht.cpp | 2 |
6 files changed, 18 insertions, 9 deletions
diff --git a/Libraries/LibWeb/HtmlView.cpp b/Libraries/LibWeb/HtmlView.cpp index e4e5317897..2fc4b8f048 100644 --- a/Libraries/LibWeb/HtmlView.cpp +++ b/Libraries/LibWeb/HtmlView.cpp @@ -444,7 +444,7 @@ RefPtr<Document> HtmlView::create_document_from_mime_type(const ByteBuffer& data return create_gemini_document(data, url); if (mime_type == "text/html") { if (m_use_new_parser) { - HTMLDocumentParser parser(data); + HTMLDocumentParser parser(data, encoding); parser.run(url); return parser.document(); } diff --git a/Libraries/LibWeb/Parser/HTMLDocumentParser.cpp b/Libraries/LibWeb/Parser/HTMLDocumentParser.cpp index e735d876a5..03964f01ec 100644 --- a/Libraries/LibWeb/Parser/HTMLDocumentParser.cpp +++ b/Libraries/LibWeb/Parser/HTMLDocumentParser.cpp @@ -24,7 +24,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#define PARSER_DEBUG +//#define PARSER_DEBUG #include <AK/Utf32View.h> #include <LibWeb/DOM/Comment.h> @@ -51,8 +51,8 @@ namespace Web { -HTMLDocumentParser::HTMLDocumentParser(const StringView& input) - : m_tokenizer(input) +HTMLDocumentParser::HTMLDocumentParser(const StringView& input, const String& encoding) + : m_tokenizer(input, encoding) { } @@ -64,6 +64,7 @@ void HTMLDocumentParser::run(const URL& url) { m_document = adopt(*new Document); m_document->set_url(url); + m_document->set_source(m_tokenizer.source()); for (;;) { auto optional_token = m_tokenizer.next_token(); diff --git a/Libraries/LibWeb/Parser/HTMLDocumentParser.h b/Libraries/LibWeb/Parser/HTMLDocumentParser.h index 27efc95885..de4c817efb 100644 --- a/Libraries/LibWeb/Parser/HTMLDocumentParser.h +++ b/Libraries/LibWeb/Parser/HTMLDocumentParser.h @@ -61,7 +61,7 @@ namespace Web { class HTMLDocumentParser { public: - explicit HTMLDocumentParser(const StringView& input); + HTMLDocumentParser(const StringView& input, const String& encoding); ~HTMLDocumentParser(); void run(const URL&); diff --git a/Libraries/LibWeb/Parser/HTMLTokenizer.cpp 
b/Libraries/LibWeb/Parser/HTMLTokenizer.cpp index 024d014a9b..d5b96b217e 100644 --- a/Libraries/LibWeb/Parser/HTMLTokenizer.cpp +++ b/Libraries/LibWeb/Parser/HTMLTokenizer.cpp @@ -24,6 +24,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +#include <LibTextCodec/Decoder.h> #include <LibWeb/Parser/Entities.h> #include <LibWeb/Parser/HTMLToken.h> #include <LibWeb/Parser/HTMLTokenizer.h> @@ -1711,9 +1712,12 @@ void HTMLTokenizer::create_new_token(HTMLToken::Type type) m_current_token.m_type = type; } -HTMLTokenizer::HTMLTokenizer(const StringView& input) - : m_input(input) +HTMLTokenizer::HTMLTokenizer(const StringView& input, const String& encoding) { + auto* decoder = TextCodec::decoder_for(encoding); + ASSERT(decoder); + m_decoded_input = decoder->to_utf8(input); + m_input = m_decoded_input; } void HTMLTokenizer::will_switch_to([[maybe_unused]] State new_state) diff --git a/Libraries/LibWeb/Parser/HTMLTokenizer.h b/Libraries/LibWeb/Parser/HTMLTokenizer.h index 42efde3400..e21bbfdcf2 100644 --- a/Libraries/LibWeb/Parser/HTMLTokenizer.h +++ b/Libraries/LibWeb/Parser/HTMLTokenizer.h @@ -118,7 +118,7 @@ namespace Web { class HTMLTokenizer { public: - explicit HTMLTokenizer(const StringView& input); + explicit HTMLTokenizer(const StringView& input, const String& encoding); enum class State { #define __ENUMERATE_TOKENIZER_STATE(state) state, @@ -133,6 +133,8 @@ public: void set_blocked(bool b) { m_blocked = b; } bool is_blocked() const { return m_blocked; } + String source() const { return m_decoded_input; } + private: Optional<u32> next_codepoint(); Optional<u32> peek_codepoint(size_t offset) const; @@ -163,6 +165,8 @@ private: Vector<u32> m_temporary_buffer; + String m_decoded_input; + StringView m_input; size_t m_cursor { 0 }; diff --git a/Userland/ht.cpp b/Userland/ht.cpp index b3354c2a7f..df52f6e59c 100644 --- a/Userland/ht.cpp +++ b/Userland/ht.cpp @@ -47,7 +47,7 @@ int main(int argc, char** argv) return 1; auto contents = 
file_or_error.value()->read_all(); - Web::HTMLDocumentParser parser(contents); + Web::HTMLDocumentParser parser(contents, "utf-8"); parser.run(URL::create_with_file_protocol(input_path)); auto& document = parser.document(); |