diff options
author | Andreas Kling <kling@serenityos.org> | 2020-06-04 22:02:54 +0200 |
---|---|---|
committer | Andreas Kling <kling@serenityos.org> | 2020-06-04 22:09:36 +0200 |
commit | b59f4632d5b77d2ef4502fb6bbd4d2c4a2fa03f5 (patch) | |
tree | 58df878d8f82775777adf53115d9104eff013495 /Libraries | |
parent | b6288163f1384d723cfbb92576881c079c274562 (diff) | |
download | serenity-b59f4632d5b77d2ef4502fb6bbd4d2c4a2fa03f5.zip |
LibWeb: Unbreak character reference and DOCTYPE parsing post-UTF-8
Oops, these were still using the byte-offset cursor. My goodness is it
unergonomic to index into UTF-8 strings, but Dr. Bugaev says it's good.
There is a lot of room for improvement here, just like in the rest of the
tokenizer and parser. We'll have to do a few optimization passes over
them once they mature.
Diffstat (limited to 'Libraries')
-rw-r--r-- | Libraries/LibWeb/Parser/HTMLTokenizer.cpp | 35 | ||||
-rw-r--r-- | Libraries/LibWeb/Parser/HTMLTokenizer.h | 1 |
2 files changed, 24 insertions, 12 deletions
diff --git a/Libraries/LibWeb/Parser/HTMLTokenizer.cpp b/Libraries/LibWeb/Parser/HTMLTokenizer.cpp index f1934e10d3..cdf2e61a70 100644 --- a/Libraries/LibWeb/Parser/HTMLTokenizer.cpp +++ b/Libraries/LibWeb/Parser/HTMLTokenizer.cpp @@ -157,12 +157,12 @@ return m_queued_tokens.dequeue(); \ } while (0) -#define EMIT_CHARACTER(codepoint) \ - do { \ - create_new_token(HTMLToken::Type::Character); \ +#define EMIT_CHARACTER(codepoint) \ + do { \ + create_new_token(HTMLToken::Type::Character); \ m_current_token.m_comment_or_character.data.append_codepoint(codepoint); \ - m_queued_tokens.enqueue(m_current_token); \ - return m_queued_tokens.dequeue(); \ + m_queued_tokens.enqueue(m_current_token); \ + return m_queued_tokens.dequeue(); \ } while (0) #define EMIT_CURRENT_CHARACTER \ @@ -209,14 +209,20 @@ Optional<u32> HTMLTokenizer::next_codepoint() return {}; m_prev_utf8_iterator = m_utf8_iterator; ++m_utf8_iterator; +#ifdef TOKENIZER_TRACE + dbg() << "(Tokenizer) Next codepoint: " << (char)*m_prev_utf8_iterator; +#endif return *m_prev_utf8_iterator; } Optional<u32> HTMLTokenizer::peek_codepoint(size_t offset) const { - if ((m_cursor + offset) >= m_input.length()) + auto it = m_utf8_iterator; + for (size_t i = 0; i < offset && it != m_utf8_view.end(); ++i) + ++it; + if (it == m_utf8_view.end()) return {}; - return m_input[m_cursor + offset]; + return *it; } Optional<HTMLToken> HTMLTokenizer::next_token() @@ -1281,10 +1287,15 @@ _StartOfFunction: BEGIN_STATE(NamedCharacterReference) { - auto match = HTML::codepoints_from_entity(m_input.substring_view(m_cursor - 1, m_input.length() - m_cursor + 1)); + size_t byte_offset = m_utf8_view.byte_offset_of(m_prev_utf8_iterator); + + auto match = HTML::codepoints_from_entity(m_decoded_input.substring_view(byte_offset, m_decoded_input.length() - byte_offset - 1)); if (match.has_value()) { - m_cursor += match.value().entity.length(); + for (size_t i = 0; i < match.value().entity.length(); ++i) { + m_prev_utf8_iterator = m_utf8_iterator; + 
++m_utf8_iterator; + } for (auto ch : match.value().entity) m_temporary_buffer.append(ch); @@ -2078,7 +2089,10 @@ bool HTMLTokenizer::consume_next_if_match(const StringView& string, CaseSensitiv if (codepoint.value() != (u32)string[i]) return false; } - m_cursor += string.length(); + for (size_t i = 0; i < string.length(); ++i) { + m_prev_utf8_iterator = m_utf8_iterator; + ++m_utf8_iterator; + } return true; } @@ -2093,7 +2107,6 @@ HTMLTokenizer::HTMLTokenizer(const StringView& input, const String& encoding) auto* decoder = TextCodec::decoder_for(encoding); ASSERT(decoder); m_decoded_input = decoder->to_utf8(input); - m_input = m_decoded_input; m_utf8_view = Utf8View(m_decoded_input); m_utf8_iterator = m_utf8_view.begin(); } diff --git a/Libraries/LibWeb/Parser/HTMLTokenizer.h b/Libraries/LibWeb/Parser/HTMLTokenizer.h index a5550e0fcd..af714dbde6 100644 --- a/Libraries/LibWeb/Parser/HTMLTokenizer.h +++ b/Libraries/LibWeb/Parser/HTMLTokenizer.h @@ -169,7 +169,6 @@ private: String m_decoded_input; StringView m_input; - size_t m_cursor { 0 }; Utf8View m_utf8_view; AK::Utf8CodepointIterator m_utf8_iterator; |