diff options
author | Andreas Kling <kling@serenityos.org> | 2020-05-23 18:43:09 +0200 |
---|---|---|
committer | Andreas Kling <kling@serenityos.org> | 2020-05-23 18:44:32 +0200 |
commit | 7be36366beabfbb9fd480136e5ef44f055bb1bbe (patch) | |
tree | 247cd28e7ad33ac9da8a21aeca593f56f8e14a2d /Libraries | |
parent | ede44853d14c222cf7e6bd808051a1a1711b7f6e (diff) | |
download | serenity-7be36366beabfbb9fd480136e5ef44f055bb1bbe.zip |
LibWeb: Emit character/comment tokens lazily to accumulate more data
Instead of emitting data-bearing tokens immediately, do it lazily at
the next state change. This allows us to accumulate full bursts of
text in between tags instead of having one token per character. :^)
Diffstat (limited to 'Libraries')
-rw-r--r-- | Libraries/LibWeb/Parser/HTMLTokenizer.cpp | 33 | ||||
-rw-r--r-- | Libraries/LibWeb/Parser/HTMLTokenizer.h | 1 |
2 files changed, 31 insertions, 3 deletions
diff --git a/Libraries/LibWeb/Parser/HTMLTokenizer.cpp b/Libraries/LibWeb/Parser/HTMLTokenizer.cpp
index eace42d309..39e3f8b594 100644
--- a/Libraries/LibWeb/Parser/HTMLTokenizer.cpp
+++ b/Libraries/LibWeb/Parser/HTMLTokenizer.cpp
@@ -30,7 +30,11 @@
 
 //#define TOKENIZER_TRACE
 
-#define TODO ASSERT_NOT_REACHED
+#define TODO() \
+    do { \
+        dbg() << "[TODO: " << state_name(m_state) << "] '" << (char)current_input_character.value() << "'"; \
+        ASSERT_NOT_REACHED(); \
+    } while (0)
 
 #define SWITCH_TO(new_state) \
     will_switch_to(State::new_state); \
@@ -115,9 +119,9 @@ void HTMLTokenizer::run()
                 }
                 ANYTHING_ELSE
                 {
-                    create_new_token(HTMLToken::Type::Character);
+                    if (m_current_token.type() != HTMLToken::Type::Character)
+                        create_new_token(HTMLToken::Type::Character);
                     m_current_token.m_comment_or_character.data.append(current_input_character.value());
-                    emit_current_token();
                     continue;
                 }
             }
@@ -138,6 +142,14 @@ void HTMLTokenizer::run()
                     create_new_token(HTMLToken::Type::StartTag);
                     RECONSUME_IN(TagName);
                 }
+                ON('?')
+                {
+                    TODO();
+                }
+                ANYTHING_ELSE
+                {
+                    TODO();
+                }
             }
             END_STATE
 
@@ -699,12 +711,19 @@ void HTMLTokenizer::emit_current_token()
         builder.append("} }");
     }
 
+    if (m_current_token.type() == HTMLToken::Type::Comment || m_current_token.type() == HTMLToken::Type::Character) {
+        builder.append(" { data: '");
+        builder.append(m_current_token.m_comment_or_character.data.to_string());
+        builder.append(" }");
+    }
+
     dbg() << "[" << String::format("%42s", state_name(m_state)) << "] " << builder.to_string();
     m_current_token = {};
 }
 
 void HTMLTokenizer::create_new_token(HTMLToken::Type type)
 {
+    flush_current_character_or_comment_if_needed();
     m_current_token = {};
     m_current_token.m_type = type;
 }
@@ -716,6 +735,7 @@ HTMLTokenizer::HTMLTokenizer(const StringView& input)
 
 void HTMLTokenizer::will_switch_to([[maybe_unused]] State new_state)
 {
+    flush_current_character_or_comment_if_needed();
 #ifdef TOKENIZER_TRACE
     dbg() << "[" << state_name(m_state) << "] Switch to " << state_name(new_state);
 #endif
@@ -723,9 +743,16 @@ void HTMLTokenizer::will_switch_to([[maybe_unused]] State new_state)
 
 void HTMLTokenizer::will_reconsume_in([[maybe_unused]] State new_state)
 {
+    flush_current_character_or_comment_if_needed();
 #ifdef TOKENIZER_TRACE
     dbg() << "[" << state_name(m_state) << "] Reconsume in " << state_name(new_state);
 #endif
 }
 
+void HTMLTokenizer::flush_current_character_or_comment_if_needed()
+{
+    if (m_current_token.type() == HTMLToken::Type::Character || m_current_token.type() == HTMLToken::Type::Comment)
+        emit_current_token();
+}
+
 }
diff --git a/Libraries/LibWeb/Parser/HTMLTokenizer.h b/Libraries/LibWeb/Parser/HTMLTokenizer.h
index ec5adecb92..5573cdd46c 100644
--- a/Libraries/LibWeb/Parser/HTMLTokenizer.h
+++ b/Libraries/LibWeb/Parser/HTMLTokenizer.h
@@ -148,6 +148,7 @@ private:
 
     void will_switch_to(State);
     void will_reconsume_in(State);
+    void flush_current_character_or_comment_if_needed();
 
     State m_state { State::Data };
     State m_return_state { State::Data };