summaryrefslogtreecommitdiff
path: root/Libraries
diff options
context:
space:
mode:
authorAndreas Kling <kling@serenityos.org>2020-05-23 18:43:09 +0200
committerAndreas Kling <kling@serenityos.org>2020-05-23 18:44:32 +0200
commit7be36366beabfbb9fd480136e5ef44f055bb1bbe (patch)
tree247cd28e7ad33ac9da8a21aeca593f56f8e14a2d /Libraries
parentede44853d14c222cf7e6bd808051a1a1711b7f6e (diff)
downloadserenity-7be36366beabfbb9fd480136e5ef44f055bb1bbe.zip
LibWeb: Emit character/comment tokens lazily to accumulate more data
Instead of emitting data-bearing tokens immediately, do it lazily at the next state change. This allows us to accumulate full bursts of text in between tags instead of having one token per character. :^)
Diffstat (limited to 'Libraries')
-rw-r--r--Libraries/LibWeb/Parser/HTMLTokenizer.cpp33
-rw-r--r--Libraries/LibWeb/Parser/HTMLTokenizer.h1
2 files changed, 31 insertions, 3 deletions
diff --git a/Libraries/LibWeb/Parser/HTMLTokenizer.cpp b/Libraries/LibWeb/Parser/HTMLTokenizer.cpp
index eace42d309..39e3f8b594 100644
--- a/Libraries/LibWeb/Parser/HTMLTokenizer.cpp
+++ b/Libraries/LibWeb/Parser/HTMLTokenizer.cpp
@@ -30,7 +30,11 @@
//#define TOKENIZER_TRACE
-#define TODO ASSERT_NOT_REACHED
+#define TODO() \
+ do { \
+ dbg() << "[TODO: " << state_name(m_state) << "] '" << (char)current_input_character.value() << "'"; \
+ ASSERT_NOT_REACHED(); \
+ } while (0)
#define SWITCH_TO(new_state) \
will_switch_to(State::new_state); \
@@ -115,9 +119,9 @@ void HTMLTokenizer::run()
}
ANYTHING_ELSE
{
- create_new_token(HTMLToken::Type::Character);
+ if (m_current_token.type() != HTMLToken::Type::Character)
+ create_new_token(HTMLToken::Type::Character);
m_current_token.m_comment_or_character.data.append(current_input_character.value());
- emit_current_token();
continue;
}
}
@@ -138,6 +142,14 @@ void HTMLTokenizer::run()
create_new_token(HTMLToken::Type::StartTag);
RECONSUME_IN(TagName);
}
+ ON('?')
+ {
+ TODO();
+ }
+ ANYTHING_ELSE
+ {
+ TODO();
+ }
}
END_STATE
@@ -699,12 +711,19 @@ void HTMLTokenizer::emit_current_token()
builder.append("} }");
}
+ if (m_current_token.type() == HTMLToken::Type::Comment || m_current_token.type() == HTMLToken::Type::Character) {
+ builder.append(" { data: '");
+ builder.append(m_current_token.m_comment_or_character.data.to_string());
+ builder.append(" }");
+ }
+
dbg() << "[" << String::format("%42s", state_name(m_state)) << "] " << builder.to_string();
m_current_token = {};
}
void HTMLTokenizer::create_new_token(HTMLToken::Type type)
{
+ flush_current_character_or_comment_if_needed();
m_current_token = {};
m_current_token.m_type = type;
}
@@ -716,6 +735,7 @@ HTMLTokenizer::HTMLTokenizer(const StringView& input)
void HTMLTokenizer::will_switch_to([[maybe_unused]] State new_state)
{
+ flush_current_character_or_comment_if_needed();
#ifdef TOKENIZER_TRACE
dbg() << "[" << state_name(m_state) << "] Switch to " << state_name(new_state);
#endif
@@ -723,9 +743,16 @@ void HTMLTokenizer::will_switch_to([[maybe_unused]] State new_state)
void HTMLTokenizer::will_reconsume_in([[maybe_unused]] State new_state)
{
+ flush_current_character_or_comment_if_needed();
#ifdef TOKENIZER_TRACE
dbg() << "[" << state_name(m_state) << "] Reconsume in " << state_name(new_state);
#endif
}
+void HTMLTokenizer::flush_current_character_or_comment_if_needed()
+{
+ if (m_current_token.type() == HTMLToken::Type::Character || m_current_token.type() == HTMLToken::Type::Comment)
+ emit_current_token();
+}
+
}
diff --git a/Libraries/LibWeb/Parser/HTMLTokenizer.h b/Libraries/LibWeb/Parser/HTMLTokenizer.h
index ec5adecb92..5573cdd46c 100644
--- a/Libraries/LibWeb/Parser/HTMLTokenizer.h
+++ b/Libraries/LibWeb/Parser/HTMLTokenizer.h
@@ -148,6 +148,7 @@ private:
void will_switch_to(State);
void will_reconsume_in(State);
+ void flush_current_character_or_comment_if_needed();
State m_state { State::Data };
State m_return_state { State::Data };