From ecd25ce6c7dec8bf362e930705afbe1a951ab700 Mon Sep 17 00:00:00 2001 From: Andreas Kling Date: Tue, 26 May 2020 15:50:05 +0200 Subject: LibWeb: Allow HTML tokenizer to emit more than one token Tokens are now put on a queue when emitted, and we always pop from that queue when returning from next_token(). --- Libraries/LibWeb/Parser/HTMLTokenizer.cpp | 33 +++++++++++++++++++++---------- Libraries/LibWeb/Parser/HTMLTokenizer.h | 3 +++ 2 files changed, 26 insertions(+), 10 deletions(-) diff --git a/Libraries/LibWeb/Parser/HTMLTokenizer.cpp b/Libraries/LibWeb/Parser/HTMLTokenizer.cpp index ba4df5d474..1463067d1d 100644 --- a/Libraries/LibWeb/Parser/HTMLTokenizer.cpp +++ b/Libraries/LibWeb/Parser/HTMLTokenizer.cpp @@ -58,7 +58,16 @@ will_switch_to(State::new_state); \ m_state = State::new_state; \ will_emit(m_current_token); \ - return m_current_token; \ + m_queued_tokens.enqueue(m_current_token); \ + return m_queued_tokens.dequeue(); \ + } while (0) + +#define EMIT_CHARACTER_AND_RECONSUME_IN(codepoint, new_state) \ + do { \ + m_queued_tokens.enqueue(m_current_token); \ + will_reconsume_in(State::new_state); \ + m_state = State::new_state; \ + goto new_state; \ } while (0) #define DONT_CONSUME_NEXT_INPUT_CHARACTER --m_cursor; @@ -90,21 +99,23 @@ m_has_emitted_eof = true; \ create_new_token(HTMLToken::Type::EndOfFile); \ will_emit(m_current_token); \ - return m_current_token; \ + m_queued_tokens.enqueue(m_current_token); \ + return m_queued_tokens.dequeue(); \ } while (0) -#define EMIT_CURRENT_TOKEN \ - do { \ - will_emit(m_current_token); \ - return m_current_token; \ +#define EMIT_CURRENT_TOKEN \ + do { \ + will_emit(m_current_token); \ + m_queued_tokens.enqueue(m_current_token); \ + return m_queued_tokens.dequeue(); \ } while (0) #define EMIT_CHARACTER(codepoint) \ do { \ create_new_token(HTMLToken::Type::Character); \ m_current_token.m_comment_or_character.data.append(codepoint); \ - will_emit(m_current_token); \ - return m_current_token; \ + 
m_queued_tokens.enqueue(m_current_token); \ return m_queued_tokens.dequeue(); \ } while (0) #define EMIT_CURRENT_CHARACTER \ @@ -141,6 +152,9 @@ Optional HTMLTokenizer::peek_codepoint(size_t offset) const Optional<HTMLToken> HTMLTokenizer::next_token() { + if (!m_queued_tokens.is_empty()) + return m_queued_tokens.dequeue(); + for (;;) { auto current_input_character = next_codepoint(); switch (m_state) { @@ -1270,8 +1284,7 @@ Optional<HTMLToken> HTMLTokenizer::next_token() } ANYTHING_ELSE { - EMIT_CHARACTER('<'); - RECONSUME_IN(ScriptData); + EMIT_CHARACTER_AND_RECONSUME_IN('<', ScriptData); } } END_STATE diff --git a/Libraries/LibWeb/Parser/HTMLTokenizer.h b/Libraries/LibWeb/Parser/HTMLTokenizer.h index fe18e9a211..2f674eaad8 100644 --- a/Libraries/LibWeb/Parser/HTMLTokenizer.h +++ b/Libraries/LibWeb/Parser/HTMLTokenizer.h @@ -26,6 +26,7 @@ #pragma once +#include #include #include #include @@ -165,5 +166,7 @@ private: HTMLToken m_last_emitted_start_tag; bool m_has_emitted_eof { false }; + + Queue<HTMLToken> m_queued_tokens; }; } -- cgit v1.2.3