diff options
author | Ali Mohammad Pur <ali.mpfard@gmail.com> | 2021-05-20 23:11:41 +0430 |
---|---|---|
committer | Andreas Kling <kling@serenityos.org> | 2021-05-20 22:06:45 +0200 |
commit | aa7939bc6c02b474a535ea0fb7c521b3e141fd21 (patch) | |
tree | fd1bc5a025faf5d390ec871d9ff53f7143372fc0 /Userland/Libraries/LibWeb | |
parent | fd982f6562c45eee94b0da262723d71429660693 (diff) | |
download | serenity-aa7939bc6c02b474a535ea0fb7c521b3e141fd21.zip |
LibWeb: Add position tracking information to HTML tokens
Diffstat (limited to 'Userland/Libraries/LibWeb')
4 files changed, 108 insertions, 21 deletions
diff --git a/Userland/Libraries/LibWeb/HTML/Parser/HTMLToken.cpp b/Userland/Libraries/LibWeb/HTML/Parser/HTMLToken.cpp index 77d2e2826e..fe2ae1f057 100644 --- a/Userland/Libraries/LibWeb/HTML/Parser/HTMLToken.cpp +++ b/Userland/Libraries/LibWeb/HTML/Parser/HTMLToken.cpp @@ -57,6 +57,10 @@ String HTMLToken::to_string() const builder.append("' }"); } + builder.appendff("@{}:{}-{}:{}", + m_start_position.line, m_start_position.column, + m_end_position.line, m_end_position.column); + return builder.to_string(); } diff --git a/Userland/Libraries/LibWeb/HTML/Parser/HTMLToken.h b/Userland/Libraries/LibWeb/HTML/Parser/HTMLToken.h index 06396ac6ab..c33a9947f8 100644 --- a/Userland/Libraries/LibWeb/HTML/Parser/HTMLToken.h +++ b/Userland/Libraries/LibWeb/HTML/Parser/HTMLToken.h @@ -164,12 +164,30 @@ public: String to_string() const; + const auto& start_position() const { return m_start_position; } + const auto& end_position() const { return m_end_position; } + + const auto& attributes() const + { + VERIFY(is_start_tag() || is_end_tag()); + return m_tag.attributes; + } + private: + struct Position { + size_t line { 0 }; + size_t column { 0 }; + }; + struct AttributeBuilder { StringBuilder prefix_builder; StringBuilder local_name_builder; StringBuilder namespace_builder; StringBuilder value_builder; + Position name_start_position; + Position value_start_position; + Position name_end_position; + Position value_end_position; }; Type m_type { Type::Invalid }; @@ -201,6 +219,9 @@ private: struct { StringBuilder data; } m_comment_or_character; + + Position m_start_position; + Position m_end_position; }; } diff --git a/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp b/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp index d5c650689b..51ef6af76b 100644 --- a/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp +++ b/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp @@ -42,13 +42,13 @@ namespace Web::HTML { goto _StartOfFunction; \ } while (0) -#define RECONSUME_IN_RETURN_STATE \ - do { \ - will_reconsume_in(m_return_state); \ - m_state = m_return_state; \ - if (current_input_character.has_value()) \ - m_utf8_iterator = m_prev_utf8_iterator; \ - goto _StartOfFunction; \ +#define RECONSUME_IN_RETURN_STATE \ + do { \ + will_reconsume_in(m_return_state); \ + m_state = m_return_state; \ + if (current_input_character.has_value()) \ + restore_to(m_prev_utf8_iterator); \ + goto _StartOfFunction; \ } while (0) #define SWITCH_TO_AND_EMIT_CURRENT_TOKEN(new_state) \ @@ -81,9 +81,9 @@ namespace Web::HTML { } \ } while (0) -#define DONT_CONSUME_NEXT_INPUT_CHARACTER \ - do { \ - m_utf8_iterator = m_prev_utf8_iterator; \ +#define DONT_CONSUME_NEXT_INPUT_CHARACTER \ + do { \ + restore_to(m_prev_utf8_iterator); \ } while (0) #define ON(code_point) \ @@ -196,12 +196,27 @@ Optional<u32> HTMLTokenizer::next_code_point() { if (m_utf8_iterator == m_utf8_view.end()) return {}; - m_prev_utf8_iterator = m_utf8_iterator; - ++m_utf8_iterator; + skip(1); dbgln_if(TOKENIZER_TRACE_DEBUG, "(Tokenizer) Next code_point: {}", (char)*m_prev_utf8_iterator); return *m_prev_utf8_iterator; } +void HTMLTokenizer::skip(size_t count) +{ + m_prev_utf8_iterator = m_utf8_iterator; + m_source_positions.append(m_source_positions.last()); + for (size_t i = 0; i < count; ++i) { + auto code_point = *m_utf8_iterator; + if (code_point == '\n') { + m_source_positions.last().column = 0; + m_source_positions.last().line++; + } else { + m_source_positions.last().column++; + } + ++m_utf8_iterator; + } +} + Optional<u32> HTMLTokenizer::peek_code_point(size_t offset) const { auto it = m_utf8_iterator; @@ -287,35 +302,42 @@ _StartOfFunction: { ON_WHITESPACE { + m_current_token.m_end_position = nth_last_position(1); SWITCH_TO(BeforeAttributeName); } ON('/') { + m_current_token.m_end_position = nth_last_position(1); SWITCH_TO(SelfClosingStartTag); } ON('>') { + m_current_token.m_end_position = nth_last_position(1); SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data); } ON_ASCII_UPPER_ALPHA { m_current_token.m_tag.tag_name.append(tolower(current_input_character.value())); + m_current_token.m_end_position = nth_last_position(0); continue; } ON(0) { log_parse_error(); m_current_token.m_tag.tag_name.append_code_point(0xFFFD); + m_current_token.m_end_position = nth_last_position(0); continue; } ON_EOF { log_parse_error(); + m_current_token.m_end_position = nth_last_position(1); EMIT_EOF; } ANYTHING_ELSE { m_current_token.m_tag.tag_name.append_code_point(current_input_character.value()); + m_current_token.m_end_position = nth_last_position(0); continue; } } @@ -966,6 +988,8 @@ _StartOfFunction: } ON('/') { + if (!m_current_token.m_tag.attributes.is_empty()) + m_current_token.m_tag.attributes.last().name_end_position = nth_last_position(1); RECONSUME_IN(AfterAttributeName); } ON('>') @@ -980,13 +1004,16 @@ _StartOfFunction: { log_parse_error(); auto new_attribute = HTMLToken::AttributeBuilder(); + new_attribute.name_start_position = nth_last_position(1); new_attribute.local_name_builder.append_code_point(current_input_character.value()); m_current_token.m_tag.attributes.append(new_attribute); SWITCH_TO(AttributeName); } ANYTHING_ELSE { - m_current_token.m_tag.attributes.append(HTMLToken::AttributeBuilder()); + auto new_attribute = HTMLToken::AttributeBuilder(); + new_attribute.name_start_position = nth_last_position(1); + m_current_token.m_tag.attributes.append(move(new_attribute)); RECONSUME_IN(AttributeName); } } @@ -1081,6 +1108,7 @@ _StartOfFunction: } ON('=') { + m_current_token.m_tag.attributes.last().name_end_position = nth_last_position(1); SWITCH_TO(BeforeAttributeValue); } ON('>') @@ -1095,6 +1123,7 @@ _StartOfFunction: ANYTHING_ELSE { m_current_token.m_tag.attributes.append(HTMLToken::AttributeBuilder()); + m_current_token.m_tag.attributes.last().name_start_position = m_source_positions.last(); RECONSUME_IN(AttributeName); } } @@ -1102,6 +1131,7 @@ _StartOfFunction: BEGIN_STATE(BeforeAttributeValue) { + m_current_token.m_tag.attributes.last().value_start_position = nth_last_position(1); ON_WHITESPACE { continue; @@ -1190,6 +1220,7 @@ _StartOfFunction: { ON_WHITESPACE { + m_current_token.m_tag.attributes.last().value_end_position = nth_last_position(2); SWITCH_TO(BeforeAttributeName); } ON('&') @@ -1199,6 +1230,7 @@ _StartOfFunction: } ON('>') { + m_current_token.m_tag.attributes.last().value_end_position = nth_last_position(2); SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data); } ON(0) @@ -1248,6 +1280,7 @@ _StartOfFunction: BEGIN_STATE(AfterAttributeValueQuoted) { + m_current_token.m_tag.attributes.last().value_end_position = nth_last_position(2); ON_WHITESPACE { SWITCH_TO(BeforeAttributeName); @@ -1514,10 +1547,7 @@ _StartOfFunction: auto match = HTML::code_points_from_entity(m_decoded_input.substring_view(byte_offset, m_decoded_input.length() - byte_offset - 1)); if (match.has_value()) { - for (size_t i = 0; i < match.value().entity.length() - 1; ++i) { - m_prev_utf8_iterator = m_utf8_iterator; - ++m_utf8_iterator; - } + skip(match->entity.length() - 1); for (auto ch : match.value().entity) m_temporary_buffer.append(ch); @@ -2571,10 +2601,7 @@ bool HTMLTokenizer::consume_next_if_match(const StringView& string, CaseSensitiv if (code_point.value() != (u32)string[i]) return false; } - for (size_t i = 0; i < string.length(); ++i) { - m_prev_utf8_iterator = m_utf8_iterator; - ++m_utf8_iterator; - } + skip(string.length()); return true; } @@ -2582,6 +2609,19 @@ void HTMLTokenizer::create_new_token(HTMLToken::Type type) { m_current_token = {}; m_current_token.m_type = type; + size_t offset = 0; + switch (type) { + case HTMLToken::Type::StartTag: + offset = 1; + break; + case HTMLToken::Type::EndTag: + offset = 2; + break; + default: + break; + } + + m_current_token.m_start_position = nth_last_position(offset); } HTMLTokenizer::HTMLTokenizer(const StringView& input, const String& encoding) @@ -2591,6 +2631,7 @@ HTMLTokenizer::HTMLTokenizer(const StringView& input, const String& encoding) m_decoded_input = decoder->to_utf8(input); m_utf8_view = Utf8View(m_decoded_input); m_utf8_iterator = m_utf8_view.begin(); + m_source_positions.empend(0u, 0u); } void HTMLTokenizer::will_switch_to([[maybe_unused]] State new_state) @@ -2613,6 +2654,7 @@ void HTMLTokenizer::will_emit(HTMLToken& token) { if (token.is_start_tag()) m_last_emitted_start_tag = token; + token.m_end_position = m_source_positions.last(); } bool HTMLTokenizer::current_end_tag_token_is_appropriate() const @@ -2628,4 +2670,18 @@ bool HTMLTokenizer::consumed_as_part_of_an_attribute() const return m_return_state == State::AttributeValueUnquoted || m_return_state == State::AttributeValueSingleQuoted || m_return_state == State::AttributeValueDoubleQuoted; } +void HTMLTokenizer::restore_to(const Utf8CodepointIterator& new_iterator) +{ + if (new_iterator != m_prev_utf8_iterator) { + auto diff = m_prev_utf8_iterator - new_iterator; + if (diff > 0) { + for (ssize_t i = 0; i < diff; ++i) + m_source_positions.take_last(); + } else { + // Going forwards...? + TODO(); + } + } +} + } diff --git a/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.h b/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.h index 8c5e02d05d..5edfaf2271 100644 --- a/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.h +++ b/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.h @@ -117,6 +117,7 @@ public: String source() const { return m_decoded_input; } private: + void skip(size_t count); Optional<u32> next_code_point(); Optional<u32> peek_code_point(size_t offset) const; bool consume_next_if_match(const StringView&, CaseSensitivity = CaseSensitivity::CaseSensitive); @@ -141,6 +142,9 @@ private: bool consumed_as_part_of_an_attribute() const; + void restore_to(const Utf8CodepointIterator& new_iterator); + auto& nth_last_position(size_t n = 0) { return m_source_positions.at(m_source_positions.size() - 1 - n); } + State m_state { State::Data }; State m_return_state { State::Data }; @@ -165,6 +169,8 @@ private: u32 m_character_reference_code { 0 }; bool m_blocked { false }; + + Vector<HTMLToken::Position> m_source_positions; }; } |