diff options
author | Sam Atkins <atkinssj@serenityos.org> | 2022-10-03 16:38:46 +0100 |
---|---|---|
committer | Linus Groh <mail@linusgroh.de> | 2022-10-03 17:09:41 +0100 |
commit | 164094e161de3df98a9d169161cf639af9c09a56 (patch) | |
tree | 6c20a9330732fa0f42a4fb3d473f168b4e104c52 /Userland/Libraries/LibWeb | |
parent | 97e174afcd1990d1aefcf603a0a8f72046486695 (diff) | |
download | serenity-164094e161de3df98a9d169161cf639af9c09a56.zip |
LibWeb: Bring CSS tokenization preprocessing closer to spec
This is based on an editorial change in the December 2021 version of
SYNTAX-3: https://www.w3.org/TR/2021/CRD-css-syntax-3-20211224/
They named this step "filter code points", so let's use that name.
Diffstat (limited to 'Userland/Libraries/LibWeb')
-rw-r--r-- | Userland/Libraries/LibWeb/CSS/Parser/Tokenizer.cpp | 74 |
1 file changed, 38 insertions, 36 deletions
```diff
diff --git a/Userland/Libraries/LibWeb/CSS/Parser/Tokenizer.cpp b/Userland/Libraries/LibWeb/CSS/Parser/Tokenizer.cpp
index 2399e0d39a..cd450d5e11 100644
--- a/Userland/Libraries/LibWeb/CSS/Parser/Tokenizer.cpp
+++ b/Userland/Libraries/LibWeb/CSS/Parser/Tokenizer.cpp
@@ -196,48 +196,50 @@ static inline bool is_E(u32 code_point)
 
 Tokenizer::Tokenizer(StringView input, String const& encoding)
 {
-    auto* decoder = TextCodec::decoder_for(encoding);
-    VERIFY(decoder);
-
-    StringBuilder builder(input.length());
-
-    // Preprocess the stream, by doing the following:
-    // - Replace \r, \f and \r\n with \n
-    // - replace \0 and anything between U+D800 to U+DFFF with the replacement
-    //   character.
-    // https://www.w3.org/TR/css-syntax-3/#input-preprocessing
-    bool last_was_carriage_return = false;
-    decoder->process(input, [&builder, &last_was_carriage_return](u32 code_point) {
-        if (code_point == '\r') {
-            if (last_was_carriage_return) {
-                builder.append('\n');
+    // https://www.w3.org/TR/css-syntax-3/#css-filter-code-points
+    auto filter_code_points = [](StringView input, auto const& encoding) -> String {
+        auto* decoder = TextCodec::decoder_for(encoding);
+        VERIFY(decoder);
+
+        StringBuilder builder { input.length() };
+        bool last_was_carriage_return = false;
+
+        // To filter code points from a stream of (unfiltered) code points input:
+        decoder->process(input, [&builder, &last_was_carriage_return](u32 code_point) {
+            // Replace any U+000D CARRIAGE RETURN (CR) code points,
+            // U+000C FORM FEED (FF) code points,
+            // or pairs of U+000D CARRIAGE RETURN (CR) followed by U+000A LINE FEED (LF)
+            // in input by a single U+000A LINE FEED (LF) code point.
+            if (code_point == '\r') {
+                if (last_was_carriage_return) {
+                    builder.append('\n');
+                } else {
+                    last_was_carriage_return = true;
+                }
             } else {
-                last_was_carriage_return = true;
-            }
-        } else {
-            if (last_was_carriage_return) {
-                builder.append('\n');
-            }
+                if (last_was_carriage_return)
+                    builder.append('\n');
+
+                if (code_point == '\n') {
+                    if (!last_was_carriage_return)
+                        builder.append('\n');
 
-            if (code_point == '\n') {
-                if (!last_was_carriage_return) {
+                } else if (code_point == '\f') {
                     builder.append('\n');
+                    // Replace any U+0000 NULL or surrogate code points in input with U+FFFD REPLACEMENT CHARACTER (�).
+                } else if (code_point == 0x00 || (code_point >= 0xD800 && code_point <= 0xDFFF)) {
+                    builder.append_code_point(REPLACEMENT_CHARACTER);
+                } else {
+                    builder.append_code_point(code_point);
                 }
-            } else if (code_point == '\f') {
-                builder.append('\n');
-            } else if (code_point == 0x00) {
-                builder.append_code_point(REPLACEMENT_CHARACTER);
-            } else if (code_point >= 0xD800 && code_point <= 0xDFFF) {
-                builder.append_code_point(REPLACEMENT_CHARACTER);
-            } else {
-                builder.append_code_point(code_point);
-            }
 
-            last_was_carriage_return = false;
-        }
-    });
+                last_was_carriage_return = false;
+            }
+        });
+        return builder.to_string();
+    };
 
-    m_decoded_input = builder.to_string();
+    m_decoded_input = filter_code_points(input, encoding);
     m_utf8_view = Utf8View(m_decoded_input);
     m_utf8_iterator = m_utf8_view.begin();
 }
```