summary | refs | log | tree | commit | diff
path: root/Userland/Libraries/LibWeb
diff options
context:
space:
mode:
author: Sam Atkins <atkinssj@serenityos.org>, 2022-10-03 16:38:46 +0100
committer: Linus Groh <mail@linusgroh.de>, 2022-10-03 17:09:41 +0100
commit: 164094e161de3df98a9d169161cf639af9c09a56 (patch)
tree: 6c20a9330732fa0f42a4fb3d473f168b4e104c52 /Userland/Libraries/LibWeb
parent: 97e174afcd1990d1aefcf603a0a8f72046486695 (diff)
download: serenity-164094e161de3df98a9d169161cf639af9c09a56.zip
LibWeb: Bring CSS tokenization preprocessing closer to spec
This is based on an editorial change in the December 2021 version of SYNTAX-3:
https://www.w3.org/TR/2021/CRD-css-syntax-3-20211224/

They named this step "filter code points", so let's use that name.
Diffstat (limited to 'Userland/Libraries/LibWeb')
-rw-r--r-- Userland/Libraries/LibWeb/CSS/Parser/Tokenizer.cpp | 74
1 files changed, 38 insertions, 36 deletions
diff --git a/Userland/Libraries/LibWeb/CSS/Parser/Tokenizer.cpp b/Userland/Libraries/LibWeb/CSS/Parser/Tokenizer.cpp
index 2399e0d39a..cd450d5e11 100644
--- a/Userland/Libraries/LibWeb/CSS/Parser/Tokenizer.cpp
+++ b/Userland/Libraries/LibWeb/CSS/Parser/Tokenizer.cpp
@@ -196,48 +196,50 @@ static inline bool is_E(u32 code_point)
Tokenizer::Tokenizer(StringView input, String const& encoding)
{
- auto* decoder = TextCodec::decoder_for(encoding);
- VERIFY(decoder);
-
- StringBuilder builder(input.length());
-
- // Preprocess the stream, by doing the following:
- // - Replace \r, \f and \r\n with \n
- // - replace \0 and anything between U+D800 to U+DFFF with the replacement
- // character.
- // https://www.w3.org/TR/css-syntax-3/#input-preprocessing
- bool last_was_carriage_return = false;
- decoder->process(input, [&builder, &last_was_carriage_return](u32 code_point) {
- if (code_point == '\r') {
- if (last_was_carriage_return) {
- builder.append('\n');
+ // https://www.w3.org/TR/css-syntax-3/#css-filter-code-points
+ auto filter_code_points = [](StringView input, auto const& encoding) -> String {
+ auto* decoder = TextCodec::decoder_for(encoding);
+ VERIFY(decoder);
+
+ StringBuilder builder { input.length() };
+ bool last_was_carriage_return = false;
+
+ // To filter code points from a stream of (unfiltered) code points input:
+ decoder->process(input, [&builder, &last_was_carriage_return](u32 code_point) {
+ // Replace any U+000D CARRIAGE RETURN (CR) code points,
+ // U+000C FORM FEED (FF) code points,
+ // or pairs of U+000D CARRIAGE RETURN (CR) followed by U+000A LINE FEED (LF)
+ // in input by a single U+000A LINE FEED (LF) code point.
+ if (code_point == '\r') {
+ if (last_was_carriage_return) {
+ builder.append('\n');
+ } else {
+ last_was_carriage_return = true;
+ }
} else {
- last_was_carriage_return = true;
- }
- } else {
- if (last_was_carriage_return) {
- builder.append('\n');
- }
+ if (last_was_carriage_return)
+ builder.append('\n');
+
+ if (code_point == '\n') {
+ if (!last_was_carriage_return)
+ builder.append('\n');
- if (code_point == '\n') {
- if (!last_was_carriage_return) {
+ } else if (code_point == '\f') {
builder.append('\n');
+ // Replace any U+0000 NULL or surrogate code points in input with U+FFFD REPLACEMENT CHARACTER (�).
+ } else if (code_point == 0x00 || (code_point >= 0xD800 && code_point <= 0xDFFF)) {
+ builder.append_code_point(REPLACEMENT_CHARACTER);
+ } else {
+ builder.append_code_point(code_point);
}
- } else if (code_point == '\f') {
- builder.append('\n');
- } else if (code_point == 0x00) {
- builder.append_code_point(REPLACEMENT_CHARACTER);
- } else if (code_point >= 0xD800 && code_point <= 0xDFFF) {
- builder.append_code_point(REPLACEMENT_CHARACTER);
- } else {
- builder.append_code_point(code_point);
- }
- last_was_carriage_return = false;
- }
- });
+ last_was_carriage_return = false;
+ }
+ });
+ return builder.to_string();
+ };
- m_decoded_input = builder.to_string();
+ m_decoded_input = filter_code_points(input, encoding);
m_utf8_view = Utf8View(m_decoded_input);
m_utf8_iterator = m_utf8_view.begin();
}