diff options
author | Sam Atkins <atkinssj@serenityos.org> | 2021-10-28 12:50:05 +0100 |
---|---|---|
committer | Andreas Kling <kling@serenityos.org> | 2021-11-19 22:35:05 +0100 |
commit | 85e5586a27e5a1b7fa43cd46903a05984c1d3759 (patch) | |
tree | fb0ab3cc5c6eac5415b60261ee1d0f256b2545c1 /Userland/Libraries/LibWeb | |
parent | 9403cc42f9d0455100722814ace98390165d67ef (diff) | |
download | serenity-85e5586a27e5a1b7fa43cd46903a05984c1d3759.zip |
LibWeb: Add spec comments to CSS Tokenizer
Some of the code has been slightly rearranged to match the spec order,
but otherwise I've tried not to mess with it.
Diffstat (limited to 'Userland/Libraries/LibWeb')
-rw-r--r-- | Userland/Libraries/LibWeb/CSS/Parser/Tokenizer.cpp | 386 |
1 files changed, 342 insertions, 44 deletions
diff --git a/Userland/Libraries/LibWeb/CSS/Parser/Tokenizer.cpp b/Userland/Libraries/LibWeb/CSS/Parser/Tokenizer.cpp index 248613a033..545ee4cfbd 100644 --- a/Userland/Libraries/LibWeb/CSS/Parser/Tokenizer.cpp +++ b/Userland/Libraries/LibWeb/CSS/Parser/Tokenizer.cpp @@ -343,14 +343,18 @@ Token Tokenizer::create_value_token(Token::Type type, u32 value) // https://www.w3.org/TR/css-syntax-3/#consume-escaped-code-point u32 Tokenizer::consume_escaped_code_point() { - auto input = next_code_point(); + // This section describes how to consume an escaped code point. + // It assumes that the U+005C REVERSE SOLIDUS (\) has already been consumed and that the next + // input code point has already been verified to be part of a valid escape. + // It will return a code point. - if (is_eof(input)) { - log_parse_error(); - return REPLACEMENT_CHARACTER; - } + // Consume the next input code point. + auto input = next_code_point(); + // hex digit if (is_ascii_hex_digit(input)) { + // Consume as many hex digits as possible, but no more than 5. + // Note that this means 1-6 hex digits have been consumed in total. StringBuilder builder; builder.append_code_point(input); @@ -359,34 +363,50 @@ u32 Tokenizer::consume_escaped_code_point() builder.append_code_point(next_code_point()); } + // If the next input code point is whitespace, consume it as well. if (is_whitespace(peek_code_point())) { (void)next_code_point(); } + // Interpret the hex digits as a hexadecimal number. auto unhexed = strtoul(builder.to_string().characters(), nullptr, 16); + // If this number is zero, or is for a surrogate, or is greater than the maximum allowed + // code point, return U+FFFD REPLACEMENT CHARACTER (�). if (unhexed == 0 || is_unicode_surrogate(unhexed) || is_greater_than_maximum_allowed_code_point(unhexed)) { return REPLACEMENT_CHARACTER; } + // Otherwise, return the code point with that value. return unhexed; } - if (!input) { + // EOF + if (is_eof(input)) { + // This is a parse error. Return U+FFFD REPLACEMENT CHARACTER (�). log_parse_error(); return REPLACEMENT_CHARACTER; } + // anything else + // Return the current input code point. return input; } // https://www.w3.org/TR/css-syntax-3/#consume-ident-like-token Token Tokenizer::consume_an_ident_like_token() { + // This section describes how to consume an ident-like token from a stream of code points. + // It returns an <ident-token>, <function-token>, <url-token>, or <bad-url-token>. + + // Consume a name, and let string be the result. auto string = consume_a_name(); + // If string’s value is an ASCII case-insensitive match for "url", and the next input code + // point is U+0028 LEFT PARENTHESIS ((), consume it. if (string.equals_ignoring_case("url") && is_left_paren(peek_code_point())) { (void)next_code_point(); + // While the next two input code points are whitespace, consume the next input code point. for (;;) { auto maybe_whitespace = peek_twin(); if (!(is_whitespace(maybe_whitespace.first) && is_whitespace(maybe_whitespace.second))) { @@ -396,35 +416,54 @@ Token Tokenizer::consume_an_ident_like_token() (void)next_code_point(); } + // If the next one or two input code points are U+0022 QUOTATION MARK ("), U+0027 APOSTROPHE ('), + // or whitespace followed by U+0022 QUOTATION MARK (") or U+0027 APOSTROPHE ('), then create a + // <function-token> with its value set to string and return it. auto next_two = peek_twin(); - // if one of these ", ', ' "', " '" if (is_quotation_mark(next_two.first) || is_apostrophe(next_two.first) || (is_whitespace(next_two.first) && (is_quotation_mark(next_two.second) || is_apostrophe(next_two.second)))) { return create_value_token(Token::Type::Function, string); } + // Otherwise, consume a url token, and return it. return consume_a_url_token(); } + // Otherwise, if the next input code point is U+0028 LEFT PARENTHESIS ((), consume it. if (is_left_paren(peek_code_point())) { (void)next_code_point(); + // Create a <function-token> with its value set to string and return it. return create_value_token(Token::Type::Function, string); } + // Otherwise, create an <ident-token> with its value set to string and return it. return create_value_token(Token::Type::Ident, string); } // https://www.w3.org/TR/css-syntax-3/#consume-number CSSNumber Tokenizer::consume_a_number() { + // This section describes how to consume a number from a stream of code points. + // It returns a numeric value, and a type which is either "integer" or "number". + // + // Note: This algorithm does not do the verification of the first few code points + // that are necessary to ensure a number can be obtained from the stream. Ensure + // that the stream starts with a number before calling this algorithm. + + // Execute the following steps in order: + + // 1. Initially set type to "integer". Let repr be the empty string. StringBuilder repr; Token::NumberType type = Token::NumberType::Integer; + // 2. If the next input code point is U+002B PLUS SIGN (+) or U+002D HYPHEN-MINUS (-), + // consume it and append it to repr. auto next_input = peek_code_point(); if (is_plus_sign(next_input) || is_hyphen_minus(next_input)) { repr.append_code_point(next_code_point()); } + // 3. While the next input code point is a digit, consume it and append it to repr. for (;;) { auto digits = peek_code_point(); if (!is_ascii_digit(digits)) @@ -433,13 +472,18 @@ CSSNumber Tokenizer::consume_a_number() repr.append_code_point(next_code_point()); } + // 4. If the next 2 input code points are U+002E FULL STOP (.) followed by a digit, then: auto maybe_number = peek_twin(); if (is_full_stop(maybe_number.first) && is_ascii_digit(maybe_number.second)) { + // 1. Consume them. + // 2. Append them to repr. repr.append_code_point(next_code_point()); repr.append_code_point(next_code_point()); + // 3. Set type to "number". type = Token::NumberType::Number; + // 4. While the next input code point is a digit, consume it and append it to repr. for (;;) { auto digit = peek_code_point(); if (!is_ascii_digit(digit)) @@ -449,8 +493,14 @@ CSSNumber Tokenizer::consume_a_number() } } + // 5. If the next 2 or 3 input code points are U+0045 LATIN CAPITAL LETTER E (E) or + // U+0065 LATIN SMALL LETTER E (e), optionally followed by U+002D HYPHEN-MINUS (-) + // or U+002B PLUS SIGN (+), followed by a digit, then: auto maybe_exp = peek_triplet(); if (is_E(maybe_exp.first) || is_e(maybe_exp.first)) { + // 1. Consume them. + // 2. Append them to repr. + // FIXME: These conditions should be part of step 5 above. if (is_plus_sign(maybe_exp.second) || is_hyphen_minus(maybe_exp.second)) { if (is_ascii_digit(maybe_exp.third)) { repr.append_code_point(next_code_point()); @@ -462,8 +512,10 @@ CSSNumber Tokenizer::consume_a_number() repr.append_code_point(next_code_point()); } + // 3. Set type to "number". type = Token::NumberType::Number; + // 4. While the next input code point is a digit, consume it and append it to repr. for (;;) { auto digits = peek_code_point(); if (!is_ascii_digit(digits)) @@ -473,7 +525,11 @@ CSSNumber Tokenizer::consume_a_number() } } - return { repr.to_string(), convert_a_string_to_a_number(repr.string_view()), type }; + // 6. Convert repr to a number, and set the value to the returned value. + auto value = convert_a_string_to_a_number(repr.string_view()); + + // 7. Return value and type. + return { repr.to_string(), value, type }; } // https://www.w3.org/TR/css-syntax-3/#convert-string-to-number @@ -556,116 +612,188 @@ double Tokenizer::convert_a_string_to_a_number(StringView string) // https://www.w3.org/TR/css-syntax-3/#consume-name String Tokenizer::consume_a_name() { + // This section describes how to consume a name from a stream of code points. + // It returns a string containing the largest name that can be formed from adjacent + // code points in the stream, starting from the first. + // + // Note: This algorithm does not do the verification of the first few code points that + // are necessary to ensure the returned code points would constitute an <ident-token>. + // If that is the intended use, ensure that the stream starts with an identifier before + // calling this algorithm. + + // Let result initially be an empty string. StringBuilder result; + // Repeatedly consume the next input code point from the stream: for (;;) { auto input = next_code_point(); if (is_eof(input)) break; + // name code point if (is_name_code_point(input)) { + // Append the code point to result. result.append_code_point(input); continue; } + // the stream starts with a valid escape auto next = peek_code_point(); if (!is_eof(next) && is_valid_escape_sequence({ input, next })) { + // Consume an escaped code point. Append the returned code point to result. result.append_code_point(consume_escaped_code_point()); continue; } + // anything else + // Reconsume the current input code point. Return result. reconsume_current_input_code_point(); break; } return result.to_string(); } + +// https://www.w3.org/TR/css-syntax-3/#consume-url-token Token Tokenizer::consume_a_url_token() { + // This section describes how to consume a url token from a stream of code points. + // It returns either a <url-token> or a <bad-url-token>. + // + // Note: This algorithm assumes that the initial "url(" has already been consumed. + // This algorithm also assumes that it’s being called to consume an "unquoted" value, + // like url(foo). A quoted value, like url("foo"), is parsed as a <function-token>. + // Consume an ident-like token automatically handles this distinction; this algorithm + // shouldn’t be called directly otherwise. + + // 1. Initially create a <url-token> with its value set to the empty string. auto token = create_new_token(Token::Type::Url); - consume_as_much_whitespace_as_possible(); StringBuilder builder; + // 2. Consume as much whitespace as possible. + consume_as_much_whitespace_as_possible(); + auto make_token = [&]() { token.m_value = builder.to_string(); return token; }; + // 3. Repeatedly consume the next input code point from the stream: for (;;) { - + // NOTE: We peek here instead of consuming, so that we can peek a twin later + // to determine if it's a valid escape sequence. auto input = peek_code_point(); - if (is_eof(input)) { - log_parse_error(); + + // U+0029 RIGHT PARENTHESIS ()) + if (is_right_paren(input)) { + // Return the <url-token>. + (void)next_code_point(); // Not to spec, see NOTE above. return make_token(); } - if (is_right_paren(input)) { - (void)next_code_point(); + // EOF + if (is_eof(input)) { + // This is a parse error. Return the <url-token>. + log_parse_error(); return make_token(); } + // whitespace if (is_whitespace(input)) { + // Consume as much whitespace as possible. consume_as_much_whitespace_as_possible(); + + // If the next input code point is U+0029 RIGHT PARENTHESIS ()) or EOF, consume it + // and return the <url-token> (if EOF was encountered, this is a parse error); input = peek_code_point(); - if (is_eof(input)) { - log_parse_error(); + if (is_right_paren(input)) { + (void)next_code_point(); return make_token(); } - if (is_right_paren(input)) { + if (is_eof(input)) { + (void)next_code_point(); + log_parse_error(); return make_token(); } + // otherwise, consume the remnants of a bad url, create a <bad-url-token>, and return it. consume_the_remnants_of_a_bad_url(); return create_new_token(Token::Type::BadUrl); } + // U+0022 QUOTATION MARK (") + // U+0027 APOSTROPHE (') + // U+0028 LEFT PARENTHESIS (() + // non-printable code point if (is_quotation_mark(input) || is_apostrophe(input) || is_left_paren(input) || is_non_printable(input)) { + // This is a parse error. Consume the remnants of a bad url, create a <bad-url-token>, and return it. log_parse_error(); - (void)next_code_point(); + (void)next_code_point(); // Not to spec, see NOTE above. consume_the_remnants_of_a_bad_url(); return create_new_token(Token::Type::BadUrl); } + // U+005C REVERSE SOLIDUS (\) if (is_reverse_solidus(input)) { + // If the stream starts with a valid escape, if (is_valid_escape_sequence(peek_twin())) { + // consume an escaped code point and append the returned code point to the <url-token>’s value. builder.append_code_point(consume_escaped_code_point()); } else { + // Otherwise, this is a parse error. log_parse_error(); - (void)next_code_point(); + (void)next_code_point(); // Not to spec, see NOTE above. + // Consume the remnants of a bad url, create a <bad-url-token>, and return it. consume_the_remnants_of_a_bad_url(); return create_new_token(Token::Type::BadUrl); } } - builder.append_code_point(next_code_point()); + // anything else + // Append the current input code point to the <url-token>’s value. + builder.append_code_point(input); + (void)next_code_point(); // Not to spec, see NOTE above. } } // https://www.w3.org/TR/css-syntax-3/#consume-remnants-of-bad-url void Tokenizer::consume_the_remnants_of_a_bad_url() { - for (;;) { - auto next = peek_code_point(); - - if (is_eof(next)) { - return; - } + // This section describes how to consume the remnants of a bad url from a stream of code points, + // "cleaning up" after the tokenizer realizes that it’s in the middle of a <bad-url-token> rather + // than a <url-token>. It returns nothing; its sole use is to consume enough of the input stream + // to reach a recovery point where normal tokenizing can resume. - auto input = next; + // Repeatedly consume the next input code point from the stream: + for (;;) { + // NOTE: We peek instead of consuming so is_valid_escape_sequence() can peek a twin. + // So, we have to consume the code point later. + auto input = peek_code_point(); - if (is_right_paren(input)) { - (void)next_code_point(); + // U+0029 RIGHT PARENTHESIS ()) + // EOF + if (is_eof(input) || is_right_paren(input)) { + (void)next_code_point(); // Not to spec, see NOTE above. + // Return. return; } + // the input stream starts with a valid escape if (is_valid_escape_sequence(peek_twin())) { - [[maybe_unused]] auto cp = consume_escaped_code_point(); + // Consume an escaped code point. + // This allows an escaped right parenthesis ("\)") to be encountered without ending + // the <bad-url-token>. This is otherwise identical to the "anything else" clause. + (void)next_code_point(); // Not to spec, see NOTE above. + (void)consume_escaped_code_point(); } - (void)next_code_point(); + // anything else + // Do nothing. + + (void)next_code_point(); // Not to spec, see NOTE above. } } @@ -685,23 +813,35 @@ void Tokenizer::reconsume_current_input_code_point() // https://www.w3.org/TR/css-syntax-3/#consume-numeric-token Token Tokenizer::consume_a_numeric_token() { + // This section describes how to consume a numeric token from a stream of code points. + // It returns either a <number-token>, <percentage-token>, or <dimension-token>. + + // Consume a number and let number be the result. auto number = consume_a_number(); + + // If the next 3 input code points would start an identifier, then: if (would_start_an_identifier()) { + // 1. Create a <dimension-token> with the same value and type flag as number, + // and a unit set initially to the empty string. auto token = create_new_token(Token::Type::Dimension); token.m_value = move(number.string); token.m_number_type = number.type; token.m_number_value = number.value; + // 2. Consume a name. Set the <dimension-token>’s unit to the returned value. auto unit = consume_a_name(); VERIFY(!unit.is_empty() && !unit.is_whitespace()); token.m_unit = move(unit); + // 3. Return the <dimension-token>. return token; } + // Otherwise, if the next input code point is U+0025 PERCENTAGE SIGN (%), consume it. if (is_percent(peek_code_point())) { (void)next_code_point(); + // Create a <percentage-token> with the same value as number, and return it. auto token = create_new_token(Token::Type::Percentage); token.m_value = move(number.string); token.m_number_type = number.type; @@ -709,6 +849,7 @@ Token Tokenizer::consume_a_numeric_token() return token; } + // Otherwise, create a <number-token> with the same value and type flag as number, and return it. auto token = create_new_token(Token::Type::Number); token.m_value = move(number.string); token.m_number_type = number.type; @@ -724,36 +865,67 @@ bool Tokenizer::would_start_a_number() const // https://www.w3.org/TR/css-syntax-3/#starts-with-a-number bool Tokenizer::would_start_a_number(U32Triplet values) { + // This section describes how to check if three code points would start a number. + // The algorithm described here can be called explicitly with three code points, + // or can be called with the input stream itself. In the latter case, the three + // code points in question are the current input code point and the next two input + // code points, in that order. + // + // Note: This algorithm will not consume any additional code points. + + // Look at the first code point: + + // U+002B PLUS SIGN (+) + // U+002D HYPHEN-MINUS (-) if (is_plus_sign(values.first) || is_hyphen_minus(values.first)) { + // If the second code point is a digit, return true. if (is_ascii_digit(values.second)) return true; + // Otherwise, if the second code point is a U+002E FULL STOP (.) and the third + // code point is a digit, return true. if (is_full_stop(values.second) && is_ascii_digit(values.third)) return true; + // Otherwise, return false. return false; } + // U+002E FULL STOP (.) if (is_full_stop(values.first)) + // If the second code point is a digit, return true. Otherwise, return false. return is_ascii_digit(values.second); + // digit if (is_ascii_digit(values.first)) + // Return true. return true; + // anything else + // Return false. return false; } // https://www.w3.org/TR/css-syntax-3/#starts-with-a-valid-escape bool Tokenizer::is_valid_escape_sequence(U32Twin values) { - if (!is_reverse_solidus(values.first)) { + // This section describes how to check if two code points are a valid escape. + // The algorithm described here can be called explicitly with two code points, + // or can be called with the input stream itself. In the latter case, the two + // code points in question are the current input code point and the next input + // code point, in that order. + // + // Note: This algorithm will not consume any additional code point. + + // If the first code point is not U+005C REVERSE SOLIDUS (\), return false. + if (!is_reverse_solidus(values.first)) return false; - } - if (is_newline(values.second)) { + // Otherwise, if the second code point is a newline, return false. + if (is_newline(values.second)) return false; - } + // Otherwise, return true. return true; } @@ -765,28 +937,57 @@ bool Tokenizer::would_start_an_identifier() // https://www.w3.org/TR/css-syntax-3/#would-start-an-identifier bool Tokenizer::would_start_an_identifier(U32Triplet values) { + // This section describes how to check if three code points would start an identifier. + // The algorithm described here can be called explicitly with three code points, or + // can be called with the input stream itself. In the latter case, the three code + // points in question are the current input code point and the next two input code + // points, in that order. + // + // Note: This algorithm will not consume any additional code points. + + // Look at the first code point: + + // U+002D HYPHEN-MINUS if (is_hyphen_minus(values.first)) { + // If the second code point is a name-start code point or a U+002D HYPHEN-MINUS, + // or the second and third code points are a valid escape, return true. if (is_name_start_code_point(values.second) || is_hyphen_minus(values.second) || is_valid_escape_sequence(values.to_twin_23())) return true; + // Otherwise, return false. return false; } + // name-start code point if (is_name_start_code_point(values.first)) { + // Return true. return true; } + // U+005C REVERSE SOLIDUS (\) if (is_reverse_solidus(values.first)) { + // If the first and second code points are a valid escape, return true. if (is_valid_escape_sequence(values.to_twin_12())) return true; + // Otherwise, return false. return false; } + // anything else + // Return false. return false; } // https://www.w3.org/TR/css-syntax-3/#consume-string-token Token Tokenizer::consume_string_token(u32 ending_code_point) { + // This section describes how to consume a string token from a stream of code points. + // It returns either a <string-token> or <bad-string-token>. + // + // This algorithm may be called with an ending code point, which denotes the code point + // that ends the string. If an ending code point is not specified, the current input + // code point is used. + + // Initially create a <string-token> with its value set to the empty string. auto token = create_new_token(Token::Type::String); StringBuilder builder; @@ -795,36 +996,50 @@ Token Tokenizer::consume_string_token(u32 ending_code_point) return token; }; + // Repeatedly consume the next input code point from the stream: for (;;) { auto input = next_code_point(); + // ending code point + if (input == ending_code_point) + return make_token(); + + // EOF if (is_eof(input)) { + // This is a parse error. Return the <string-token>. log_parse_error(); return make_token(); } - if (input == ending_code_point) - return make_token(); - + // newline if (is_newline(input)) { + // This is a parse error. Reconsume the current input code point, create a + // <bad-string-token>, and return it. reconsume_current_input_code_point(); return create_new_token(Token::Type::BadString); } + // U+005C REVERSE SOLIDUS (\) if (is_reverse_solidus(input)) { + // If the next input code point is EOF, do nothing. auto next_input = peek_code_point(); if (is_eof(next_input)) continue; + // Otherwise, if the next input code point is a newline, consume it. if (is_newline(next_input)) { (void)next_code_point(); continue; } + // Otherwise, (the stream starts with a valid escape) consume an escaped code + // point and append the returned code point to the <string-token>’s value. auto escaped = consume_escaped_code_point(); builder.append_code_point(escaped); } + // anything else + // Append the current input code point to the <string-token>’s value. builder.append_code_point(input); } } @@ -832,7 +1047,17 @@ Token Tokenizer::consume_string_token(u32 ending_code_point) // https://www.w3.org/TR/css-syntax-3/#consume-comment void Tokenizer::consume_comments() { + // This section describes how to consume comments from a stream of code points. + // It returns nothing. + start: + // If the next two input code point are U+002F SOLIDUS (/) followed by a U+002A ASTERISK (*), + // consume them and all following code points up to and including the first U+002A ASTERISK (*) + // followed by a U+002F SOLIDUS (/), or up to an EOF code point. Return to the start of this step. + // + // If the preceding paragraph ended by consuming an EOF code point, this is a parse error. + // + // Return nothing. auto twin = peek_twin(); if (!(is_solidus(twin.first) && is_asterisk(twin.second))) return; @@ -860,83 +1085,114 @@ start: // https://www.w3.org/TR/css-syntax-3/#consume-token Token Tokenizer::consume_a_token() { + // This section describes how to consume a token from a stream of code points. + // It will return a single token of any type. + + // Consume comments. consume_comments(); + // Consume the next input code point. auto input = next_code_point(); - if (is_eof(input)) { - return create_new_token(Token::Type::EndOfFile); - } - + // whitespace if (is_whitespace(input)) { dbgln_if(CSS_TOKENIZER_DEBUG, "is whitespace"); + // Consume as much whitespace as possible. Return a <whitespace-token>. consume_as_much_whitespace_as_possible(); return create_new_token(Token::Type::Whitespace); } + // U+0022 QUOTATION MARK (") if (is_quotation_mark(input)) { dbgln_if(CSS_TOKENIZER_DEBUG, "is quotation mark"); + // Consume a string token and return it. return consume_string_token(input); } + // U+0023 NUMBER SIGN (#) if (is_number_sign(input)) { dbgln_if(CSS_TOKENIZER_DEBUG, "is number sign"); + // If the next input code point is a name code point or the next two input code points + // are a valid escape, then: auto next_input = peek_code_point(); auto maybe_escape = peek_twin(); if (is_name_code_point(next_input) || is_valid_escape_sequence(maybe_escape)) { + // 1. Create a <hash-token>. auto token = create_new_token(Token::Type::Hash); + // 2. If the next 3 input code points would start an identifier, set the <hash-token>’s + // type flag to "id". if (would_start_an_identifier()) token.m_hash_type = Token::HashType::Id; + // 3. Consume a name, and set the <hash-token>’s value to the returned string. auto name = consume_a_name(); token.m_value = move(name); + // 4. Return the <hash-token>. return token; } + // Otherwise, return a <delim-token> with its value set to the current input code point. return create_value_token(Token::Type::Delim, input); } + // U+0027 APOSTROPHE (') if (is_apostrophe(input)) { dbgln_if(CSS_TOKENIZER_DEBUG, "is apostrophe"); + // Consume a string token and return it. return consume_string_token(input); } + // U+0028 LEFT PARENTHESIS (() if (is_left_paren(input)) { dbgln_if(CSS_TOKENIZER_DEBUG, "is left paren"); + // Return a <(-token>. return create_new_token(Token::Type::OpenParen); } + // U+0029 RIGHT PARENTHESIS ()) if (is_right_paren(input)) { dbgln_if(CSS_TOKENIZER_DEBUG, "is right paren"); + // Return a <)-token>. return create_new_token(Token::Type::CloseParen); } + // U+002B PLUS SIGN (+) if (is_plus_sign(input)) { dbgln_if(CSS_TOKENIZER_DEBUG, "is plus sign"); + // If the input stream starts with a number, reconsume the current input code point, + // consume a numeric token and return it. if (would_start_a_number()) { reconsume_current_input_code_point(); return consume_a_numeric_token(); } + // Otherwise, return a <delim-token> with its value set to the current input code point. return create_value_token(Token::Type::Delim, input); } + // U+002C COMMA (,) if (is_comma(input)) { dbgln_if(CSS_TOKENIZER_DEBUG, "is comma"); + // Return a <comma-token>. return create_new_token(Token::Type::Comma); } + // U+002D HYPHEN-MINUS (-) if (is_hyphen_minus(input)) { dbgln_if(CSS_TOKENIZER_DEBUG, "is hyphen minus"); + // If the input stream starts with a number, reconsume the current input code point, + // consume a numeric token, and return it. if (would_start_a_number()) { reconsume_current_input_code_point(); return consume_a_numeric_token(); } + // Otherwise, if the next 2 input code points are U+002D HYPHEN-MINUS U+003E + // GREATER-THAN SIGN (->), consume them and return a <CDC-token>. auto next_twin = peek_twin(); if (is_hyphen_minus(next_twin.first) && is_greater_than_sign(next_twin.second)) { (void)next_code_point(); @@ -945,38 +1201,51 @@ Token Tokenizer::consume_a_token() return create_new_token(Token::Type::CDC); } + // Otherwise, if the input stream starts with an identifier, reconsume the current + // input code point, consume an ident-like token, and return it. if (would_start_an_identifier()) { reconsume_current_input_code_point(); return consume_an_ident_like_token(); } + // Otherwise, return a <delim-token> with its value set to the current input code point. return create_value_token(Token::Type::Delim, input); } + // U+002E FULL STOP (.) if (is_full_stop(input)) { dbgln_if(CSS_TOKENIZER_DEBUG, "is full stop"); + // If the input stream starts with a number, reconsume the current input code point, + // consume a numeric token, and return it. if (would_start_a_number()) { reconsume_current_input_code_point(); return consume_a_numeric_token(); } + // Otherwise, return a <delim-token> with its value set to the current input code point. return create_value_token(Token::Type::Delim, input); } + // U+003A COLON (:) if (is_colon(input)) { dbgln_if(CSS_TOKENIZER_DEBUG, "is colon"); + // Return a <colon-token>. return create_new_token(Token::Type::Colon); } + // U+003B SEMICOLON (;) if (is_semicolon(input)) { dbgln_if(CSS_TOKENIZER_DEBUG, "is semicolon"); + // Return a <semicolon-token>. return create_new_token(Token::Type::Semicolon); } + // U+003C LESS-THAN SIGN (<) if (is_less_than_sign(input)) { dbgln_if(CSS_TOKENIZER_DEBUG, "is less than"); + // If the next 3 input code points are U+0021 EXCLAMATION MARK U+002D HYPHEN-MINUS + // U+002D HYPHEN-MINUS (!--), consume them and return a <CDO-token>. auto maybe_cdo = peek_triplet(); - if (is_exclamation_mark(maybe_cdo.first) && is_hyphen_minus(maybe_cdo.second) && is_hyphen_minus(maybe_cdo.third)) { (void)next_code_point(); (void)next_code_point(); @@ -985,64 +1254,93 @@ Token Tokenizer::consume_a_token() return create_new_token(Token::Type::CDO); } + // Otherwise, return a <delim-token> with its value set to the current input code point. return create_value_token(Token::Type::Delim, input); } + // U+0040 COMMERCIAL AT (@) if (is_at(input)) { dbgln_if(CSS_TOKENIZER_DEBUG, "is at"); + // If the next 3 input code points would start an identifier, consume a name, create + // an <at-keyword-token> with its value set to the returned value, and return it. if (would_start_an_identifier()) { auto name = consume_a_name(); - return create_value_token(Token::Type::AtKeyword, name); } + // Otherwise, return a <delim-token> with its value set to the current input code point. return create_value_token(Token::Type::Delim, input); } + // U+005B LEFT SQUARE BRACKET ([) if (is_open_square_bracket(input)) { dbgln_if(CSS_TOKENIZER_DEBUG, "is open square"); + // Return a <[-token>. return create_new_token(Token::Type::OpenSquare); } + // U+005C REVERSE SOLIDUS (\) if (is_reverse_solidus(input)) { dbgln_if(CSS_TOKENIZER_DEBUG, "is reverse solidus"); + // If the input stream starts with a valid escape, reconsume the current input code point, + // consume an ident-like token, and return it. if (is_valid_escape_sequence({ input, peek_code_point() })) { reconsume_current_input_code_point(); return consume_an_ident_like_token(); } + // Otherwise, this is a parse error. Return a <delim-token> with its value set to the + // current input code point. log_parse_error(); return create_value_token(Token::Type::Delim, input); } + // U+005D RIGHT SQUARE BRACKET (]) if (is_closed_square_bracket(input)) { dbgln_if(CSS_TOKENIZER_DEBUG, "is closed square"); + // Return a <]-token>. return create_new_token(Token::Type::CloseSquare); } + // U+007B LEFT CURLY BRACKET ({) if (is_open_curly_bracket(input)) { dbgln_if(CSS_TOKENIZER_DEBUG, "is open curly"); + // Return a <{-token>. return create_new_token(Token::Type::OpenCurly); } + // U+007D RIGHT CURLY BRACKET (}) if (is_closed_curly_bracket(input)) { dbgln_if(CSS_TOKENIZER_DEBUG, "is closed curly"); + // Return a <}-token>. return create_new_token(Token::Type::CloseCurly); } + // digit if (is_ascii_digit(input)) { dbgln_if(CSS_TOKENIZER_DEBUG, "is digit"); + // Reconsume the current input code point, consume a numeric token, and return it. reconsume_current_input_code_point(); return consume_a_numeric_token(); } + // name-start code point if (is_name_start_code_point(input)) { dbgln_if(CSS_TOKENIZER_DEBUG, "is name start"); + // Reconsume the current input code point, consume an ident-like token, and return it. reconsume_current_input_code_point(); return consume_an_ident_like_token(); } + // EOF + if (is_eof(input)) { + // Return an <EOF-token>. + return create_new_token(Token::Type::EndOfFile); + } + + // anything else dbgln_if(CSS_TOKENIZER_DEBUG, "is delimiter"); + // Return a <delim-token> with its value set to the current input code point. return create_value_token(Token::Type::Delim, input); } |