LibWeb: Convert numeric tokens to numbers in CSS Tokenizer

The spec wants us to produce numeric values as the Tokenizer sees them, rather than waiting until the parse stage. This is a first step towards that.
author: Sam Atkins <atkinssj@serenityos.org> 2021-11-17 21:01:51 +0000
committer: Andreas Kling <kling@serenityos.org> 2021-11-19 22:35:05 +0100
commit: f6869797a7a56de4db4d9d8a06bfffe6ca520d81 (patch)
tree: 6cb05453febda3b2d8e730bc614e0bbb16522042 /Userland
parent: d2ef8b29e8becdec7c0a44e1f9c1f5cb64059d5c (diff)
download: serenity-f6869797a7a56de4db4d9d8a06bfffe6ca520d81.zip
3 files changed, 90 insertions, 5 deletions
diff --git a/Userland/Libraries/LibWeb/CSS/Parser/Token.h b/Userland/Libraries/LibWeb/CSS/Parser/Token.h
index 1449d9b35b..b7e2a96480 100644
--- a/Userland/Libraries/LibWeb/CSS/Parser/Token.h
+++ b/Userland/Libraries/LibWeb/CSS/Parser/Token.h
@@ -152,6 +152,7 @@ private:
     StringBuilder m_unit;
     HashType m_hash_type { HashType::Unrestricted };
     NumberType m_number_type { NumberType::Integer };
+    double m_number_value { 0 };
 
     Position m_start_position;
     Position m_end_position;
diff --git a/Userland/Libraries/LibWeb/CSS/Parser/Tokenizer.cpp b/Userland/Libraries/LibWeb/CSS/Parser/Tokenizer.cpp
index d7f2808720..c139e9fbf5 100644
--- a/Userland/Libraries/LibWeb/CSS/Parser/Tokenizer.cpp
+++ b/Userland/Libraries/LibWeb/CSS/Parser/Tokenizer.cpp
@@ -11,6 +11,7 @@
 #include <AK/Vector.h>
 #include <LibTextCodec/Decoder.h>
 #include <LibWeb/CSS/Parser/Tokenizer.h>
+#include <math.h>
 
 // U+FFFD REPLACEMENT CHARACTER (�)
 #define REPLACEMENT_CHARACTER 0xFFFD
@@ -469,7 +470,84 @@ CSSNumber Tokenizer::consume_a_number()
         }
     }
 
-    return { repr.to_string(), type };
+    return { repr.to_string(), convert_a_string_to_a_number(repr.string_view()), type };
+}
+
+// https://www.w3.org/TR/css-syntax-3/#convert-string-to-number
+double Tokenizer::convert_a_string_to_a_number(StringView string)
+{
+    auto code_point_at = [&](size_t index) -> u32 {
+        if (index < string.length())
+            return string[index];
+        return TOKENIZER_EOF;
+    };
+
+    // Computes the numeric value of a string that the caller has already validated as a CSS number.
+    // As in the spec, no validation happens here; leftover input is caught by the VERIFY at the end.
+
+    // Divide the string into seven components, in order from left to right:
+    size_t position = 0;
+
+    // 1. A sign: a single U+002B PLUS SIGN (+) or U+002D HYPHEN-MINUS (-), or the empty string.
+    //    Let s [sign] be the number -1 if the sign is U+002D HYPHEN-MINUS (-); otherwise, let s be the number 1.
+    int sign = 1;
+    if (is_plus_sign(code_point_at(position)) || is_hyphen_minus(code_point_at(position))) {
+        sign = is_hyphen_minus(code_point_at(position)) ? -1 : 1;
+        position++;
+    }
+
+    // 2. An integer part: zero or more digits.
+    //    If there is at least one digit, let i [integer_part] be the number formed by interpreting the digits
+    //    as a base-10 integer; otherwise, let i be the number 0.
+    double integer_part = 0;
+    while (is_ascii_digit(code_point_at(position))) {
+        integer_part = (integer_part * 10) + (code_point_at(position) - '0');
+        position++;
+    }
+
+    // 3. A decimal point: a single U+002E FULL STOP (.), or the empty string.
+    if (is_full_stop(code_point_at(position)))
+        position++;
+
+    // 4. A fractional part: zero or more digits.
+    //    If there is at least one digit, let f [fractional_part] be the number formed by interpreting the digits
+    //    as a base-10 integer and d [fractional_digits] be the number of digits; otherwise, let f and d be the number 0.
+    double fractional_part = 0;
+    int fractional_digits = 0;
+    while (is_ascii_digit(code_point_at(position))) {
+        fractional_part = (fractional_part * 10) + (code_point_at(position) - '0');
+        position++;
+        fractional_digits++;
+    }
+
+    // 5. An exponent indicator: a single U+0045 LATIN CAPITAL LETTER E (E) or U+0065 LATIN SMALL LETTER E (e),
+    //    or the empty string.
+    if (is_e(code_point_at(position)) || is_E(code_point_at(position)))
+        position++;
+
+    // 6. An exponent sign: a single U+002B PLUS SIGN (+) or U+002D HYPHEN-MINUS (-), or the empty string.
+    //    Let t [exponent_sign] be the number -1 if the sign is U+002D HYPHEN-MINUS (-); otherwise, let t be the number 1.
+    int exponent_sign = 1;
+    if (is_plus_sign(code_point_at(position)) || is_hyphen_minus(code_point_at(position))) {
+        exponent_sign = is_hyphen_minus(code_point_at(position)) ? -1 : 1;
+        position++;
+    }
+
+    // 7. An exponent: zero or more digits.
+    //    If there is at least one digit, let e [exponent] be the number formed by interpreting the digits as a
+    //    base-10 integer; otherwise, let e be the number 0.
+    double exponent = 0;
+    while (is_ascii_digit(code_point_at(position))) {
+        exponent = (exponent * 10) + (code_point_at(position) - '0');
+        position++;
+    }
+
+    // NOTE: The caller has checked that the string is a valid number, so anything left at the end is a bug!
+    VERIFY(position == string.length());
+
+    // Return the number s·(i + f·10^-d)·10^te. We compute f·10^-d as f / 10^d: 10^-d is not exactly
+    // representable as a double, so multiplying by it turns "0.3" into 0.30000000000000004.
+    return sign * (integer_part + fractional_part / pow(10, fractional_digits)) * pow(10, exponent_sign * exponent);
+}
 
 // https://www.w3.org/TR/css-syntax-3/#consume-name
@@ -601,8 +679,9 @@ Token Tokenizer::consume_a_numeric_token()
     auto number = consume_a_number();
     if (would_start_an_identifier()) {
         auto token = create_new_token(Token::Type::Dimension);
-        token.m_value.append(number.value);
+        token.m_value.append(number.string);
         token.m_number_type = number.type;
+        token.m_number_value = number.value;
 
         auto unit = consume_a_name();
         VERIFY(!unit.is_empty() && !unit.is_whitespace());
@@ -615,13 +694,16 @@ Token Tokenizer::consume_a_numeric_token()
         (void)next_code_point();
 
         auto token = create_new_token(Token::Type::Percentage);
-        token.m_value.append(number.value);
+        token.m_value.append(number.string);
+        token.m_number_type = number.type;
+        token.m_number_value = number.value;
         return token;
     }
 
     auto token = create_new_token(Token::Type::Number);
-    token.m_value.append(number.value);
+    token.m_value.append(number.string);
     token.m_number_type = number.type;
+    token.m_number_value = number.value;
     return token;
 }
 
diff --git a/Userland/Libraries/LibWeb/CSS/Parser/Tokenizer.h b/Userland/Libraries/LibWeb/CSS/Parser/Tokenizer.h
index 201ac94ae2..1e60df0a7c 100644
--- a/Userland/Libraries/LibWeb/CSS/Parser/Tokenizer.h
+++ b/Userland/Libraries/LibWeb/CSS/Parser/Tokenizer.h
@@ -59,7 +59,8 @@ public:
 
 class CSSNumber {
 public:
-    String value;
+    String string;
+    double value { 0 };
     Token::NumberType type {};
 };
 
@@ -86,6 +87,7 @@ private:
     [[nodiscard]] Token consume_a_numeric_token();
     [[nodiscard]] Token consume_an_ident_like_token();
     [[nodiscard]] CSSNumber consume_a_number();
+    [[nodiscard]] double convert_a_string_to_a_number(StringView);
     [[nodiscard]] String consume_a_name();
     [[nodiscard]] u32 consume_escaped_code_point();
     [[nodiscard]] Token consume_a_url_token();
author	Sam Atkins <atkinssj@serenityos.org>	2021-11-17 21:01:51 +0000
committer	Andreas Kling <kling@serenityos.org>	2021-11-19 22:35:05 +0100
commit	f6869797a7a56de4db4d9d8a06bfffe6ca520d81 (patch)
tree	6cb05453febda3b2d8e730bc614e0bbb16522042 /Userland
parent	d2ef8b29e8becdec7c0a44e1f9c1f5cb64059d5c (diff)
download	serenity-f6869797a7a56de4db4d9d8a06bfffe6ca520d81.zip