LibWeb: Port CSS Tokenizer to new Strings

Specifically, this uses FlyString, because the data gets held long-term as a FlyString anyway.
author: Sam Atkins <atkinssj@serenityos.org> 2023-02-15 11:24:38 +0000
committer: Tim Flynn <trflynn89@pm.me> 2023-02-15 12:48:26 -0500
commit: 3685a8813a20cb6ef93375206d75cf55a81a0da9 (patch)
tree: e7423df211087c9affaedc8917887ae7ad4179e0 /Userland/Libraries/LibWeb
parent: a419039bb2e10dad80ae5b34cc2b3b0189f65a6c (diff)
download: serenity-3685a8813a20cb6ef93375206d75cf55a81a0da9.zip
2 files changed, 27 insertions, 27 deletions
diff --git a/Userland/Libraries/LibWeb/CSS/Parser/Tokenizer.cpp b/Userland/Libraries/LibWeb/CSS/Parser/Tokenizer.cpp
index c11732687f..baa16d709a 100644
--- a/Userland/Libraries/LibWeb/CSS/Parser/Tokenizer.cpp
+++ b/Userland/Libraries/LibWeb/CSS/Parser/Tokenizer.cpp
@@ -12,7 +12,7 @@
 #include <AK/Vector.h>
 #include <LibTextCodec/Decoder.h>
 #include <LibWeb/CSS/Parser/Tokenizer.h>
-#include <math.h>
+#include <LibWeb/Infra/Strings.h>
 
 namespace Web::CSS::Parser {
 
@@ -198,7 +198,7 @@ static inline bool is_E(u32 code_point)
 Tokenizer::Tokenizer(StringView input, StringView encoding)
 {
     // https://www.w3.org/TR/css-syntax-3/#css-filter-code-points
-    auto filter_code_points = [](StringView input, auto encoding) -> DeprecatedString {
+    auto filter_code_points = [](StringView input, auto encoding) -> ErrorOr<String> {
         auto* decoder = TextCodec::decoder_for(encoding);
         VERIFY(decoder);
 
@@ -237,10 +237,10 @@ Tokenizer::Tokenizer(StringView input, StringView encoding)
                 last_was_carriage_return = false;
             }
         });
-        return builder.to_deprecated_string();
+        return builder.to_string();
     };
 
-    m_decoded_input = filter_code_points(input, encoding);
+    m_decoded_input = filter_code_points(input, encoding).release_value_but_fixme_should_propagate_errors();
     m_utf8_view = Utf8View(m_decoded_input);
     m_utf8_iterator = m_utf8_view.begin();
 }
@@ -352,11 +352,11 @@ Token Tokenizer::create_eof_token()
     return create_new_token(Token::Type::EndOfFile);
 }
 
-Token Tokenizer::create_value_token(Token::Type type, DeprecatedString value)
+Token Tokenizer::create_value_token(Token::Type type, FlyString&& value)
 {
     Token token;
     token.m_type = type;
-    token.m_value = FlyString::from_utf8(value.view()).release_value_but_fixme_should_propagate_errors();
+    token.m_value = move(value);
     return token;
 }
 
@@ -364,7 +364,7 @@ Token Tokenizer::create_value_token(Token::Type type, u32 value)
 {
     Token token = {};
     token.m_type = type;
-    token.m_value = FlyString(String::from_code_point(value));
+    token.m_value = String::from_code_point(value);
     return token;
 }
 
@@ -397,7 +397,7 @@ u32 Tokenizer::consume_escaped_code_point()
         }
 
         // Interpret the hex digits as a hexadecimal number.
-        auto unhexed = strtoul(builder.to_deprecated_string().characters(), nullptr, 16);
+        auto unhexed = AK::StringUtils::convert_to_uint_from_hex<u32>(builder.string_view()).value_or(0);
         // If this number is zero, or is for a surrogate, or is greater than the maximum allowed
         // code point, return U+FFFD REPLACEMENT CHARACTER (�).
         if (unhexed == 0 || is_unicode_surrogate(unhexed) || is_greater_than_maximum_allowed_code_point(unhexed)) {
@@ -427,11 +427,11 @@ Token Tokenizer::consume_an_ident_like_token()
     // It returns an <ident-token>, <function-token>, <url-token>, or <bad-url-token>.
 
     // Consume an ident sequence, and let string be the result.
-    auto string = consume_an_ident_sequence();
+    auto string = consume_an_ident_sequence().release_value_but_fixme_should_propagate_errors();
 
     // If string’s value is an ASCII case-insensitive match for "url", and the next input code
     // point is U+0028 LEFT PARENTHESIS ((), consume it.
-    if (string.equals_ignoring_case("url"sv) && is_left_paren(peek_code_point())) {
+    if (Infra::is_ascii_case_insensitive_match(string, "url"sv) && is_left_paren(peek_code_point())) {
         (void)next_code_point();
 
         // While the next two input code points are whitespace, consume the next input code point.
@@ -449,7 +449,7 @@ Token Tokenizer::consume_an_ident_like_token()
         // <function-token> with its value set to string and return it.
         auto next_two = peek_twin();
         if (is_quotation_mark(next_two.first) || is_apostrophe(next_two.first) || (is_whitespace(next_two.first) && (is_quotation_mark(next_two.second) || is_apostrophe(next_two.second)))) {
-            return create_value_token(Token::Type::Function, string);
+            return create_value_token(Token::Type::Function, move(string));
         }
 
         // Otherwise, consume a url token, and return it.
@@ -461,11 +461,11 @@ Token Tokenizer::consume_an_ident_like_token()
         (void)next_code_point();
 
         // Create a <function-token> with its value set to string and return it.
-        return create_value_token(Token::Type::Function, string);
+        return create_value_token(Token::Type::Function, move(string));
     }
 
     // Otherwise, create an <ident-token> with its value set to string and return it.
-    return create_value_token(Token::Type::Ident, string);
+    return create_value_token(Token::Type::Ident, move(string));
 }
 
 // https://www.w3.org/TR/css-syntax-3/#consume-number
@@ -573,7 +573,7 @@ float Tokenizer::convert_a_string_to_a_number(StringView string)
 }
 
 // https://www.w3.org/TR/css-syntax-3/#consume-name
-DeprecatedString Tokenizer::consume_an_ident_sequence()
+ErrorOr<FlyString> Tokenizer::consume_an_ident_sequence()
 {
     // This section describes how to consume an ident sequence from a stream of code points.
     // It returns a string containing the largest name that can be formed from adjacent
@@ -597,14 +597,14 @@ DeprecatedString Tokenizer::consume_an_ident_sequence()
         // name code point
         if (is_ident_code_point(input)) {
             // Append the code point to result.
-            result.append_code_point(input);
+            TRY(result.try_append_code_point(input));
             continue;
         }
 
         // the stream starts with a valid escape
         if (is_valid_escape_sequence(start_of_input_stream_twin())) {
             // Consume an escaped code point. Append the returned code point to result.
-            result.append_code_point(consume_escaped_code_point());
+            TRY(result.try_append_code_point(consume_escaped_code_point()));
             continue;
         }
 
@@ -614,7 +614,7 @@ DeprecatedString Tokenizer::consume_an_ident_sequence()
         break;
     }
 
-    return result.to_deprecated_string();
+    return result.to_fly_string();
 }
 
 // https://www.w3.org/TR/css-syntax-3/#consume-url-token
@@ -778,10 +778,10 @@ Token Tokenizer::consume_a_numeric_token()
         token.m_number_value = number;
 
         // 2. Consume an ident sequence. Set the <dimension-token>’s unit to the returned value.
-        auto unit = consume_an_ident_sequence();
+        auto unit = consume_an_ident_sequence().release_value_but_fixme_should_propagate_errors();
         VERIFY(!unit.is_empty());
         // NOTE: We intentionally store this in the `value`, to save space.
-        token.m_value = FlyString::from_utf8(unit.view()).release_value_but_fixme_should_propagate_errors();
+        token.m_value = move(unit);
 
         // 3. Return the <dimension-token>.
         return token;
@@ -1065,8 +1065,8 @@ Token Tokenizer::consume_a_token()
                 token.m_hash_type = Token::HashType::Id;
 
             // 3. Consume an ident sequence, and set the <hash-token>’s value to the returned string.
-            auto name = consume_an_ident_sequence();
-            token.m_value = FlyString::from_utf8(name.view()).release_value_but_fixme_should_propagate_errors();
+            auto name = consume_an_ident_sequence().release_value_but_fixme_should_propagate_errors();
+            token.m_value = move(name);
 
             // 4. Return the <hash-token>.
             return token;
@@ -1201,8 +1201,8 @@ Token Tokenizer::consume_a_token()
         // If the next 3 input code points would start an ident sequence, consume an ident sequence, create
         // an <at-keyword-token> with its value set to the returned value, and return it.
         if (would_start_an_ident_sequence(peek_triplet())) {
-            auto name = consume_an_ident_sequence();
-            return create_value_token(Token::Type::AtKeyword, name);
+            auto name = consume_an_ident_sequence().release_value_but_fixme_should_propagate_errors();
+            return create_value_token(Token::Type::AtKeyword, move(name));
         }
 
         // Otherwise, return a <delim-token> with its value set to the current input code point.
diff --git a/Userland/Libraries/LibWeb/CSS/Parser/Tokenizer.h b/Userland/Libraries/LibWeb/CSS/Parser/Tokenizer.h
index 83b24d11ea..c7d846c8ba 100644
--- a/Userland/Libraries/LibWeb/CSS/Parser/Tokenizer.h
+++ b/Userland/Libraries/LibWeb/CSS/Parser/Tokenizer.h
@@ -1,12 +1,12 @@
 /*
  * Copyright (c) 2020-2021, the SerenityOS developers.
+ * Copyright (c) 2021-2023, Sam Atkins <atkinssj@serenityos.org>
  *
  * SPDX-License-Identifier: BSD-2-Clause
  */
 
 #pragma once
 
-#include <AK/DeprecatedString.h>
 #include <AK/Optional.h>
 #include <AK/StringView.h>
 #include <AK/Types.h>
@@ -76,7 +76,7 @@ private:
     [[nodiscard]] U32Triplet start_of_input_stream_triplet();
 
     [[nodiscard]] static Token create_new_token(Token::Type);
-    [[nodiscard]] static Token create_value_token(Token::Type, DeprecatedString value);
+    [[nodiscard]] static Token create_value_token(Token::Type, FlyString&& value);
     [[nodiscard]] static Token create_value_token(Token::Type, u32 value);
     [[nodiscard]] Token consume_a_token();
     [[nodiscard]] Token consume_string_token(u32 ending_code_point);
@@ -84,7 +84,7 @@ private:
     [[nodiscard]] Token consume_an_ident_like_token();
     [[nodiscard]] Number consume_a_number();
     [[nodiscard]] float convert_a_string_to_a_number(StringView);
-    [[nodiscard]] DeprecatedString consume_an_ident_sequence();
+    [[nodiscard]] ErrorOr<FlyString> consume_an_ident_sequence();
     [[nodiscard]] u32 consume_escaped_code_point();
     [[nodiscard]] Token consume_a_url_token();
     void consume_the_remnants_of_a_bad_url();
@@ -95,7 +95,7 @@ private:
     [[nodiscard]] static bool would_start_an_ident_sequence(U32Triplet);
     [[nodiscard]] static bool would_start_a_number(U32Triplet);
 
-    DeprecatedString m_decoded_input;
+    String m_decoded_input;
     Utf8View m_utf8_view;
     AK::Utf8CodePointIterator m_utf8_iterator;
     AK::Utf8CodePointIterator m_prev_utf8_iterator;
author	Sam Atkins <atkinssj@serenityos.org>	2023-02-15 11:24:38 +0000
committer	Tim Flynn <trflynn89@pm.me>	2023-02-15 12:48:26 -0500
commit	3685a8813a20cb6ef93375206d75cf55a81a0da9 (patch)
tree	e7423df211087c9affaedc8917887ae7ad4179e0 /Userland/Libraries/LibWeb
parent	a419039bb2e10dad80ae5b34cc2b3b0189f65a6c (diff)
download	serenity-3685a8813a20cb6ef93375206d75cf55a81a0da9.zip