LibWeb: Propagate errors from CSS Tokenizer construction

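Instead of constructing a Tokenizer and then calling `parse()` on it, we now call the static `Tokenizer::tokenize(...)` directly. It returns an `ErrorOr<Vector<Token>>`, so errors raised while filtering the input are propagated to the caller instead of being released inside the constructor. (Renamed from `parse()` because this is a Tokenizer, not a Parser.)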
author: Sam Atkins <atkinssj@serenityos.org> 2023-03-06 14:19:39 +0000
committer: Andreas Kling <kling@serenityos.org> 2023-03-07 00:43:36 +0100
commit: 17618989a39c01656ac1e23c015c5803a9542314 (patch)
tree: 7f85e8ed3c05b226e877719cde709d72389829ef /Userland
parent: 98ee2fcd1beddbfd58fd1f17b3595a8e4d0ae414 (diff)
download: serenity-17618989a39c01656ac1e23c015c5803a9542314.zip
5 files changed, 50 insertions, 47 deletions
diff --git a/Userland/Libraries/LibWeb/CSS/Parser/Parser.cpp b/Userland/Libraries/LibWeb/CSS/Parser/Parser.cpp
index 1021994fe0..16258b3f9e 100644
--- a/Userland/Libraries/LibWeb/CSS/Parser/Parser.cpp
+++ b/Userland/Libraries/LibWeb/CSS/Parser/Parser.cpp
@@ -84,8 +84,7 @@ AK::URL ParsingContext::complete_url(StringView relative_url) const
 
 Parser::Parser(ParsingContext const& context, StringView input, StringView encoding)
     : m_context(context)
-    , m_tokenizer(input, encoding)
-    , m_tokens(m_tokenizer.parse())
+    , m_tokens(Tokenizer::tokenize(input, encoding).release_value_but_fixme_should_propagate_errors())
     , m_token_stream(TokenStream(m_tokens))
 {
 }
diff --git a/Userland/Libraries/LibWeb/CSS/Parser/Parser.h b/Userland/Libraries/LibWeb/CSS/Parser/Parser.h
index c518f8eda3..e4f5d71f88 100644
--- a/Userland/Libraries/LibWeb/CSS/Parser/Parser.h
+++ b/Userland/Libraries/LibWeb/CSS/Parser/Parser.h
@@ -360,7 +360,6 @@ private:
 
     ParsingContext m_context;
 
-    Tokenizer m_tokenizer;
     Vector<Token> m_tokens;
     TokenStream<Token> m_token_stream;
 };
diff --git a/Userland/Libraries/LibWeb/CSS/Parser/Tokenizer.cpp b/Userland/Libraries/LibWeb/CSS/Parser/Tokenizer.cpp
index f808bbaea7..d739e48166 100644
--- a/Userland/Libraries/LibWeb/CSS/Parser/Tokenizer.cpp
+++ b/Userland/Libraries/LibWeb/CSS/Parser/Tokenizer.cpp
@@ -195,7 +195,7 @@ static inline bool is_E(u32 code_point)
     return code_point == 0x45;
 }
 
-Tokenizer::Tokenizer(StringView input, StringView encoding)
+ErrorOr<Vector<Token>> Tokenizer::tokenize(StringView input, StringView encoding)
 {
     // https://www.w3.org/TR/css-syntax-3/#css-filter-code-points
     auto filter_code_points = [](StringView input, auto encoding) -> ErrorOr<String> {
@@ -206,48 +206,53 @@ Tokenizer::Tokenizer(StringView input, StringView encoding)
         bool last_was_carriage_return = false;
 
         // To filter code points from a stream of (unfiltered) code points input:
-        decoder->process(input, [&builder, &last_was_carriage_return](u32 code_point) -> ErrorOr<void> {
-                   // Replace any U+000D CARRIAGE RETURN (CR) code points,
-                   // U+000C FORM FEED (FF) code points,
-                   // or pairs of U+000D CARRIAGE RETURN (CR) followed by U+000A LINE FEED (LF)
-                   // in input by a single U+000A LINE FEED (LF) code point.
-                   if (code_point == '\r') {
-                       if (last_was_carriage_return) {
-                           TRY(builder.try_append('\n'));
-                       } else {
-                           last_was_carriage_return = true;
-                       }
-                   } else {
-                       if (last_was_carriage_return)
-                           TRY(builder.try_append('\n'));
-
-                       if (code_point == '\n') {
-                           if (!last_was_carriage_return)
-                               TRY(builder.try_append('\n'));
-
-                       } else if (code_point == '\f') {
-                           TRY(builder.try_append('\n'));
-                           // Replace any U+0000 NULL or surrogate code points in input with U+FFFD REPLACEMENT CHARACTER (�).
-                       } else if (code_point == 0x00 || (code_point >= 0xD800 && code_point <= 0xDFFF)) {
-                           TRY(builder.try_append_code_point(REPLACEMENT_CHARACTER));
-                       } else {
-                           TRY(builder.try_append_code_point(code_point));
-                       }
-
-                       last_was_carriage_return = false;
-                   }
-                   return {};
-               })
-            .release_value_but_fixme_should_propagate_errors();
+        TRY(decoder->process(input, [&builder, &last_was_carriage_return](u32 code_point) -> ErrorOr<void> {
+            // Replace any U+000D CARRIAGE RETURN (CR) code points,
+            // U+000C FORM FEED (FF) code points,
+            // or pairs of U+000D CARRIAGE RETURN (CR) followed by U+000A LINE FEED (LF)
+            // in input by a single U+000A LINE FEED (LF) code point.
+            if (code_point == '\r') {
+                if (last_was_carriage_return) {
+                    TRY(builder.try_append('\n'));
+                } else {
+                    last_was_carriage_return = true;
+                }
+            } else {
+                if (last_was_carriage_return)
+                    TRY(builder.try_append('\n'));
+
+                if (code_point == '\n') {
+                    if (!last_was_carriage_return)
+                        TRY(builder.try_append('\n'));
+
+                } else if (code_point == '\f') {
+                    TRY(builder.try_append('\n'));
+                    // Replace any U+0000 NULL or surrogate code points in input with U+FFFD REPLACEMENT CHARACTER (�).
+                } else if (code_point == 0x00 || (code_point >= 0xD800 && code_point <= 0xDFFF)) {
+                    TRY(builder.try_append_code_point(REPLACEMENT_CHARACTER));
+                } else {
+                    TRY(builder.try_append_code_point(code_point));
+                }
+
+                last_was_carriage_return = false;
+            }
+            return {};
+        }));
         return builder.to_string();
     };
 
-    m_decoded_input = filter_code_points(input, encoding).release_value_but_fixme_should_propagate_errors();
-    m_utf8_view = Utf8View(m_decoded_input);
-    m_utf8_iterator = m_utf8_view.begin();
+    Tokenizer tokenizer { TRY(filter_code_points(input, encoding)) };
+    return tokenizer.tokenize();
+}
+
+Tokenizer::Tokenizer(String decoded_input)
+    : m_decoded_input(move(decoded_input))
+    , m_utf8_view(m_decoded_input)
+    , m_utf8_iterator(m_utf8_view.begin())
+{
 }
 
-Vector<Token> Tokenizer::parse()
+Vector<Token> Tokenizer::tokenize()
 {
     Vector<Token> tokens;
     for (;;) {
diff --git a/Userland/Libraries/LibWeb/CSS/Parser/Tokenizer.h b/Userland/Libraries/LibWeb/CSS/Parser/Tokenizer.h
index c7d846c8ba..a204c02178 100644
--- a/Userland/Libraries/LibWeb/CSS/Parser/Tokenizer.h
+++ b/Userland/Libraries/LibWeb/CSS/Parser/Tokenizer.h
@@ -58,15 +58,16 @@ public:
 };
 
 class Tokenizer {
-
 public:
-    explicit Tokenizer(StringView input, StringView encoding);
-
-    [[nodiscard]] Vector<Token> parse();
+    static ErrorOr<Vector<Token>> tokenize(StringView input, StringView encoding);
 
     [[nodiscard]] static Token create_eof_token();
 
 private:
+    explicit Tokenizer(String decoded_input);
+
+    [[nodiscard]] Vector<Token> tokenize();
+
     [[nodiscard]] u32 next_code_point();
     [[nodiscard]] u32 peek_code_point(size_t offset = 0) const;
     [[nodiscard]] U32Twin peek_twin() const;
diff --git a/Userland/Libraries/LibWeb/CSS/SyntaxHighlighter/SyntaxHighlighter.cpp b/Userland/Libraries/LibWeb/CSS/SyntaxHighlighter/SyntaxHighlighter.cpp
index 39f207f61f..fc7950edf3 100644
--- a/Userland/Libraries/LibWeb/CSS/SyntaxHighlighter/SyntaxHighlighter.cpp
+++ b/Userland/Libraries/LibWeb/CSS/SyntaxHighlighter/SyntaxHighlighter.cpp
@@ -45,8 +45,7 @@ void SyntaxHighlighter::rehighlight(Palette const& palette)
             false);
     };
 
-    CSS::Parser::Tokenizer tokenizer { text, "utf-8"sv };
-    auto tokens = tokenizer.parse();
+    auto tokens = CSS::Parser::Tokenizer::tokenize(text, "utf-8"sv).release_value_but_fixme_should_propagate_errors();
     for (auto const& token : tokens) {
         if (token.is(Parser::Token::Type::EndOfFile))
             break;
author	Sam Atkins <atkinssj@serenityos.org>	2023-03-06 14:19:39 +0000
committer	Andreas Kling <kling@serenityos.org>	2023-03-07 00:43:36 +0100
commit	17618989a39c01656ac1e23c015c5803a9542314 (patch)
tree	7f85e8ed3c05b226e877719cde709d72389829ef /Userland
parent	98ee2fcd1beddbfd58fd1f17b3595a8e4d0ae414 (diff)
download	serenity-17618989a39c01656ac1e23c015c5803a9542314.zip