LibWeb: Implement spec-compliant MIME type struct and parser

This will be used by XHR to extract the Content-Type MIME type to retrieve the charset.
author: Luke Wilde <lukew@serenityos.org> 2022-02-11 20:53:47 +0000
committer: Andreas Kling <kling@serenityos.org> 2022-02-12 12:53:28 +0100
commit: 8cfeca526172a40f8dd6026ff8cfec129e930735 (patch)
tree: ad2b86b4e5e33a2b1b8a4a25ff9209035aced38d /Userland/Libraries/LibWeb
parent: 2903c47ba068a5a22735411ec445efbfda53bbe1 (diff)
download: serenity-8cfeca526172a40f8dd6026ff8cfec129e930735.zip
6 files changed, 342 insertions, 0 deletions
diff --git a/Userland/Libraries/LibWeb/CMakeLists.txt b/Userland/Libraries/LibWeb/CMakeLists.txt
index 4fa30b1b47..06808c9c79 100644
--- a/Userland/Libraries/LibWeb/CMakeLists.txt
+++ b/Userland/Libraries/LibWeb/CMakeLists.txt
@@ -93,6 +93,7 @@ set(SOURCES
     DOMTreeModel.cpp
     Dump.cpp
     Encoding/TextEncoder.cpp
+    Fetch/AbstractOperations.cpp
     FontCache.cpp
     HTML/AttributeNames.cpp
     HTML/BrowsingContext.cpp
@@ -252,6 +253,7 @@ set(SOURCES
     Loader/LoadRequest.cpp
     Loader/Resource.cpp
     Loader/ResourceLoader.cpp
+    MimeSniff/MimeType.cpp
     Namespace.cpp
     NavigationTiming/PerformanceTiming.cpp
     OutOfProcessWebView.cpp
diff --git a/Userland/Libraries/LibWeb/Fetch/AbstractOperations.cpp b/Userland/Libraries/LibWeb/Fetch/AbstractOperations.cpp
new file mode 100644
index 0000000000..7a1cdb1b35
--- /dev/null
+++ b/Userland/Libraries/LibWeb/Fetch/AbstractOperations.cpp
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2022, Luke Wilde <lukew@serenityos.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include <AK/GenericLexer.h>
+#include <AK/String.h>
+#include <AK/StringBuilder.h>
+#include <LibWeb/Fetch/AbstractOperations.h>
+
+namespace Web::Fetch {
+
+// https://fetch.spec.whatwg.org/#collect-an-http-quoted-string
+String collect_an_http_quoted_string(GenericLexer& lexer, HttpQuotedStringExtractValue extract_value)
+{
+    // Collects the quoted string starting at the lexer's position (which must be a '"'). Returns the unescaped contents when extract_value is Yes, otherwise the raw span that was scanned. Spec steps:
+    // 1. Let positionStart be position.
+    auto position_start = lexer.tell();
+
+    // 2. Let value be the empty string.
+    StringBuilder value;
+
+    // 3. Assert: the code point at position within input is U+0022 (").
+    VERIFY(lexer.peek() == '"');
+
+    // 4. Advance position by 1.
+    lexer.ignore(1);
+
+    // 5. While true:
+    while (true) {
+        // 1. Append the result of collecting a sequence of code points that are not U+0022 (") or U+005C (\) from input, given position, to value.
+        auto value_part = lexer.consume_until([](char ch) {
+            return ch == '"' || ch == '\\';
+        });
+
+        value.append(value_part);
+
+        // 2. If position is past the end of input, then break.
+        if (lexer.is_eof())
+            break;
+
+        // 3. Let quoteOrBackslash be the code point at position within input.
+        // 4. Advance position by 1.
+        char quote_or_backslash = lexer.consume();
+
+        // 5. If quoteOrBackslash is U+005C (\), then:
+        if (quote_or_backslash == '\\') {
+            // 1. If position is past the end of input, then append U+005C (\) to value and break.
+            if (lexer.is_eof()) {
+                value.append('\\');
+                break;
+            }
+
+            // 2. Append the code point at position within input to value.
+            // 3. Advance position by 1.
+            value.append(lexer.consume());
+        }
+
+        // 6. Otherwise:
+        else {
+            // 1. Assert: quoteOrBackslash is U+0022 (").
+            VERIFY(quote_or_backslash == '"');
+
+            // 2. Break.
+            break;
+        }
+    }
+
+    // 6. If the extract-value flag is set, then return value.
+    if (extract_value == HttpQuotedStringExtractValue::Yes)
+        return value.to_string();
+
+    // 7. Return the code points from positionStart to position, inclusive, within input.
+    // NOTE: The lexer already sits one past the last code point collected, so the span is exactly (position - positionStart) long.
+    auto number_of_characters_to_consume = lexer.tell() - position_start;
+    lexer.retreat(number_of_characters_to_consume);
+    return lexer.consume(number_of_characters_to_consume);
+}
+
+}
diff --git a/Userland/Libraries/LibWeb/Fetch/AbstractOperations.h b/Userland/Libraries/LibWeb/Fetch/AbstractOperations.h
new file mode 100644
index 0000000000..7ab93cd993
--- /dev/null
+++ b/Userland/Libraries/LibWeb/Fetch/AbstractOperations.h
@@ -0,0 +1,20 @@
+/*
+ * Copyright (c) 2022, Luke Wilde <lukew@serenityos.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#pragma once
+
+#include <AK/Forward.h>
+
+namespace Web::Fetch {
+
+enum class HttpQuotedStringExtractValue {
+    No, // Return the span of input that was scanned (step 7 of the spec algorithm).
+    Yes, // Return only the collected value, i.e. the contents with backslash escapes resolved (step 6).
+};
+
+String collect_an_http_quoted_string(GenericLexer& lexer, HttpQuotedStringExtractValue extract_value); // https://fetch.spec.whatwg.org/#collect-an-http-quoted-string
+
+}
diff --git a/Userland/Libraries/LibWeb/Forward.h b/Userland/Libraries/LibWeb/Forward.h
index f2d019fd31..7b9451cbd0 100644
--- a/Userland/Libraries/LibWeb/Forward.h
+++ b/Userland/Libraries/LibWeb/Forward.h
@@ -231,6 +231,10 @@ namespace Web::IntersectionObserver {
 class IntersectionObserver;
 }
 
+namespace Web::MimeSniff {
+class MimeType;
+}
+
 namespace Web::NavigationTiming {
 class PerformanceTiming;
 }
diff --git a/Userland/Libraries/LibWeb/MimeSniff/MimeType.cpp b/Userland/Libraries/LibWeb/MimeSniff/MimeType.cpp
new file mode 100644
index 0000000000..c3c36904dd
--- /dev/null
+++ b/Userland/Libraries/LibWeb/MimeSniff/MimeType.cpp
@@ -0,0 +1,191 @@
+/*
+ * Copyright (c) 2022, Luke Wilde <lukew@serenityos.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include <AK/CharacterTypes.h>
+#include <AK/GenericLexer.h>
+#include <AK/StringBuilder.h>
+#include <LibWeb/Fetch/AbstractOperations.h>
+#include <LibWeb/MimeSniff/MimeType.h>
+
+namespace Web::MimeSniff {
+
+static bool contains_only_http_quoted_string_token_code_points(StringView string)
+{
+    // https://mimesniff.spec.whatwg.org/#http-quoted-string-token-code-point
+    // An HTTP quoted-string token code point is U+0009 TAB, a code point in the range U+0020 SPACE to U+007E (~), inclusive,
+    // or a code point in the range U+0080 through U+00FF (ÿ), inclusive.
+    for (char ch : string) {
+        // NOTE: Every byte value is at most 0xFF, so the only rejects are the controls below U+0020 other than TAB, and U+007F.
+        if (auto byte = static_cast<u8>(ch); byte != '\t' && (byte < 0x20 || byte == 0x7F))
+            return false;
+    }
+    return true;
+}
+
+MimeType::MimeType(String type, String subtype)
+    : m_type(move(type))
+    , m_subtype(move(subtype))
+{
+    // NOTE: The spec (https://mimesniff.spec.whatwg.org/#type) describes type and subtype as non-empty ASCII strings. What is enforced
+    //       here is the HTTP quoted-string token code point check; the parameters were moved from, so the members are inspected.
+    VERIFY(contains_only_http_quoted_string_token_code_points(m_type));
+    VERIFY(contains_only_http_quoted_string_token_code_points(m_subtype));
+}
+
+// Out-of-line destructor. It has no work of its own; the members are torn down by their own destructors.
+// NOTE: Defined here rather than in the header, matching the declaration `~MimeType();` in MimeType.h.
+MimeType::~MimeType() = default;
+
+static bool contains_only_http_token_code_points(StringView string)
+{
+    // https://mimesniff.spec.whatwg.org/#http-token-code-point
+    // An HTTP token code point is U+0021 (!), U+0023 (#), U+0024 ($), U+0025 (%), U+0026 (&), U+0027 ('), U+002A (*),
+    // U+002B (+), U+002D (-), U+002E (.), U+005E (^), U+005F (_), U+0060 (`), U+007C (|), U+007E (~), or an ASCII alphanumeric.
+    constexpr auto is_token_punctuation = is_any_of("!#$%&'*+-.^_`|~");
+    for (char code_point : string) {
+        if (!(is_ascii_alphanumeric(code_point) || is_token_punctuation(code_point)))
+            return false;
+    }
+    return true;
+}
+
+// https://mimesniff.spec.whatwg.org/#parse-a-mime-type - returns an empty Optional on failure (spec steps 4, 5 and 9).
+Optional<MimeType> MimeType::from_string(StringView string)
+{
+    // https://fetch.spec.whatwg.org/#http-whitespace
+    // HTTP whitespace is U+000A LF, U+000D CR, or an HTTP tab or space.
+    // An HTTP tab or space is U+0009 TAB or U+0020 SPACE.
+    constexpr const char* http_whitespace = "\n\r\t ";
+
+    // 1. Remove any leading and trailing HTTP whitespace from input.
+    auto trimmed_string = string.trim(http_whitespace, TrimMode::Both);
+
+    // 2. Let position be a position variable for input, initially pointing at the start of input.
+    GenericLexer lexer(trimmed_string);
+
+    // 3. Let type be the result of collecting a sequence of code points that are not U+002F (/) from input, given position.
+    auto type = lexer.consume_until([](char ch) { return ch == '/'; });
+
+    // 4. If type is the empty string or does not solely contain HTTP token code points, then return failure.
+    if (type.is_empty() || !contains_only_http_token_code_points(type))
+        return {};
+
+    // 5. If position is past the end of input, then return failure.
+    if (lexer.is_eof())
+        return {};
+
+    // 6. Advance position by 1. (This skips past U+002F (/).)
+    lexer.ignore(1);
+
+    // 7. Let subtype be the result of collecting a sequence of code points that are not U+003B (;) from input, given position.
+    auto subtype = lexer.consume_until([](char ch) { return ch == ';'; });
+
+    // 8. Remove any trailing HTTP whitespace from subtype.
+    subtype = subtype.trim(http_whitespace, TrimMode::Right);
+
+    // 9. If subtype is the empty string or does not solely contain HTTP token code points, then return failure.
+    if (subtype.is_empty() || !contains_only_http_token_code_points(subtype))
+        return {};
+
+    // 10. Let mimeType be a new MIME type record whose type is type, in ASCII lowercase, and subtype is subtype, in ASCII lowercase.
+    auto mime_type = MimeType(type.to_lowercase_string(), subtype.to_lowercase_string());
+
+    // 11. While position is not past the end of input:
+    while (!lexer.is_eof()) {
+        // 1. Advance position by 1. (This skips past U+003B (;).)
+        lexer.ignore(1);
+
+        // 2. Collect a sequence of code points that are HTTP whitespace from input given position.
+        lexer.ignore_while(is_any_of(http_whitespace));
+
+        // 3. Let parameterName be the result of collecting a sequence of code points that are not U+003B (;) or U+003D (=) from input, given position.
+        auto parameter_name = lexer.consume_until([](char ch) {
+            return ch == ';' || ch == '=';
+        });
+
+        // 4. Set parameterName to parameterName, in ASCII lowercase.
+        // NOTE: Reassigning to parameter_name here causes a UAF when trying to use parameter_name down the road.
+        auto lowercase_parameter_name = parameter_name.to_lowercase_string();
+
+        // 5. If position is not past the end of input, then:
+        if (!lexer.is_eof()) {
+            // 1. If the code point at position within input is U+003B (;), then continue.
+            if (lexer.peek() == ';')
+                continue;
+
+            // 2. Advance position by 1. (This skips past U+003D (=).)
+            lexer.ignore(1);
+        }
+
+        // 6. If position is past the end of input, then break.
+        // NOTE: This is not an `else` because the ignore on step 11.5.2 could put us past the end of the input.
+        if (lexer.is_eof())
+            break;
+
+        // 7. Let parameterValue be null.
+        String parameter_value;
+
+        // 8. If the code point at position within input is U+0022 ("), then:
+        if (lexer.peek() == '"') {
+            // 1. Set parameterValue to the result of collecting an HTTP quoted string from input, given position and the extract-value flag.
+            parameter_value = collect_an_http_quoted_string(lexer, Fetch::HttpQuotedStringExtractValue::Yes);
+
+            // 2. Collect a sequence of code points that are not U+003B (;) from input, given position.
+            // NOTE: Like every collect step in this function, this uses the predicate overload, which leaves the stop character unconsumed; the ignore_until(char) version would also ignore the ';'.
+            lexer.ignore_until([](char ch) {
+                return ch == ';';
+            });
+        }
+
+        // 9. Otherwise:
+        else {
+            // 1. Set parameterValue to the result of collecting a sequence of code points that are not U+003B (;) from input, given position.
+            parameter_value = lexer.consume_until([](char ch) { return ch == ';'; });
+
+            // 2. Remove any trailing HTTP whitespace from parameterValue.
+            parameter_value = parameter_value.trim(http_whitespace, TrimMode::Right);
+
+            // 3. If parameterValue is the empty string, then continue.
+            if (parameter_value.is_empty())
+                continue;
+        }
+
+        // 10. If all of the following are true
+        //       - parameterName is not the empty string
+        //       - parameterName solely contains HTTP token code points
+        //       - parameterValue solely contains HTTP quoted-string token code points
+        //       - mimeType’s parameters[parameterName] does not exist
+        //     then set mimeType’s parameters[parameterName] to parameterValue.
+        if (!parameter_name.is_empty()
+            && contains_only_http_token_code_points(lowercase_parameter_name)
+            && contains_only_http_quoted_string_token_code_points(parameter_value)
+            && !mime_type.m_parameters.contains(lowercase_parameter_name)) {
+            mime_type.m_parameters.set(lowercase_parameter_name, parameter_value);
+        }
+    }
+
+    // 12. Return mimeType.
+    return Optional<MimeType> { move(mime_type) };
+}
+
+// https://mimesniff.spec.whatwg.org/#mime-type-essence
+String MimeType::essence() const
+{
+    // The essence of a MIME type mimeType is mimeType’s type, followed by U+002F (/), followed by mimeType’s subtype.
+    // FIXME: A new string is formatted on every call. This could be cached, as no member function shown here reassigns the type or subtype.
+    return String::formatted("{}/{}", m_type, m_subtype);
+}
+
+void MimeType::set_parameter(String const& name, String const& value)
+{
+    // https://mimesniff.spec.whatwg.org/#parameters
+    // A MIME type’s parameters is an ordered map whose keys are ASCII strings and values are strings limited to HTTP quoted-string token code points.
+    VERIFY(contains_only_http_quoted_string_token_code_points(name)); // NOTE(review): this is the value check applied to the key; it does not enforce an ASCII-only name.
+    VERIFY(contains_only_http_quoted_string_token_code_points(value));
+    m_parameters.set(name, value); // Unlike from_string(), no check for an existing entry is made before setting.
+}
+
+}
diff --git a/Userland/Libraries/LibWeb/MimeSniff/MimeType.h b/Userland/Libraries/LibWeb/MimeSniff/MimeType.h
new file mode 100644
index 0000000000..e90bc237ce
--- /dev/null
+++ b/Userland/Libraries/LibWeb/MimeSniff/MimeType.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2022, Luke Wilde <lukew@serenityos.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#pragma once
+
+#include <AK/HashMap.h>
+#include <AK/String.h>
+
+namespace Web::MimeSniff {
+
+// https://mimesniff.spec.whatwg.org/#mime-type - holds a type, a subtype and an ordered map of parameters.
+class MimeType {
+public:
+    static Optional<MimeType> from_string(StringView); // Returns an empty Optional when the input cannot be parsed.
+
+    MimeType(String type, String subtype);
+    ~MimeType();
+
+    String const& type() const { return m_type; }
+    String const& subtype() const { return m_subtype; }
+    OrderedHashMap<String, String> const& parameters() const { return m_parameters; }
+
+    void set_parameter(String const& name, String const& value);
+
+    String essence() const;
+
+private:
+    // https://mimesniff.spec.whatwg.org/#type
+    // A MIME type’s type is a non-empty ASCII string.
+    String m_type;
+
+    // https://mimesniff.spec.whatwg.org/#subtype
+    // A MIME type’s subtype is a non-empty ASCII string.
+    String m_subtype;
+
+    // https://mimesniff.spec.whatwg.org/#parameters
+    // A MIME type’s parameters is an ordered map whose keys are ASCII strings and values are strings limited to HTTP quoted-string token code points. It is initially empty.
+    OrderedHashMap<String, String> m_parameters;
+};
+
+}
author	Luke Wilde <lukew@serenityos.org>	2022-02-11 20:53:47 +0000
committer	Andreas Kling <kling@serenityos.org>	2022-02-12 12:53:28 +0100
commit	8cfeca526172a40f8dd6026ff8cfec129e930735 (patch)
tree	ad2b86b4e5e33a2b1b8a4a25ff9209035aced38d /Userland/Libraries/LibWeb
parent	2903c47ba068a5a22735411ec445efbfda53bbe1 (diff)
download	serenity-8cfeca526172a40f8dd6026ff8cfec129e930735.zip