diff options
author | Luke Wilde <lukew@serenityos.org> | 2022-02-11 20:53:47 +0000 |
---|---|---|
committer | Andreas Kling <kling@serenityos.org> | 2022-02-12 12:53:28 +0100 |
commit | 8cfeca526172a40f8dd6026ff8cfec129e930735 (patch) | |
tree | ad2b86b4e5e33a2b1b8a4a25ff9209035aced38d /Userland/Libraries/LibWeb | |
parent | 2903c47ba068a5a22735411ec445efbfda53bbe1 (diff) | |
download | serenity-8cfeca526172a40f8dd6026ff8cfec129e930735.zip |
LibWeb: Implement spec-compliant MIME type struct and parser
This will be used by XHR to extract the Content-Type MIME type to
retrieve the charset.
Diffstat (limited to 'Userland/Libraries/LibWeb')
-rw-r--r-- | Userland/Libraries/LibWeb/CMakeLists.txt | 2 | ||||
-rw-r--r-- | Userland/Libraries/LibWeb/Fetch/AbstractOperations.cpp | 81 | ||||
-rw-r--r-- | Userland/Libraries/LibWeb/Fetch/AbstractOperations.h | 20 | ||||
-rw-r--r-- | Userland/Libraries/LibWeb/Forward.h | 4 | ||||
-rw-r--r-- | Userland/Libraries/LibWeb/MimeSniff/MimeType.cpp | 191 | ||||
-rw-r--r-- | Userland/Libraries/LibWeb/MimeSniff/MimeType.h | 44 |
6 files changed, 342 insertions, 0 deletions
diff --git a/Userland/Libraries/LibWeb/CMakeLists.txt b/Userland/Libraries/LibWeb/CMakeLists.txt
index 4fa30b1b47..06808c9c79 100644
--- a/Userland/Libraries/LibWeb/CMakeLists.txt
+++ b/Userland/Libraries/LibWeb/CMakeLists.txt
@@ -93,6 +93,7 @@ set(SOURCES
     DOMTreeModel.cpp
     Dump.cpp
     Encoding/TextEncoder.cpp
+    Fetch/AbstractOperations.cpp
     FontCache.cpp
     HTML/AttributeNames.cpp
     HTML/BrowsingContext.cpp
@@ -252,6 +253,7 @@ set(SOURCES
     Loader/LoadRequest.cpp
     Loader/Resource.cpp
     Loader/ResourceLoader.cpp
+    MimeSniff/MimeType.cpp
     Namespace.cpp
     NavigationTiming/PerformanceTiming.cpp
     OutOfProcessWebView.cpp
diff --git a/Userland/Libraries/LibWeb/Fetch/AbstractOperations.cpp b/Userland/Libraries/LibWeb/Fetch/AbstractOperations.cpp
new file mode 100644
index 0000000000..7a1cdb1b35
--- /dev/null
+++ b/Userland/Libraries/LibWeb/Fetch/AbstractOperations.cpp
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2022, Luke Wilde <lukew@serenityos.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include <AK/GenericLexer.h>
+#include <AK/String.h>
+#include <AK/StringBuilder.h>
+#include <LibWeb/Fetch/AbstractOperations.h>
+
+namespace Web::Fetch {
+
+// https://fetch.spec.whatwg.org/#collect-an-http-quoted-string
+String collect_an_http_quoted_string(GenericLexer& lexer, HttpQuotedStringExtractValue extract_value)
+{
+    // To collect an HTTP quoted string from a string input, given a position variable position and optionally an extract-value flag, run these steps:
+    // 1. Let positionStart be position.
+    auto position_start = lexer.tell();
+
+    // 2. Let value be the empty string.
+    StringBuilder value;
+
+    // 3. Assert: the code point at position within input is U+0022 (").
+    VERIFY(lexer.peek() == '"');
+
+    // 4. Advance position by 1.
+    lexer.ignore(1);
+
+    // 5. While true:
+    while (true) {
+        // 1. Append the result of collecting a sequence of code points that are not U+0022 (") or U+005C (\) from input, given position, to value.
+        auto value_part = lexer.consume_until([](char ch) {
+            return ch == '"' || ch == '\\';
+        });
+
+        value.append(value_part);
+
+        // 2. If position is past the end of input, then break.
+        if (lexer.is_eof())
+            break;
+
+        // 3. Let quoteOrBackslash be the code point at position within input.
+        // 4. Advance position by 1.
+        char quote_or_backslash = lexer.consume();
+
+        // 5. If quoteOrBackslash is U+005C (\), then:
+        if (quote_or_backslash == '\\') {
+            // 1. If position is past the end of input, then append U+005C (\) to value and break.
+            if (lexer.is_eof()) {
+                value.append('\\');
+                break;
+            }
+
+            // 2. Append the code point at position within input to value.
+            // 3. Advance position by 1.
+            value.append(lexer.consume());
+        }
+
+        // 6. Otherwise:
+        else {
+            // 1. Assert: quoteOrBackslash is U+0022 (").
+            VERIFY(quote_or_backslash == '"');
+
+            // 2. Break.
+            break;
+        }
+    }
+
+    // 6. If the extract-value flag is set, then return value.
+    if (extract_value == HttpQuotedStringExtractValue::Yes)
+        return value.to_string();
+
+    // 7. Return the code points from positionStart to position, inclusive, within input.
+    // NOTE: position already points one past the last collected code point, so the returned span is [positionStart, position).
+    auto number_of_characters_to_consume = lexer.tell() - position_start;
+    lexer.retreat(number_of_characters_to_consume);
+    return lexer.consume(number_of_characters_to_consume);
+}
+
+}
diff --git a/Userland/Libraries/LibWeb/Fetch/AbstractOperations.h b/Userland/Libraries/LibWeb/Fetch/AbstractOperations.h
new file mode 100644
index 0000000000..7ab93cd993
--- /dev/null
+++ b/Userland/Libraries/LibWeb/Fetch/AbstractOperations.h
@@ -0,0 +1,20 @@
+/*
+ * Copyright (c) 2022, Luke Wilde <lukew@serenityos.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#pragma once
+
+#include <AK/Forward.h>
+
+namespace Web::Fetch {
+
+enum class HttpQuotedStringExtractValue {
+    No,
+    Yes,
+};
+
+String collect_an_http_quoted_string(GenericLexer& lexer, HttpQuotedStringExtractValue extract_value);
+
+}
diff --git a/Userland/Libraries/LibWeb/Forward.h b/Userland/Libraries/LibWeb/Forward.h
index f2d019fd31..7b9451cbd0 100644
--- a/Userland/Libraries/LibWeb/Forward.h
+++ b/Userland/Libraries/LibWeb/Forward.h
@@ -231,6 +231,10 @@ namespace Web::IntersectionObserver {
 class IntersectionObserver;
 }
 
+namespace Web::MimeSniff {
+class MimeType;
+}
+
 namespace Web::NavigationTiming {
 class PerformanceTiming;
 }
diff --git a/Userland/Libraries/LibWeb/MimeSniff/MimeType.cpp b/Userland/Libraries/LibWeb/MimeSniff/MimeType.cpp
new file mode 100644
index 0000000000..c3c36904dd
--- /dev/null
+++ b/Userland/Libraries/LibWeb/MimeSniff/MimeType.cpp
@@ -0,0 +1,191 @@
+/*
+ * Copyright (c) 2022, Luke Wilde <lukew@serenityos.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include <AK/CharacterTypes.h>
+#include <AK/GenericLexer.h>
+#include <AK/StringBuilder.h>
+#include <LibWeb/Fetch/AbstractOperations.h>
+#include <LibWeb/MimeSniff/MimeType.h>
+
+namespace Web::MimeSniff {
+
+static bool contains_only_http_quoted_string_token_code_points(StringView string)
+{
+    // https://mimesniff.spec.whatwg.org/#http-quoted-string-token-code-point
+    // An HTTP quoted-string token code point is U+0009 TAB, a code point in the range U+0020 SPACE to U+007E (~), inclusive,
+    // or a code point in the range U+0080 through U+00FF (ÿ), inclusive.
+    for (char ch : string) {
+        // NOTE: This doesn't check for ch <= 0xFF, as ch is 8-bits and so that condition will always be true.
+        if (!(ch == '\t' || (ch >= 0x20 && ch <= 0x7E) || (u8)ch >= 0x80))
+            return false;
+    }
+    return true;
+}
+
+MimeType::MimeType(String type, String subtype)
+    : m_type(type)
+    , m_subtype(subtype)
+{
+    // https://mimesniff.spec.whatwg.org/#type and https://mimesniff.spec.whatwg.org/#subtype
+    // A MIME type’s type and subtype are non-empty ASCII strings. FIXME: These checks also accept empty strings and U+0080-U+00FF.
+    VERIFY(contains_only_http_quoted_string_token_code_points(type));
+    VERIFY(contains_only_http_quoted_string_token_code_points(subtype));
+}
+
+MimeType::~MimeType()
+{
+}
+
+static bool contains_only_http_token_code_points(StringView string)
+{
+    // https://mimesniff.spec.whatwg.org/#http-token-code-point
+    // An HTTP token code point is U+0021 (!), U+0023 (#), U+0024 ($), U+0025 (%), U+0026 (&), U+0027 ('), U+002A (*),
+    // U+002B (+), U+002D (-), U+002E (.), U+005E (^), U+005F (_), U+0060 (`), U+007C (|), U+007E (~), or an ASCII alphanumeric.
+    constexpr auto is_certain_non_ascii_alphanumeric = is_any_of("!#$%&'*+-.^_`|~");
+    for (char ch : string) {
+        if (!is_certain_non_ascii_alphanumeric(ch) && !is_ascii_alphanumeric(ch))
+            return false;
+    }
+    return true;
+}
+
+// https://mimesniff.spec.whatwg.org/#parse-a-mime-type
+Optional<MimeType> MimeType::from_string(StringView string)
+{
+    // https://fetch.spec.whatwg.org/#http-whitespace
+    // HTTP whitespace is U+000A LF, U+000D CR, or an HTTP tab or space.
+    // An HTTP tab or space is U+0009 TAB or U+0020 SPACE.
+    constexpr const char* http_whitespace = "\n\r\t ";
+
+    // 1. Remove any leading and trailing HTTP whitespace from input.
+    auto trimmed_string = string.trim(http_whitespace, TrimMode::Both);
+
+    // 2. Let position be a position variable for input, initially pointing at the start of input.
+    GenericLexer lexer(trimmed_string);
+
+    // 3. Let type be the result of collecting a sequence of code points that are not U+002F (/) from input, given position.
+    auto type = lexer.consume_until('/');
+
+    // 4. If type is the empty string or does not solely contain HTTP token code points, then return failure.
+    if (type.is_empty() || !contains_only_http_token_code_points(type))
+        return {};
+
+    // 5. If position is past the end of input, then return failure.
+    if (lexer.is_eof())
+        return {};
+
+    // 6. Advance position by 1. (This skips past U+002F (/).)
+    lexer.ignore(1);
+
+    // 7. Let subtype be the result of collecting a sequence of code points that are not U+003B (;) from input, given position.
+    auto subtype = lexer.consume_until(';');
+
+    // 8. Remove any trailing HTTP whitespace from subtype.
+    subtype = subtype.trim(http_whitespace, TrimMode::Right);
+
+    // 9. If subtype is the empty string or does not solely contain HTTP token code points, then return failure.
+    if (subtype.is_empty() || !contains_only_http_token_code_points(subtype))
+        return {};
+
+    // 10. Let mimeType be a new MIME type record whose type is type, in ASCII lowercase, and subtype is subtype, in ASCII lowercase.
+    auto mime_type = MimeType(type.to_lowercase_string(), subtype.to_lowercase_string());
+
+    // 11. While position is not past the end of input:
+    while (!lexer.is_eof()) {
+        // 1. Advance position by 1. (This skips past U+003B (;).)
+        lexer.ignore(1);
+
+        // 2. Collect a sequence of code points that are HTTP whitespace from input given position.
+        lexer.ignore_while(is_any_of(http_whitespace));
+
+        // 3. Let parameterName be the result of collecting a sequence of code points that are not U+003B (;) or U+003D (=) from input, given position.
+        auto parameter_name = lexer.consume_until([](char ch) {
+            return ch == ';' || ch == '=';
+        });
+
+        // 4. Set parameterName to parameterName, in ASCII lowercase.
+        // NOTE: Reassigning to parameter_name here causes a UAF when trying to use parameter_name down the road.
+        auto lowercase_parameter_name = parameter_name.to_lowercase_string();
+
+        // 5. If position is not past the end of input, then:
+        if (!lexer.is_eof()) {
+            // 1. If the code point at position within input is U+003B (;), then continue.
+            if (lexer.peek() == ';')
+                continue;
+
+            // 2. Advance position by 1. (This skips past U+003D (=).)
+            lexer.ignore(1);
+        }
+
+        // 6. If position is past the end of input, then break.
+        // NOTE: This is not an `else` because the ignore on step 11.5.2 could put us past the end of the input.
+        if (lexer.is_eof())
+            break;
+
+        // 7. Let parameterValue be null.
+        String parameter_value;
+
+        // 8. If the code point at position within input is U+0022 ("), then:
+        if (lexer.peek() == '"') {
+            // 1. Set parameterValue to the result of collecting an HTTP quoted string from input, given position and the extract-value flag.
+            parameter_value = collect_an_http_quoted_string(lexer, Fetch::HttpQuotedStringExtractValue::Yes);
+
+            // 2. Collect a sequence of code points that are not U+003B (;) from input, given position.
+            // NOTE: This uses the predicate version as the ignore_until(char) version will also ignore the ';'.
+            lexer.ignore_until([](char ch) {
+                return ch == ';';
+            });
+        }
+
+        // 9. Otherwise:
+        else {
+            // 1. Set parameterValue to the result of collecting a sequence of code points that are not U+003B (;) from input, given position.
+            parameter_value = lexer.consume_until(';');
+
+            // 2. Remove any trailing HTTP whitespace from parameterValue.
+            parameter_value = parameter_value.trim(http_whitespace, TrimMode::Right);
+
+            // 3. If parameterValue is the empty string, then continue.
+            if (parameter_value.is_empty())
+                continue;
+        }
+
+        // 10. If all of the following are true
+        //     - parameterName is not the empty string
+        //     - parameterName solely contains HTTP token code points
+        //     - parameterValue solely contains HTTP quoted-string token code points
+        //     - mimeType’s parameters[parameterName] does not exist
+        //     then set mimeType’s parameters[parameterName] to parameterValue.
+        if (!parameter_name.is_empty()
+            && contains_only_http_token_code_points(lowercase_parameter_name)
+            && contains_only_http_quoted_string_token_code_points(parameter_value)
+            && !mime_type.m_parameters.contains(lowercase_parameter_name)) {
+            mime_type.m_parameters.set(lowercase_parameter_name, parameter_value);
+        }
+    }
+
+    // 12. Return mimeType.
+    return Optional<MimeType> { move(mime_type) };
+}
+
+// https://mimesniff.spec.whatwg.org/#mime-type-essence
+String MimeType::essence() const
+{
+    // The essence of a MIME type mimeType is mimeType’s type, followed by U+002F (/), followed by mimeType’s subtype.
+    // FIXME: I believe this can easily be cached as I don't think anything directly changes the type and subtype.
+    return String::formatted("{}/{}", m_type, m_subtype);
+}
+
+void MimeType::set_parameter(String const& name, String const& value)
+{
+    // https://mimesniff.spec.whatwg.org/#parameters
+    // A MIME type’s parameters is an ordered map whose keys are ASCII strings and values are strings limited to HTTP quoted-string token code points.
+    VERIFY(contains_only_http_quoted_string_token_code_points(name));
+    VERIFY(contains_only_http_quoted_string_token_code_points(value));
+    m_parameters.set(name, value);
+}
+
+}
diff --git a/Userland/Libraries/LibWeb/MimeSniff/MimeType.h b/Userland/Libraries/LibWeb/MimeSniff/MimeType.h
new file mode 100644
index 0000000000..e90bc237ce
--- /dev/null
+++ b/Userland/Libraries/LibWeb/MimeSniff/MimeType.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2022, Luke Wilde <lukew@serenityos.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#pragma once
+
+#include <AK/HashMap.h>
+#include <AK/String.h>
+
+namespace Web::MimeSniff {
+
+// https://mimesniff.spec.whatwg.org/#mime-type
+class MimeType {
+public:
+    static Optional<MimeType> from_string(StringView);
+
+    MimeType(String type, String subtype);
+    ~MimeType();
+
+    String const& type() const { return m_type; }
+    String const& subtype() const { return m_subtype; }
+    OrderedHashMap<String, String> const& parameters() const { return m_parameters; }
+
+    void set_parameter(String const& name, String const& value);
+
+    String essence() const;
+
+private:
+    // https://mimesniff.spec.whatwg.org/#type
+    // A MIME type’s type is a non-empty ASCII string.
+    String m_type;
+
+    // https://mimesniff.spec.whatwg.org/#subtype
+    // A MIME type’s subtype is a non-empty ASCII string.
+    String m_subtype;
+
+    // https://mimesniff.spec.whatwg.org/#parameters
+    // A MIME type’s parameters is an ordered map whose keys are ASCII strings and values are strings limited to HTTP quoted-string token code points. It is initially empty.
+    OrderedHashMap<String, String> m_parameters;
+};
+
+}