AK: Add a new, spec-compliant URLParser

This adds a new URL parser, which aims to be compliant with the URL specification (https://url.spec.whatwg.org/). It also contains a rudimentary data URL parser.
author: Max Wipfli <mail@maxwipfli.ch> 2021-05-25 22:13:15 +0200
committer: Andreas Kling <kling@serenityos.org> 2021-06-01 09:28:05 +0200
commit: 0d0ed4962f066bd390d55d25072c34532ba6db93 (patch)
tree: 6fc966b3cd339b576ab145d6ea4ae33c60ca4e0c /AK/URLParser.cpp
parent: 8a938a3e25951e3bf0ad833256d78c9188819a66 (diff)
download: serenity-0d0ed4962f066bd390d55d25072c34532ba6db93.zip
1 files changed, 688 insertions, 0 deletions
diff --git a/AK/URLParser.cpp b/AK/URLParser.cpp
new file mode 100644
index 0000000000..8b5a9ffb6f
--- /dev/null
+++ b/AK/URLParser.cpp
@@ -0,0 +1,688 @@
+/*
+ * Copyright (c) 2021, Max Wipfli <mail@maxwipfli.ch>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include <AK/Debug.h>
+#include <AK/Optional.h>
+#include <AK/SourceLocation.h>
+#include <AK/String.h>
+#include <AK/StringBuilder.h>
+#include <AK/StringUtils.h>
+#include <AK/URLParser.h>
+#include <AK/Utf8View.h>
+
+namespace AK {
+
+// Returns true if code_point is an ASCII letter (U+0041 A to U+005A Z, or U+0061 a to U+007A z).
+constexpr bool is_ascii_alpha(u32 code_point)
+{
+    bool const is_lowercase = code_point >= 'a' && code_point <= 'z';
+    bool const is_uppercase = code_point >= 'A' && code_point <= 'Z';
+    return is_lowercase || is_uppercase;
+}
+
+// Returns true if code_point is an ASCII digit (U+0030 0 to U+0039 9).
+constexpr bool is_ascii_digit(u32 code_point)
+{
+    if (code_point < '0')
+        return false;
+    return code_point <= '9';
+}
+
+// Returns true if code_point is either an ASCII letter or an ASCII digit.
+constexpr bool is_ascii_alphanumeric(u32 code_point)
+{
+    if (is_ascii_digit(code_point))
+        return true;
+    return is_ascii_alpha(code_point);
+}
+
+// Returns true if code_point is an ASCII hexadecimal digit (0-9, a-f or A-F).
+constexpr bool is_ascii_hex_digit(u32 code_point)
+{
+    if (is_ascii_digit(code_point))
+        return true;
+    bool const is_lower_hex = 'a' <= code_point && code_point <= 'f';
+    bool const is_upper_hex = 'A' <= code_point && code_point <= 'F';
+    return is_lower_hex || is_upper_hex;
+}
+
+// Approximates the "URL code point" definition: ASCII alphanumerics, a fixed set of ASCII punctuation,
+// and (for now) every code point at or above U+00A0.
+constexpr bool is_url_code_point(u32 code_point)
+{
+    // FIXME: [...] and code points in the range U+00A0 to U+10FFFD, inclusive, excluding surrogates and noncharacters.
+    if (code_point >= 0xA0)
+        return true;
+    if (is_ascii_alphanumeric(code_point))
+        return true;
+    return "!$&'()*+,-./:;=?@_~"sv.contains(code_point);
+}
+
+// Logs a URL validation error together with the source location it was reported from.
+// The message is only emitted through dbgln_if when URL_PARSER_DEBUG is enabled; this function does not abort or alter parsing.
+// NOTE(review): location defaults to SourceLocation::current(), which presumably resolves to the call site — confirm against AK/SourceLocation.h.
+static void report_validation_error(const SourceLocation& location = SourceLocation::current())
+{
+    dbgln_if(URL_PARSER_DEBUG, "URLParser::parse: Validation error! {}", location);
+}
+
+// Parses the host of a non-special URL ("opaque host").
+// Returns an empty Optional if input contains a forbidden host code point (other than '%'),
+// otherwise the input percent-encoded with the C0 control percent-encode set.
+static Optional<String> parse_opaque_host(const StringView& input)
+{
+    // Forbidden host code points, excluding U+0025 (%).
+    for (auto forbidden : "\0\t\n\r #/:<>?@[\\]^|"sv) {
+        if (!input.contains(forbidden))
+            continue;
+        report_validation_error();
+        return {};
+    }
+    // FIXME: If input contains a code point that is not a URL code point and not U+0025 (%), validation error.
+    // FIXME: If input contains a U+0025 (%) and the two code points following it are not ASCII hex digits, validation error.
+    return URL::percent_encode(input, URL::PercentEncodeSet::C0Control);
+}
+
+// Placeholder for the IPv4 parser: currently performs no validation or normalization and simply
+// returns the input unchanged.
+static Optional<String> parse_ipv4_address(const StringView& input)
+{
+    // FIXME: Implement the correct IPv4 parser as specified by https://url.spec.whatwg.org/#concept-ipv4-parser.
+    return input;
+}
+
+// https://url.spec.whatwg.org/#concept-host-parser
+// NOTE: This is a very bare-bones implementation.
+// Returns an empty Optional when the host is rejected. When is_not_special is set, the input is handled as an opaque host;
+// otherwise it is percent-decoded, checked for forbidden host code points and passed to the IPv4 parser.
+static Optional<String> parse_host(const StringView& input, bool is_not_special = false)
+{
+    bool const has_opening_bracket = input.starts_with('[');
+    if (has_opening_bracket) {
+        bool const has_closing_bracket = input.ends_with(']');
+        if (!has_closing_bracket) {
+            report_validation_error();
+            return {};
+        }
+        // FIXME: Return the result of IPv6 parsing input with its leading U+005B ([) and trailing U+005D (]) removed.
+        TODO();
+    }
+
+    if (is_not_special)
+        return parse_opaque_host(input);
+    VERIFY(!input.is_empty());
+
+    // FIXME: Let domain be the result of running UTF-8 decode without BOM on the percent-decoding of input.
+    // FIXME: Let asciiDomain be the result of running domain to ASCII on domain.
+    // Until both are implemented, the percent-decoded input doubles as the ASCII domain.
+    auto decoded_domain = URL::percent_decode(input);
+
+    // Forbidden host code points (this time including U+0025 (%)).
+    for (auto forbidden : "\0\t\n\r #%/:<>?@[\\]^|"sv) {
+        if (!decoded_domain.view().contains(forbidden))
+            continue;
+        report_validation_error();
+        return {};
+    }
+
+    return parse_ipv4_address(decoded_domain);
+}
+
+// https://url.spec.whatwg.org/#start-with-a-windows-drive-letter
+// Returns true if input begins with a Windows drive letter: an ASCII letter followed by ':' or '|',
+// and then either the end of the input or one of '/', '\\', '?' or '#'.
+constexpr bool starts_with_windows_drive_letter(const StringView& input)
+{
+    if (input.length() < 2)
+        return false;
+    // Both conditions have to hold: the first code point must be a letter AND the second one must be ':' or '|'.
+    // Reject as soon as either of them fails.
+    if (!is_ascii_alpha(input[0]) || !(input[1] == ':' || input[1] == '|'))
+        return false;
+    if (input.length() == 2)
+        return true;
+    return "/\\?#"sv.contains(input[2]);
+}
+
+// Returns true if input is exactly two code units long: an ASCII letter followed by ':' or '|'.
+constexpr bool is_windows_drive_letter(const StringView& input)
+{
+    if (input.length() != 2 || !is_ascii_alpha(input[0]))
+        return false;
+    return input[1] == ':' || input[1] == '|';
+}
+
+// Returns true if input is exactly an ASCII letter followed by ':' (the '|' form is not considered normalized).
+constexpr bool is_normalized_windows_drive_letter(const StringView& input)
+{
+    if (input.length() != 2)
+        return false;
+    if (!is_ascii_alpha(input[0]))
+        return false;
+    return input[1] == ':';
+}
+
+// Returns true if input is "." or its percent-encoded form "%2e" (compared case-insensitively).
+constexpr bool is_single_dot_path_segment(const StringView& input)
+{
+    if (input == "."sv)
+        return true;
+    return input.equals_ignoring_case("%2e"sv);
+}
+
+// Returns true if input is ".." or any variant in which one or both dots are written as "%2e"
+// (the percent-encoded variants are compared case-insensitively).
+constexpr bool is_double_dot_path_segment(const StringView& input)
+{
+    if (input == ".."sv)
+        return true;
+    if (input.equals_ignoring_case(".%2e"sv))
+        return true;
+    if (input.equals_ignoring_case("%2e."sv))
+        return true;
+    return input.equals_ignoring_case("%2e%2e"sv);
+}
+
+// https://fetch.spec.whatwg.org/#data-urls
+// FIXME: This only loosely follows the spec, as we use the same class for "regular" and data URLs, unlike the spec.
+// Parses a "data:" URL of the form "data:<mime type>[;base64],<body>".
+// raw_input must start with "data:" (this is VERIFY'd). Returns an empty Optional if there is no ',' separating
+// the MIME type from the body; otherwise returns a URL built from the MIME type, the percent-decoded body and the base64 flag.
+Optional<URL> URLParser::parse_data_url(const StringView& raw_input)
+{
+    dbgln_if(URL_PARSER_DEBUG, "URLParser::parse_data_url: Parsing '{}'.", raw_input);
+    VERIFY(raw_input.starts_with("data:"));
+    auto input = raw_input.substring_view(5);
+    auto comma_offset = input.find(',');
+    if (!comma_offset.has_value())
+        return {};
+    auto mime_type = input.substring_view(0, comma_offset.value());
+    // FIXME: Strip leading and trailing ASCII whitespace from mimeType
+    auto encoded_body = input.substring_view(comma_offset.value() + 1);
+    auto body = URL::percent_decode(encoded_body);
+    bool is_base_64_encoded = false;
+    if (mime_type.ends_with(";base64", CaseSensitivity::CaseInsensitive)) {
+        is_base_64_encoded = true;
+        // Drop the trailing ";base64" (7 characters).
+        mime_type = mime_type.substring_view(0, mime_type.length() - 7);
+    }
+
+    // NOTE: mime_type is a non-owning view. When "text/plain" has to be prepended, the rebuilt string must outlive the view,
+    //       so it is stored in a String declared in this scope rather than in a temporary.
+    String mime_type_with_default_type;
+    if (mime_type.starts_with(";")) {
+        StringBuilder builder;
+        builder.append("text/plain");
+        builder.append(mime_type);
+        mime_type_with_default_type = builder.to_string();
+        mime_type = mime_type_with_default_type;
+    }
+
+    URL url { mime_type, move(body), is_base_64_encoded };
+    dbgln_if(URL_PARSER_DEBUG, "URLParser::parse_data_url: Parsed data URL to be '{}'.", url.serialize());
+    return url;
+}
+
+// https://url.spec.whatwg.org/#concept-basic-url-parser
+// Parses raw_input into a URL, optionally resolving it against base_url (which may be null).
+// Inputs starting with "data:" are handed to parse_data_url(). On failure a default-constructed URL is returned;
+// on success m_valid is set to true before returning.
+// NOTE: This parser assumes a UTF-8 encoding.
+// NOTE: Refrain from using the URL classes setters inside this algorithm. Rather, set the values directly. This bypasses the setters' built-in
+//       validation, which is strictly unnecessary since we set m_valid=true at the end anyways. Furthermore, this algorithm may be used in the
+//       future for validation of URLs, which would then lead to infinite recursion.
+//       The same goes for base_url, because e.g. the port() getter does not always return m_port, and we are interested in the underlying member
+//       variables' values here, not what the URL class presents to its users.
+// NOTE: Since the URL class's member variables contain percent decoded data, we have to deviate from the URL parser specification when setting
+//       some of those values. Because the specification leaves all values percent encoded in their URL data structure, we have to percent decode
+//       everything before setting the member variables.
+URL URLParser::parse(Badge<URL>, const StringView& raw_input, URL const* base_url)
+{
+    dbgln_if(URL_PARSER_DEBUG, "URLParser::parse: Parsing '{}'", raw_input);
+    if (raw_input.is_empty())
+        return {};
+
+    if (raw_input.starts_with("data:")) {
+        auto maybe_url = parse_data_url(raw_input);
+        if (!maybe_url.has_value())
+            return {};
+        return maybe_url.release_value();
+    }
+
+    URL url;
+
+    // NOTE: This removes all leading and trailing C0 control or space characters.
+    // NOTE: Bytes are compared as unsigned values so that UTF-8 bytes >= 0x80 are never mistaken for C0 controls
+    //       when char is a signed type.
+    auto is_c0_control_or_space = [](char byte) { return static_cast<unsigned char>(byte) <= 0x20; };
+    bool has_validation_error = false;
+    size_t start_index = 0;
+    size_t end_index = raw_input.length();
+    // Only the leading run is skipped; scanning stops at the first byte that is not a C0 control or space.
+    while (start_index < end_index && is_c0_control_or_space(raw_input[start_index])) {
+        ++start_index;
+        has_validation_error = true;
+    }
+    // Likewise, only the trailing run is dropped.
+    while (end_index > start_index && is_c0_control_or_space(raw_input[end_index - 1])) {
+        --end_index;
+        has_validation_error = true;
+    }
+    if (has_validation_error)
+        report_validation_error();
+    if (start_index >= end_index)
+        return {};
+
+    auto processed_input = raw_input.substring_view(start_index, end_index - start_index);
+
+    // NOTE: This replaces all ASCII tab and newline characters (U+0009, U+000A and U+000D) with nothing.
+    // NOTE: processed_input is a non-owning view, so the filtered copy has to live until the end of this function.
+    String processed_input_string;
+    if (processed_input.contains("\t") || processed_input.contains("\n") || processed_input.contains("\r")) {
+        report_validation_error();
+        processed_input_string = String(processed_input);
+        processed_input_string.replace("\t", "", true);
+        processed_input_string.replace("\n", "", true);
+        processed_input_string.replace("\r", "", true);
+        processed_input = processed_input_string;
+    }
+
+    State state = State::SchemeStart;
+    StringBuilder buffer;
+    bool at_sign_seen = false;
+    bool inside_brackets = false;
+    bool password_token_seen = false;
+
+    Utf8View input(processed_input);
+    Utf8CodepointIterator iterator = input.begin();
+
+    // Returns everything after the code point the iterator currently points at ("remaining" in the spec).
+    auto get_remaining = [&input, &iterator] {
+        return input.substring_view(iterator - input.begin() + iterator.code_point_length_in_bytes()).as_string();
+    };
+
+    // NOTE: "continue" should only be used to prevent incrementing the iterator, as this is done at the end of the loop.
+    //       ++iterator : "increase pointer by 1"
+    //       continue   : "decrease pointer by 1"
+    // NOTE: The NULL code point is used as the "EOF code point".
+    for (;;) {
+        u32 code_point = 0;
+        if (!iterator.done())
+            code_point = *iterator;
+
+        if constexpr (URL_PARSER_DEBUG) {
+            if (code_point)
+                dbgln("URLParser::parse: State {:2d} with code point '{:c}' (U+{:04X}).", (int)state, code_point, code_point);
+            else
+                dbgln("URLParser::parse: State {:2d} with code point EOF (U+0000).", (int)state);
+        }
+
+        switch (state) {
+        case State::SchemeStart:
+            if (is_ascii_alpha(code_point)) {
+                buffer.append_as_lowercase(code_point);
+                state = State::Scheme;
+            } else {
+                state = State::NoScheme;
+                continue;
+            }
+            break;
+        case State::Scheme:
+            if (is_ascii_alphanumeric(code_point) || code_point == '+' || code_point == '-' || code_point == '.') {
+                buffer.append_as_lowercase(code_point);
+            } else if (code_point == ':') {
+                url.m_scheme = buffer.to_string();
+                buffer.clear();
+                if (url.scheme() == "file") {
+                    if (!get_remaining().starts_with("//")) {
+                        report_validation_error();
+                    }
+                    state = State::File;
+                } else if (url.is_special()) {
+                    if (base_url && base_url->m_scheme == url.m_scheme)
+                        state = State::SpecialRelativeOrAuthority;
+                    else
+                        state = State::SpecialAuthoritySlashes;
+                } else if (get_remaining().starts_with("/")) {
+                    state = State::PathOrAuthority;
+                    ++iterator;
+                } else {
+                    url.m_cannot_be_a_base_url = true;
+                    url.append_path("");
+                    state = State::CannotBeABaseUrlPath;
+                }
+            } else {
+                // Not a scheme after all: start over from the beginning of the input without a scheme.
+                buffer.clear();
+                state = State::NoScheme;
+                iterator = input.begin();
+                continue;
+            }
+            break;
+        case State::NoScheme:
+            if (!base_url || (base_url->m_cannot_be_a_base_url && code_point != '#')) {
+                report_validation_error();
+                return {};
+            } else if (base_url->m_cannot_be_a_base_url && code_point == '#') {
+                url.m_scheme = base_url->m_scheme;
+                url.m_paths = base_url->m_paths;
+                url.m_query = base_url->m_query;
+                url.m_fragment = "";
+                url.m_cannot_be_a_base_url = true;
+                state = State::Fragment;
+            } else if (base_url->m_scheme != "file") {
+                state = State::Relative;
+                continue;
+            } else {
+                state = State::File;
+                continue;
+            }
+            break;
+        case State::SpecialRelativeOrAuthority:
+            if (code_point == '/' && get_remaining().starts_with("/")) {
+                state = State::SpecialAuthorityIgnoreSlashes;
+                ++iterator;
+            } else {
+                report_validation_error();
+                state = State::Relative;
+                continue;
+            }
+            break;
+        case State::PathOrAuthority:
+            if (code_point == '/') {
+                state = State::Authority;
+            } else {
+                state = State::Path;
+                continue;
+            }
+            break;
+        case State::Relative:
+            url.m_scheme = base_url->m_scheme;
+            if (code_point == '/') {
+                state = State::RelativeSlash;
+            } else if (url.is_special() && code_point == '\\') {
+                report_validation_error();
+                state = State::RelativeSlash;
+            } else {
+                url.m_username = base_url->m_username;
+                url.m_password = base_url->m_password;
+                url.m_host = base_url->m_host;
+                url.m_port = base_url->m_port;
+                url.m_paths = base_url->m_paths;
+                url.m_query = base_url->m_query;
+
+                if (code_point == '?') {
+                    url.m_query = "";
+                    state = State::Query;
+                } else if (code_point == '#') {
+                    url.m_fragment = "";
+                    state = State::Fragment;
+                } else if (code_point != 0) {
+                    url.m_query = {};
+                    if (url.m_paths.size())
+                        url.m_paths.remove(url.m_paths.size() - 1);
+                    state = State::Path;
+                    continue;
+                }
+            }
+            break;
+        case State::RelativeSlash:
+            if (url.is_special() && (code_point == '/' || code_point == '\\')) {
+                if (code_point == '\\')
+                    report_validation_error();
+                state = State::SpecialAuthorityIgnoreSlashes;
+            } else if (code_point == '/') {
+                state = State::Authority;
+            } else {
+                url.m_username = base_url->m_username;
+                url.m_password = base_url->m_password;
+                url.m_host = base_url->m_host;
+                url.m_port = base_url->m_port;
+                state = State::Path;
+                continue;
+            }
+            break;
+        case State::SpecialAuthoritySlashes:
+            if (code_point == '/' && get_remaining().starts_with("/")) {
+                state = State::SpecialAuthorityIgnoreSlashes;
+                ++iterator;
+            } else {
+                report_validation_error();
+                state = State::SpecialAuthorityIgnoreSlashes;
+                continue;
+            }
+            break;
+        case State::SpecialAuthorityIgnoreSlashes:
+            if (code_point != '/' && code_point != '\\') {
+                state = State::Authority;
+                continue;
+            } else {
+                report_validation_error();
+            }
+            break;
+        case State::Authority:
+            if (code_point == '@') {
+                report_validation_error();
+                if (at_sign_seen) {
+                    auto content = buffer.to_string();
+                    buffer.clear();
+                    buffer.append("%40");
+                    buffer.append(content);
+                }
+                at_sign_seen = true;
+                StringBuilder builder;
+                // NOTE: The string has to be kept alive in a named variable; iterating over a view of a temporary would
+                //       read freed memory, since only the outermost temporary of a range-for initializer is lifetime-extended.
+                auto userinfo = buffer.to_string();
+                for (auto c : Utf8View(StringView(userinfo))) {
+                    if (c == ':' && !password_token_seen) {
+                        password_token_seen = true;
+                        continue;
+                    }
+                    builder.clear();
+                    if (password_token_seen) {
+                        builder.append(url.password());
+                        URL::append_percent_encoded_if_necessary(builder, c, URL::PercentEncodeSet::Userinfo);
+                        // NOTE: This has to be encoded and then decoded because the original sequence could contain already percent-encoded sequences.
+                        url.m_password = URL::percent_decode(builder.to_string());
+                    } else {
+                        builder.append(url.username());
+                        URL::append_percent_encoded_if_necessary(builder, c, URL::PercentEncodeSet::Userinfo);
+                        // NOTE: This has to be encoded and then decoded because the original sequence could contain already percent-encoded sequences.
+                        url.m_username = URL::percent_decode(builder.to_string());
+                    }
+                }
+                buffer.clear();
+            } else if (code_point == 0 || code_point == '/' || code_point == '?' || code_point == '#' || (url.is_special() && code_point == '\\')) {
+                if (at_sign_seen && buffer.is_empty()) {
+                    report_validation_error();
+                    return {};
+                }
+                // NOTE: This decreases the iterator by the number of code points in buffer plus one.
+                iterator = input.iterator_at_byte_offset(iterator - input.begin() - buffer.length() - 1);
+                buffer.clear();
+                state = State::Host;
+            } else {
+                buffer.append_code_point(code_point);
+            }
+            break;
+        case State::Host:
+        case State::Hostname:
+            if (code_point == ':' && !inside_brackets) {
+                if (buffer.is_empty()) {
+                    report_validation_error();
+                    return {};
+                }
+                auto host = parse_host(buffer.to_string(), !url.is_special());
+                if (!host.has_value())
+                    return {};
+                url.m_host = host.release_value();
+                buffer.clear();
+                state = State::Port;
+            } else if (code_point == 0 || code_point == '/' || code_point == '?' || code_point == '#' || (url.is_special() && code_point == '\\')) {
+                if (url.is_special() && buffer.is_empty()) {
+                    report_validation_error();
+                    return {};
+                }
+                auto host = parse_host(buffer.to_string(), !url.is_special());
+                if (!host.has_value())
+                    return {};
+                url.m_host = host.value();
+                buffer.clear();
+                state = State::Port;
+                continue;
+            } else if (code_point == '[') {
+                // NOTE(review): The spec also appends '[' and ']' to the buffer. They are dropped here, so parse_host() never
+                //               sees a bracketed IPv6 literal (which it could not parse yet anyway).
+                inside_brackets = true;
+            } else if (code_point == ']') {
+                inside_brackets = false;
+            } else {
+                buffer.append_code_point(code_point);
+            }
+            break;
+        case State::Port:
+            if (is_ascii_digit(code_point)) {
+                buffer.append_code_point(code_point);
+            } else if (code_point == 0 || code_point == '/' || code_point == '?' || code_point == '#' || (url.is_special() && code_point == '\\')) {
+                if (!buffer.is_empty()) {
+                    auto port = buffer.to_string().to_uint();
+                    if (!port.has_value() || port.value() > 65535) {
+                        report_validation_error();
+                        return {};
+                    }
+                    // The scheme's default port is stored as 0.
+                    if (port.value() == URL::default_port_for_scheme(url.scheme()))
+                        url.m_port = 0;
+                    else
+                        url.m_port = port.value();
+                    buffer.clear();
+                }
+                state = State::PathStart;
+                continue;
+            } else {
+                report_validation_error();
+                return {};
+            }
+            break;
+        case State::File:
+            url.m_scheme = "file";
+            url.m_host = "";
+            if (code_point == '/' || code_point == '\\') {
+                if (code_point == '\\')
+                    report_validation_error();
+                state = State::FileSlash;
+            } else if (base_url && base_url->m_scheme == "file") {
+                url.m_host = base_url->m_host;
+                url.m_paths = base_url->m_paths;
+                url.m_query = base_url->m_query;
+                if (code_point == '?') {
+                    url.m_query = "";
+                    state = State::Query;
+                } else if (code_point == '#') {
+                    url.m_fragment = "";
+                    state = State::Fragment;
+                } else if (code_point != 0) {
+                    url.m_query = {};
+                    auto substring_from_pointer = input.substring_view(iterator - input.begin()).as_string();
+                    if (!starts_with_windows_drive_letter(substring_from_pointer)) {
+                        if (!url.paths().is_empty() && !(url.scheme() == "file" && url.paths().size() == 1 && is_normalized_windows_drive_letter(url.paths()[0])))
+                            url.m_paths.remove(url.m_paths.size() - 1);
+                    } else {
+                        report_validation_error();
+                        url.m_paths.clear();
+                    }
+                    state = State::Path;
+                    continue;
+                }
+            } else {
+                // No slash and no usable file base URL: per the spec, continue in the path state with the same code point.
+                state = State::Path;
+                continue;
+            }
+            break;
+        case State::FileSlash:
+            if (code_point == '/' || code_point == '\\') {
+                if (code_point == '\\')
+                    report_validation_error();
+                state = State::FileHost;
+            } else {
+                if (base_url && base_url->m_scheme == "file") {
+                    url.m_host = base_url->m_host;
+                    auto substring_from_pointer = input.substring_view(iterator - input.begin()).as_string();
+                    // Inherit the base URL's drive letter, but only if the base URL actually has a first path segment.
+                    if (!starts_with_windows_drive_letter(substring_from_pointer) && !base_url->m_paths.is_empty() && is_normalized_windows_drive_letter(base_url->m_paths[0]))
+                        url.append_path(base_url->m_paths[0]);
+                }
+                // The spec switches to the path state regardless of whether a file base URL is present.
+                state = State::Path;
+                continue;
+            }
+            break;
+        case State::FileHost:
+            if (code_point == 0 || code_point == '/' || code_point == '\\' || code_point == '?' || code_point == '#') {
+                if (is_windows_drive_letter(buffer.to_string())) {
+                    // The buffer is intentionally kept: the path state consumes it as the first path segment.
+                    report_validation_error();
+                    state = State::Path;
+                } else if (buffer.is_empty()) {
+                    url.m_host = "";
+                    state = State::PathStart;
+                } else {
+                    auto host = parse_host(buffer.to_string(), true);
+                    if (!host.has_value())
+                        return {};
+                    if (host.value() == "localhost")
+                        host = "";
+                    url.m_host = host.release_value();
+                    buffer.clear();
+                    state = State::PathStart;
+                }
+                continue;
+            } else {
+                buffer.append_code_point(code_point);
+            }
+            break;
+        case State::PathStart:
+            if (url.is_special()) {
+                if (code_point == '\\')
+                    report_validation_error();
+                state = State::Path;
+                if (code_point != '/' && code_point != '\\')
+                    continue;
+            } else if (code_point == '?') {
+                url.m_query = "";
+                state = State::Query;
+            } else if (code_point == '#') {
+                url.m_fragment = "";
+                state = State::Fragment;
+            } else if (code_point != 0) {
+                state = State::Path;
+                if (code_point != '/')
+                    continue;
+            }
+            break;
+        case State::Path:
+            if (code_point == 0 || code_point == '/' || (url.is_special() && code_point == '\\') || code_point == '?' || code_point == '#') {
+                if (url.is_special() && code_point == '\\')
+                    report_validation_error();
+                if (is_double_dot_path_segment(buffer.to_string())) {
+                    if (!url.m_paths.is_empty() && !(url.m_scheme == "file" && url.m_paths.size() == 1 && is_normalized_windows_drive_letter(url.m_paths[0])))
+                        url.m_paths.remove(url.m_paths.size() - 1);
+                    if (code_point != '/' && !(url.is_special() && code_point == '\\'))
+                        url.append_path("");
+                } else if (is_single_dot_path_segment(buffer.to_string()) && code_point != '/' && !(url.is_special() && code_point == '\\')) {
+                    url.append_path("");
+                } else if (!is_single_dot_path_segment(buffer.to_string())) {
+                    if (url.m_scheme == "file" && url.m_paths.is_empty() && is_windows_drive_letter(buffer.to_string())) {
+                        // Normalize the drive letter so that it always ends in ':'.
+                        auto drive_letter = buffer.to_string()[0];
+                        buffer.clear();
+                        buffer.append(drive_letter);
+                        buffer.append(':');
+                    }
+                    // NOTE: This needs to be percent decoded since the member variables contain decoded data.
+                    url.append_path(URL::percent_decode(buffer.to_string()));
+                }
+                buffer.clear();
+                if (code_point == '?') {
+                    url.m_query = "";
+                    state = State::Query;
+                } else if (code_point == '#') {
+                    url.m_fragment = "";
+                    state = State::Fragment;
+                }
+            } else {
+                if (!is_url_code_point(code_point) && code_point != '%')
+                    report_validation_error();
+                // FIXME: If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error.
+                URL::append_percent_encoded_if_necessary(buffer, code_point, URL::PercentEncodeSet::Path);
+            }
+            break;
+        case State::CannotBeABaseUrlPath:
+            // NOTE: This does not follow the spec exactly but rather uses the buffer and only sets the path on EOF.
+            // NOTE: Verify that the assumptions required for this simplification are correct.
+            VERIFY(url.m_paths.size() == 1 && url.m_paths[0].is_empty());
+            if (code_point == '?') {
+                // NOTE: This needs to be percent decoded since the member variables contain decoded data.
+                url.m_paths[0] = URL::percent_decode(buffer.to_string());
+                url.m_query = "";
+                state = State::Query;
+            } else if (code_point == '#') {
+                // NOTE: This needs to be percent decoded since the member variables contain decoded data.
+                url.m_paths[0] = URL::percent_decode(buffer.to_string());
+                url.m_fragment = "";
+                state = State::Fragment;
+            } else {
+                if (code_point != 0 && !is_url_code_point(code_point) && code_point != '%')
+                    report_validation_error();
+                // FIXME: If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error.
+                if (code_point != 0) {
+                    URL::append_percent_encoded_if_necessary(buffer, code_point, URL::PercentEncodeSet::C0Control);
+                } else {
+                    // NOTE: This needs to be percent decoded since the member variables contain decoded data.
+                    url.m_paths[0] = URL::percent_decode(buffer.to_string());
+                }
+            }
+            break;
+        case State::Query:
+            if (code_point == '#' || code_point == 0) {
+                VERIFY(url.m_query == "");
+                auto query_percent_encode_set = url.is_special() ? URL::PercentEncodeSet::SpecialQuery : URL::PercentEncodeSet::Query;
+                // NOTE: This has to be encoded and then decoded because the original sequence could contain already percent-encoded sequences.
+                url.m_query = URL::percent_decode(URL::percent_encode(buffer.to_string(), query_percent_encode_set));
+                buffer.clear();
+                if (code_point == '#') {
+                    url.m_fragment = "";
+                    state = State::Fragment;
+                }
+            } else if (code_point != 0) {
+                if (!is_url_code_point(code_point) && code_point != '%')
+                    report_validation_error();
+                // FIXME: If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error.
+                buffer.append_code_point(code_point);
+            }
+            break;
+        case State::Fragment:
+            // NOTE: This does not follow the spec exactly but rather uses the buffer and only sets the fragment on EOF.
+            if (code_point) {
+                if (!is_url_code_point(code_point) && code_point != '%')
+                    report_validation_error();
+                // FIXME: If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error.
+                buffer.append_code_point(code_point);
+            } else {
+                // NOTE: This needs to be percent decoded since the member variables contain decoded data.
+                url.m_fragment = URL::percent_decode(buffer.to_string());
+                buffer.clear();
+            }
+            break;
+        default:
+            VERIFY_NOT_REACHED();
+        }
+
+        if (iterator.done())
+            break;
+        ++iterator;
+    }
+
+    url.m_valid = true;
+    dbgln_if(URL_PARSER_DEBUG, "URLParser::parse: Parsed URL to be '{}'.", url.serialize());
+    return url;
+}
+
+}
author	Max Wipfli <mail@maxwipfli.ch>	2021-05-25 22:13:15 +0200
committer	Andreas Kling <kling@serenityos.org>	2021-06-01 09:28:05 +0200
commit	0d0ed4962f066bd390d55d25072c34532ba6db93 (patch)
tree	6fc966b3cd339b576ab145d6ea4ae33c60ca4e0c /AK/URLParser.cpp
parent	8a938a3e25951e3bf0ad833256d78c9188819a66 (diff)
download	serenity-0d0ed4962f066bd390d55d25072c34532ba6db93.zip