diff options
author | Max Wipfli <mail@maxwipfli.ch> | 2021-05-25 22:13:15 +0200 |
---|---|---|
committer | Andreas Kling <kling@serenityos.org> | 2021-06-01 09:28:05 +0200 |
commit | 0d0ed4962f066bd390d55d25072c34532ba6db93 (patch) | |
tree | 6fc966b3cd339b576ab145d6ea4ae33c60ca4e0c /AK/URLParser.cpp | |
parent | 8a938a3e25951e3bf0ad833256d78c9188819a66 (diff) | |
download | serenity-0d0ed4962f066bd390d55d25072c34532ba6db93.zip |
AK: Add a new, spec-compliant URLParser
This adds a new URL parser, which aims to be compliant with the URL
specification (https://url.spec.whatwg.org/). It also contains a
rudimentary data URL parser.
Diffstat (limited to 'AK/URLParser.cpp')
-rw-r--r-- | AK/URLParser.cpp | 688 |
1 files changed, 688 insertions, 0 deletions
diff --git a/AK/URLParser.cpp b/AK/URLParser.cpp new file mode 100644 index 0000000000..8b5a9ffb6f --- /dev/null +++ b/AK/URLParser.cpp @@ -0,0 +1,688 @@ +/* + * Copyright (c) 2021, Max Wipfli <mail@maxwipfli.ch> + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#include <AK/Debug.h> +#include <AK/Optional.h> +#include <AK/SourceLocation.h> +#include <AK/String.h> +#include <AK/StringBuilder.h> +#include <AK/StringUtils.h> +#include <AK/URLParser.h> +#include <AK/Utf8View.h> + +namespace AK { + +constexpr bool is_ascii_alpha(u32 code_point) +{ + return ('a' <= code_point && code_point <= 'z') || ('A' <= code_point && code_point <= 'Z'); +} + +constexpr bool is_ascii_digit(u32 code_point) +{ + return '0' <= code_point && code_point <= '9'; +} + +constexpr bool is_ascii_alphanumeric(u32 code_point) +{ + return is_ascii_alpha(code_point) || is_ascii_digit(code_point); +} + +constexpr bool is_ascii_hex_digit(u32 code_point) +{ + return is_ascii_digit(code_point) || (code_point >= 'a' && code_point <= 'f') || (code_point >= 'A' && code_point <= 'F'); +} + +constexpr bool is_url_code_point(u32 code_point) +{ + // FIXME: [...] and code points in the range U+00A0 to U+10FFFD, inclusive, excluding surrogates and noncharacters. + return is_ascii_alphanumeric(code_point) || code_point >= 0xA0 || "!$&'()*+,-./:;=?@_~"sv.contains(code_point); +} + +static void report_validation_error(const SourceLocation& location = SourceLocation::current()) +{ + dbgln_if(URL_PARSER_DEBUG, "URLParser::parse: Validation error! {}", location); +} + +static Optional<String> parse_opaque_host(const StringView& input) +{ + auto forbidden_host_code_points_excluding_percent = "\0\t\n\r #/:<>?@[\\]^|"sv; + for (auto code_point : forbidden_host_code_points_excluding_percent) { + if (input.contains(code_point)) { + report_validation_error(); + return {}; + } + } + // FIXME: If input contains a code point that is not a URL code point and not U+0025 (%), validation error. + // FIXME: If input contains a U+0025 (%) and the two code points following it are not ASCII hex digits, validation error. + return URL::percent_encode(input, URL::PercentEncodeSet::C0Control); +} + +static Optional<String> parse_ipv4_address(const StringView& input) +{ + // FIXME: Implement the correct IPv4 parser as specified by https://url.spec.whatwg.org/#concept-ipv4-parser. + return input; +} + +// https://url.spec.whatwg.org/#concept-host-parser +// NOTE: This is a very bare-bones implementation. +static Optional<String> parse_host(const StringView& input, bool is_not_special = false) +{ + if (input.starts_with('[')) { + if (!input.ends_with(']')) { + report_validation_error(); + return {}; + } + // FIXME: Return the result of IPv6 parsing input with its leading U+005B ([) and trailing U+005D (]) removed. + TODO(); + } + + if (is_not_special) + return parse_opaque_host(input); + VERIFY(!input.is_empty()); + + // FIXME: Let domain be the result of running UTF-8 decode without BOM on the percent-decoding of input. + auto domain = URL::percent_decode(input); + // FIXME: Let asciiDomain be the result of running domain to ASCII on domain. + auto& ascii_domain = domain; + + auto forbidden_host_code_points = "\0\t\n\r #%/:<>?@[\\]^|"sv; + for (auto code_point : forbidden_host_code_points) { + if (ascii_domain.view().contains(code_point)) { + report_validation_error(); + return {}; + } + } + + auto ipv4_host = parse_ipv4_address(ascii_domain); + return ipv4_host; +} + +constexpr bool starts_with_windows_drive_letter(const StringView& input) +{ + if (input.length() < 2) + return false; + if (!is_ascii_alpha(input[0]) && !(input[1] == ':' || input[1] == '|')) + return false; + if (input.length() == 2) + return true; + return "/\\?#"sv.contains(input[2]); +} + +constexpr bool is_windows_drive_letter(const StringView& input) +{ + return input.length() == 2 && is_ascii_alpha(input[0]) && (input[1] == ':' || input[1] == '|'); +} + +constexpr bool is_normalized_windows_drive_letter(const StringView& input) +{ + return input.length() == 2 && is_ascii_alpha(input[0]) && input[1] == ':'; +} + +constexpr bool is_single_dot_path_segment(const StringView& input) +{ + return input == "."sv || input.equals_ignoring_case("%2e"sv); +} + +constexpr bool is_double_dot_path_segment(const StringView& input) +{ + return input == ".."sv || input.equals_ignoring_case(".%2e"sv) || input.equals_ignoring_case("%2e."sv) || input.equals_ignoring_case("%2e%2e"sv); +} + +// https://fetch.spec.whatwg.org/#data-urls +// FIXME: This only loosely follow the spec, as we use the same class for "regular" and data URLs, unlike the spec. +Optional<URL> URLParser::parse_data_url(const StringView& raw_input) +{ + dbgln_if(URL_PARSER_DEBUG, "URLParser::parse_data_url: Parsing '{}'.", raw_input); + VERIFY(raw_input.starts_with("data:")); + auto input = raw_input.substring_view(5); + auto comma_offset = input.find(','); + if (!comma_offset.has_value()) + return {}; + auto mime_type = input.substring_view(0, comma_offset.value()); + // FIXME: Strip leading and trailing ASCII whitespace from mimeType + auto encoded_body = input.substring_view(comma_offset.value() + 1); + auto body = URL::percent_decode(encoded_body); + bool is_base_64_encoded = false; + if (mime_type.ends_with(";base64", CaseSensitivity::CaseInsensitive)) { + is_base_64_encoded = true; + mime_type = mime_type.substring_view(0, mime_type.length() - 7); + } + + if (mime_type.starts_with(";")) { + StringBuilder builder; + builder.append("text/plain"); + builder.append(mime_type); + mime_type = builder.to_string(); + } + + URL url { mime_type, move(body), is_base_64_encoded }; + dbgln_if(URL_PARSER_DEBUG, "URLParser::parse_data_url: Parsed data URL to be '{}'.", url.serialize()); + return url; +} + +// https://url.spec.whatwg.org/#concept-basic-url-parser +// NOTE: This parser assumes a UTF-8 encoding. +// NOTE: Refrain from using the URL classes setters inside this algorithm. Rather, set the values directly. This bypasses the setters' built-in +// validation, which is strictly unnecessary since we set m_valid=true at the end anyways. Furthermore, this algorithm may be used in the +// future for validation of URLs, which would then lead to infinite recursion. +// The same goes for base_url, because e.g. the port() getter does not always return m_port, and we are interested in the underlying member +// variables' values here, not what the URL class presents to its users. +// NOTE: Since the URL class's member variables contain percent decoded data, we have to deviate from the URL parser specification when setting +// some of those values. Because the specification leaves all values percent encoded in their URL data structure, we have to percent decode +// everything before setting the member variables. +URL URLParser::parse(Badge<URL>, const StringView& raw_input, URL const* base_url) +{ + dbgln_if(URL_PARSER_DEBUG, "URLParser::parse: Parsing '{}'", raw_input); + if (raw_input.is_empty()) + return {}; + + if (raw_input.starts_with("data:")) { + auto maybe_url = parse_data_url(raw_input); + if (!maybe_url.has_value()) + return {}; + return maybe_url.release_value(); + } + + URL url; + + // NOTE: This removes all leading and trailing C0 control or space characters. + bool has_validation_error = false; + size_t start_index = 0; + size_t end_index = raw_input.length(); + for (size_t i = 0; i < raw_input.length(); ++i) { + if (raw_input[i] <= 0x20) { + ++start_index; + has_validation_error = true; + } + } + for (size_t i = 0; i < raw_input.length(); ++i) { + if (raw_input[raw_input.length() - 1 - i] <= 0x20) { + --end_index; + has_validation_error = true; + } + } + if (has_validation_error) + report_validation_error(); + if (start_index >= end_index) + return {}; + + auto processed_input = raw_input.substring_view(start_index, end_index - start_index); + + // NOTE: This replaces all tab and newline characters with nothing. + if (processed_input.contains("\t") || processed_input.contains("\n")) { + report_validation_error(); + String processed_input_string(processed_input); + processed_input_string.replace("\t", "", true); + processed_input_string.replace("\n", "", true); + processed_input = processed_input_string; + } + + State state = State::SchemeStart; + StringBuilder buffer; + bool at_sign_seen = false; + bool inside_brackets = false; + bool password_token_seen = false; + + Utf8View input(processed_input); + Utf8CodepointIterator iterator = input.begin(); + + auto get_remaining = [&input, &iterator] { + return input.substring_view(iterator - input.begin() + iterator.code_point_length_in_bytes()).as_string(); + }; + + // NOTE: "continue" should only be used to prevent incrementing the iterator, as this is done at the end of the loop. + // ++iterator : "increase pointer by 1" + // continue : "decrease pointer by 1" + // NOTE: The NULL code point is used as the "EOF code point". + for (;;) { + u32 code_point = 0; + if (!iterator.done()) + code_point = *iterator; + + if constexpr (URL_PARSER_DEBUG) { + if (code_point) + dbgln("URLParser::parse: State {:2d} with code point '{:c}' (U+{:04X}).", (int)state, code_point, code_point); + else + dbgln("URLParser::parse: State {:2d} with code point EOF (U+0000).", (int)state); + } + + switch (state) { + case State::SchemeStart: + if (is_ascii_alpha(code_point)) { + buffer.append_as_lowercase(code_point); + state = State::Scheme; + } else { + state = State::NoScheme; + continue; + } + break; + case State::Scheme: + if (is_ascii_alphanumeric(code_point) || code_point == '+' || code_point == '-' || code_point == '.') { + buffer.append_as_lowercase(code_point); + } else if (code_point == ':') { + url.m_scheme = buffer.to_string(); + buffer.clear(); + if (url.scheme() == "file") { + if (!get_remaining().starts_with("//")) { + report_validation_error(); + } + state = State::File; + } else if (url.is_special()) { + if (base_url && base_url->m_scheme == url.m_scheme) + state = State::SpecialRelativeOrAuthority; + else + state = State::SpecialAuthoritySlashes; + } else if (get_remaining().starts_with("/")) { + state = State::PathOrAuthority; + ++iterator; + } else { + url.m_cannot_be_a_base_url = true; + url.append_path(""); + state = State::CannotBeABaseUrlPath; + } + } else { + buffer.clear(); + state = State::NoScheme; + iterator = input.begin(); + continue; + } + break; + case State::NoScheme: + if (!base_url || (base_url->m_cannot_be_a_base_url && code_point != '#')) { + report_validation_error(); + return {}; + } else if (base_url->m_cannot_be_a_base_url && code_point == '#') { + url.m_scheme = base_url->m_scheme; + url.m_paths = base_url->m_paths; + url.m_query = base_url->m_query; + url.m_fragment = ""; + url.m_cannot_be_a_base_url = true; + state = State::Fragment; + } else if (base_url->m_scheme != "file") { + state = State::Relative; + continue; + } else { + state = State::File; + continue; + } + break; + case State::SpecialRelativeOrAuthority: + if (code_point == '/' && get_remaining().starts_with("/")) { + state = State::SpecialAuthorityIgnoreSlashes; + ++iterator; + } else { + report_validation_error(); + state = State::Relative; + continue; + } + break; + case State::PathOrAuthority: + if (code_point == '/') { + state = State::Authority; + } else { + state = State::Path; + continue; + } + break; + case State::Relative: + url.m_scheme = base_url->m_scheme; + if (code_point == '/') { + state = State::RelativeSlash; + } else if (url.is_special() && code_point == '\\') { + report_validation_error(); + state = State::RelativeSlash; + } else { + url.m_username = base_url->m_username; + url.m_password = base_url->m_password; + url.m_host = base_url->m_host; + url.m_port = base_url->m_port; + url.m_paths = base_url->m_paths; + url.m_query = base_url->m_query; + + if (code_point == '?') { + url.m_query = ""; + state = State::Query; + } else if (code_point == '#') { + url.m_fragment = ""; + state = State::Fragment; + } else if (code_point != 0) { + url.m_query = {}; + if (url.m_paths.size()) + url.m_paths.remove(url.m_paths.size() - 1); + state = State::Path; + continue; + } + } + break; + case State::RelativeSlash: + if (url.is_special() && (code_point == '/' || code_point == '\\')) { + if (code_point == '\\') + report_validation_error(); + state = State::SpecialAuthorityIgnoreSlashes; + } else if (code_point == '/') { + state = State::Authority; + } else { + url.m_username = base_url->m_username; + url.m_password = base_url->m_password; + url.m_host = base_url->m_host; + url.m_port = base_url->m_port; + state = State::Path; + continue; + } + break; + case State::SpecialAuthoritySlashes: + if (code_point == '/' && get_remaining().starts_with("/")) { + state = State::SpecialAuthorityIgnoreSlashes; + ++iterator; + } else { + report_validation_error(); + state = State::SpecialAuthorityIgnoreSlashes; + continue; + } + break; + case State::SpecialAuthorityIgnoreSlashes: + if (code_point != '/' && code_point != '\\') { + state = State::Authority; + continue; + } else { + report_validation_error(); + } + break; + case State::Authority: + if (code_point == '@') { + report_validation_error(); + if (at_sign_seen) { + auto content = buffer.to_string(); + buffer.clear(); + buffer.append("%40"); + buffer.append(content); + } + at_sign_seen = true; + StringBuilder builder; + for (auto c : Utf8View(StringView(buffer.to_string()))) { + if (c == ':' && !password_token_seen) { + password_token_seen = true; + continue; + } + builder.clear(); + if (password_token_seen) { + builder.append(url.password()); + URL::append_percent_encoded_if_necessary(builder, c, URL::PercentEncodeSet::Userinfo); + // NOTE: This is has to be encoded and then decoded because the original sequence could contain already percent-encoded sequences. + url.m_password = URL::percent_decode(builder.to_string()); + } else { + builder.append(url.username()); + URL::append_percent_encoded_if_necessary(builder, c, URL::PercentEncodeSet::Userinfo); + // NOTE: This is has to be encoded and then decoded because the original sequence could contain already percent-encoded sequences. + url.m_username = URL::percent_decode(builder.to_string()); + } + } + buffer.clear(); + } else if (code_point == 0 || code_point == '/' || code_point == '?' || code_point == '#' || (url.is_special() && code_point == '\\')) { + if (at_sign_seen && buffer.is_empty()) { + report_validation_error(); + return {}; + } + // NOTE: This decreases the iterator by the number of code points in buffer plus one. + iterator = input.iterator_at_byte_offset(iterator - input.begin() - buffer.length() - 1); + buffer.clear(); + state = State::Host; + } else { + buffer.append_code_point(code_point); + } + break; + case State::Host: + case State::Hostname: + if (code_point == ':' && !inside_brackets) { + if (buffer.is_empty()) { + report_validation_error(); + return {}; + } + auto host = parse_host(buffer.to_string(), !url.is_special()); + if (!host.has_value()) + return {}; + url.m_host = host.release_value(); + buffer.clear(); + state = State::Port; + } else if (code_point == 0 || code_point == '/' || code_point == '?' || code_point == '#' || (url.is_special() && code_point == '\\')) { + if (url.is_special() && buffer.is_empty()) { + report_validation_error(); + return {}; + } + auto host = parse_host(buffer.to_string(), !url.is_special()); + if (!host.has_value()) + return {}; + url.m_host = host.value(); + buffer.clear(); + state = State::Port; + continue; + } else if (code_point == '[') { + inside_brackets = true; + } else if (code_point == ']') { + inside_brackets = false; + } else { + buffer.append_code_point(code_point); + } + break; + case State::Port: + if (is_ascii_digit(code_point)) { + buffer.append_code_point(code_point); + } else if (code_point == 0 || code_point == '/' || code_point == '?' || code_point == '#' || (url.is_special() && code_point == '\\')) { + if (!buffer.is_empty()) { + auto port = buffer.to_string().to_uint(); + if (!port.has_value() || port.value() > 65535) { + report_validation_error(); + return {}; + } + if (port.value() == URL::default_port_for_scheme(url.scheme())) + url.m_port = 0; + else + url.m_port = port.value(); + buffer.clear(); + } + state = State::PathStart; + continue; + } else { + report_validation_error(); + return {}; + } + break; + case State::File: + url.m_scheme = "file"; + url.m_host = ""; + if (code_point == '/' || code_point == '\\') { + if (code_point == '\\') + report_validation_error(); + state = State::FileSlash; + } else if (base_url && base_url->m_scheme == "file") { + url.m_host = base_url->m_host; + url.m_paths = base_url->m_paths; + url.m_query = base_url->m_query; + if (code_point == '?') { + url.m_query = ""; + state = State::Query; + } else if (code_point == '#') { + url.m_fragment = ""; + state = State::Fragment; + } else if (code_point != 0) { + url.m_query = {}; + auto substring_from_pointer = input.substring_view(iterator - input.begin()).as_string(); + if (!starts_with_windows_drive_letter(substring_from_pointer)) { + if (!url.paths().is_empty() && !(url.scheme() == "file" && url.paths().size() == 1 && is_normalized_windows_drive_letter(url.paths()[0]))) + url.m_paths.remove(url.m_paths.size() - 1); + } else { + report_validation_error(); + url.m_paths.clear(); + } + state = State::Path; + continue; + } + } + break; + case State::FileSlash: + if (code_point == '/' || code_point == '\\') { + if (code_point == '\\') + report_validation_error(); + state = State::FileHost; + } else if (base_url && base_url->m_scheme == "file") { + url.m_host = base_url->m_host; + auto substring_from_pointer = input.substring_view(iterator - input.begin()).as_string(); + if (!starts_with_windows_drive_letter(substring_from_pointer) && is_normalized_windows_drive_letter(base_url->m_paths[0])) + url.append_path(base_url->m_paths[0]); + state = State::Path; + continue; + } + break; + case State::FileHost: + if (code_point == 0 || code_point == '/' || code_point == '\\' || code_point == '?' || code_point == '#') { + if (is_windows_drive_letter(buffer.to_string())) { + report_validation_error(); + state = State::Path; + } else if (buffer.is_empty()) { + url.m_host = ""; + state = State::PathStart; + } else { + auto host = parse_host(buffer.to_string(), true); + if (!host.has_value()) + return {}; + if (host.value() == "localhost") + host = ""; + url.m_host = host.release_value(); + buffer.clear(); + state = State::PathStart; + } + continue; + } else { + buffer.append_code_point(code_point); + } + break; + case State::PathStart: + if (url.is_special()) { + if (code_point == '\\') + report_validation_error(); + state = State::Path; + if (code_point != '/' && code_point != '\\') + continue; + } else if (code_point == '?') { + url.m_query = ""; + state = State::Query; + } else if (code_point == '#') { + url.m_fragment = ""; + state = State::Fragment; + } else if (code_point != 0) { + state = State::Path; + if (code_point != '/') + continue; + } + break; + case State::Path: + if (code_point == 0 || code_point == '/' || (url.is_special() && code_point == '\\') || code_point == '?' || code_point == '#') { + if (url.is_special() && code_point == '\\') + report_validation_error(); + if (is_double_dot_path_segment(buffer.to_string())) { + if (!url.m_paths.is_empty() && !(url.m_scheme == "file" && url.m_paths.size() == 1 && is_normalized_windows_drive_letter(url.m_paths[0]))) + url.m_paths.remove(url.m_paths.size() - 1); + if (code_point != '/' && !(url.is_special() && code_point == '\\')) + url.append_path(""); + } else if (is_single_dot_path_segment(buffer.to_string()) && code_point != '/' && !(url.is_special() && code_point == '\\')) { + url.append_path(""); + } else if (!is_single_dot_path_segment(buffer.to_string())) { + if (url.m_scheme == "file" && url.m_paths.is_empty() && is_windows_drive_letter(buffer.to_string())) { + auto drive_letter = buffer.to_string()[0]; + buffer.clear(); + buffer.append(drive_letter); + buffer.append(':'); + } + // NOTE: This needs to be percent decoded since the member variables contain decoded data. + url.append_path(URL::percent_decode(buffer.to_string())); + } + buffer.clear(); + if (code_point == '?') { + url.m_query = ""; + state = State::Query; + } else if (code_point == '#') { + url.m_fragment = ""; + state = State::Fragment; + } + } else { + if (!is_url_code_point(code_point) && code_point != '%') + report_validation_error(); + // FIXME: If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error. + URL::append_percent_encoded_if_necessary(buffer, code_point, URL::PercentEncodeSet::Path); + } + break; + case State::CannotBeABaseUrlPath: + // NOTE: This does not follow the spec exactly but rather uses the buffer and only sets the path on EOF. + // NOTE: Verify that the assumptions required for this simplification are correct. + VERIFY(url.m_paths.size() == 1 && url.m_paths[0].is_empty()); + if (code_point == '?') { + // NOTE: This needs to be percent decoded since the member variables contain decoded data. + url.m_paths[0] = URL::percent_decode(buffer.to_string()); + url.m_query = ""; + state = State::Query; + } else if (code_point == '#') { + // NOTE: This needs to be percent decoded since the member variables contain decoded data. + url.m_paths[0] = URL::percent_decode(buffer.to_string()); + url.m_fragment = ""; + state = State::Fragment; + } else { + if (code_point != 0 && !is_url_code_point(code_point) && code_point != '%') + report_validation_error(); + // FIXME: If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error. + if (code_point != 0) { + URL::append_percent_encoded_if_necessary(buffer, code_point, URL::PercentEncodeSet::C0Control); + } else { + // NOTE: This needs to be percent decoded since the member variables contain decoded data. + url.m_paths[0] = URL::percent_decode(buffer.to_string()); + } + } + break; + case State::Query: + if (code_point == '#' || code_point == 0) { + VERIFY(url.m_query == ""); + auto query_percent_encode_set = url.is_special() ? URL::PercentEncodeSet::SpecialQuery : URL::PercentEncodeSet::Query; + // NOTE: This is has to be encoded and then decoded because the original sequence could contain already percent-encoded sequences. + url.m_query = URL::percent_decode(URL::percent_encode(buffer.to_string(), query_percent_encode_set)); + buffer.clear(); + if (code_point == '#') { + url.m_fragment = ""; + state = State::Fragment; + } + } else if (code_point != 0) { + if (!is_url_code_point(code_point) && code_point != '%') + report_validation_error(); + // FIXME: If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error. + buffer.append_code_point(code_point); + } + break; + case State::Fragment: + // NOTE: This does not follow the spec exactly but rather uses the buffer and only sets the fragment on EOF. + if (code_point) { + if (!is_url_code_point(code_point) && code_point != '%') + report_validation_error(); + // FIXME: If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error. + buffer.append_code_point(code_point); + } else { + // NOTE: This needs to be percent decoded since the member variables contain decoded data. + url.m_fragment = URL::percent_decode(buffer.to_string()); + buffer.clear(); + } + break; + default: + VERIFY_NOT_REACHED(); + } + + if (iterator.done()) + break; + ++iterator; + } + + url.m_valid = true; + dbgln_if(URL_PARSER_DEBUG, "URLParser::parse: Parsed URL to be '{}'.", url.serialize()); + return url; +} + +} |