AK: Stop using U+0000 as end of file code point in URL parser

This changes the URL parser to use the constant 0xFFFFFFFF instead of 0 (U+0000) to indicate end of file. This fixes a bug where an input containing null bytes would terminate parsing early, because each null byte was interpreted as end of file.
author: Max Wipfli <mail@maxwipfli.ch> 2021-06-03 12:43:08 +0200
committer: Andreas Kling <kling@serenityos.org> 2021-06-05 10:53:31 +0200
commit: 2164d8aae8ad5b7339cc88f288993fef06ad89f6 (patch)
tree: 3d6b1ed487fc238de627c49861f93ea1f5a8ab7e /AK
parent: 97425c7dfb8fd7728617601d1bff3203f409a7e8 (diff)
download: serenity-2164d8aae8ad5b7339cc88f288993fef06ad89f6.zip
1 files changed, 18 insertions, 16 deletions
diff --git a/AK/URLParser.cpp b/AK/URLParser.cpp
index 880ef04a50..847a8bf002 100644
--- a/AK/URLParser.cpp
+++ b/AK/URLParser.cpp
@@ -16,6 +16,9 @@
 
 namespace AK {
 
+// NOTE: This is similar to the LibC macro EOF = -1.
+constexpr u32 end_of_file = 0xFFFFFFFF;
+
 constexpr bool is_url_code_point(u32 code_point)
 {
     // FIXME: [...] and code points in the range U+00A0 to U+10FFFD, inclusive, excluding surrogates and noncharacters.
@@ -221,14 +224,13 @@ URL URLParser::parse(Badge<URL>, StringView const& raw_input, URL const* base_ur
     // NOTE: "continue" should only be used to prevent incrementing the iterator, as this is done at the end of the loop.
     //       ++iterator : "increase pointer by 1"
     //       continue   : "decrease pointer by 1"
-    // NOTE: The NULL code point is used as the "EOF code point".
     for (;;) {
-        u32 code_point = 0;
+        u32 code_point = end_of_file;
         if (!iterator.done())
             code_point = *iterator;
 
         if constexpr (URL_PARSER_DEBUG) {
-            if (!code_point)
+            if (code_point == end_of_file)
                 dbgln("URLParser::parse: {} state with EOF.", state_name(state));
             else if (is_ascii_printable(code_point))
                 dbgln("URLParser::parse: {} state with code point U+{:04X} ({:c}).", state_name(state), code_point, code_point);
@@ -335,7 +337,7 @@ URL URLParser::parse(Badge<URL>, StringView const& raw_input, URL const* base_ur
                 } else if (code_point == '#') {
                     url.m_fragment = "";
                     state = State::Fragment;
-                } else if (code_point != 0) {
+                } else if (code_point != end_of_file) {
                     url.m_query = {};
                     if (url.m_paths.size())
                         url.m_paths.remove(url.m_paths.size() - 1);
@@ -408,7 +410,7 @@ URL URLParser::parse(Badge<URL>, StringView const& raw_input, URL const* base_ur
                     }
                 }
                 buffer.clear();
-            } else if (code_point == 0 || code_point == '/' || code_point == '?' || code_point == '#' || (url.is_special() && code_point == '\\')) {
+            } else if (code_point == end_of_file || code_point == '/' || code_point == '?' || code_point == '#' || (url.is_special() && code_point == '\\')) {
                 if (at_sign_seen && buffer.is_empty()) {
                     report_validation_error();
                     return {};
@@ -434,7 +436,7 @@ URL URLParser::parse(Badge<URL>, StringView const& raw_input, URL const* base_ur
                 url.m_host = host.release_value();
                 buffer.clear();
                 state = State::Port;
-            } else if (code_point == 0 || code_point == '/' || code_point == '?' || code_point == '#' || (url.is_special() && code_point == '\\')) {
+            } else if (code_point == end_of_file || code_point == '/' || code_point == '?' || code_point == '#' || (url.is_special() && code_point == '\\')) {
                 if (url.is_special() && buffer.is_empty()) {
                     report_validation_error();
                     return {};
@@ -457,7 +459,7 @@ URL URLParser::parse(Badge<URL>, StringView const& raw_input, URL const* base_ur
         case State::Port:
             if (is_ascii_digit(code_point)) {
                 buffer.append_code_point(code_point);
-            } else if (code_point == 0 || code_point == '/' || code_point == '?' || code_point == '#' || (url.is_special() && code_point == '\\')) {
+            } else if (code_point == end_of_file || code_point == '/' || code_point == '?' || code_point == '#' || (url.is_special() && code_point == '\\')) {
                 if (!buffer.is_empty()) {
                     auto port = buffer.to_string().to_uint();
                     if (!port.has_value() || port.value() > 65535) {
@@ -494,7 +496,7 @@ URL URLParser::parse(Badge<URL>, StringView const& raw_input, URL const* base_ur
                 } else if (code_point == '#') {
                     url.m_fragment = "";
                     state = State::Fragment;
-                } else if (code_point != 0) {
+                } else if (code_point != end_of_file) {
                     url.m_query = {};
                     auto substring_from_pointer = input.substring_view(iterator - input.begin()).as_string();
                     if (!starts_with_windows_drive_letter(substring_from_pointer)) {
@@ -524,7 +526,7 @@ URL URLParser::parse(Badge<URL>, StringView const& raw_input, URL const* base_ur
             }
             break;
         case State::FileHost:
-            if (code_point == 0 || code_point == '/' || code_point == '\\' || code_point == '?' || code_point == '#') {
+            if (code_point == end_of_file || code_point == '/' || code_point == '\\' || code_point == '?' || code_point == '#') {
                 if (is_windows_drive_letter(buffer.to_string())) {
                     report_validation_error();
                     state = State::Path;
@@ -559,14 +561,14 @@ URL URLParser::parse(Badge<URL>, StringView const& raw_input, URL const* base_ur
             } else if (code_point == '#') {
                 url.m_fragment = "";
                 state = State::Fragment;
-            } else if (code_point != 0) {
+            } else if (code_point != end_of_file) {
                 state = State::Path;
                 if (code_point != '/')
                     continue;
             }
             break;
         case State::Path:
-            if (code_point == 0 || code_point == '/' || (url.is_special() && code_point == '\\') || code_point == '?' || code_point == '#') {
+            if (code_point == end_of_file || code_point == '/' || (url.is_special() && code_point == '\\') || code_point == '?' || code_point == '#') {
                 if (url.is_special() && code_point == '\\')
                     report_validation_error();
                 if (is_double_dot_path_segment(buffer.to_string())) {
@@ -616,10 +618,10 @@ URL URLParser::parse(Badge<URL>, StringView const& raw_input, URL const* base_ur
                 url.m_fragment = "";
                 state = State::Fragment;
             } else {
-                if (code_point != 0 && !is_url_code_point(code_point) && code_point != '%')
+                if (code_point != end_of_file && !is_url_code_point(code_point) && code_point != '%')
                     report_validation_error();
                 // FIXME: If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error.
-                if (code_point != 0) {
+                if (code_point != end_of_file) {
                     URL::append_percent_encoded_if_necessary(buffer, code_point, URL::PercentEncodeSet::C0Control);
                 } else {
                     // NOTE: This needs to be percent decoded since the member variables contain decoded data.
@@ -628,7 +630,7 @@ URL URLParser::parse(Badge<URL>, StringView const& raw_input, URL const* base_ur
             }
             break;
         case State::Query:
-            if (code_point == '#' || code_point == 0) {
+            if (code_point == end_of_file || code_point == '#') {
                 VERIFY(url.m_query == "");
                 auto query_percent_encode_set = url.is_special() ? URL::PercentEncodeSet::SpecialQuery : URL::PercentEncodeSet::Query;
                 // NOTE: This is has to be encoded and then decoded because the original sequence could contain already percent-encoded sequences.
@@ -638,7 +640,7 @@ URL URLParser::parse(Badge<URL>, StringView const& raw_input, URL const* base_ur
                     url.m_fragment = "";
                     state = State::Fragment;
                 }
-            } else if (code_point != 0) {
+            } else if (code_point != end_of_file) {
                 if (!is_url_code_point(code_point) && code_point != '%')
                     report_validation_error();
                 // FIXME: If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error.
@@ -647,7 +649,7 @@ URL URLParser::parse(Badge<URL>, StringView const& raw_input, URL const* base_ur
             break;
         case State::Fragment:
             // NOTE: This does not follow the spec exactly but rather uses the buffer and only sets the fragment on EOF.
-            if (code_point) {
+            if (code_point != end_of_file) {
                 if (!is_url_code_point(code_point) && code_point != '%')
                     report_validation_error();
                 // FIXME: If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error.
author	Max Wipfli <mail@maxwipfli.ch>	2021-06-03 12:43:08 +0200
committer	Andreas Kling <kling@serenityos.org>	2021-06-05 10:53:31 +0200
commit	2164d8aae8ad5b7339cc88f288993fef06ad89f6 (patch)
tree	3d6b1ed487fc238de627c49861f93ea1f5a8ab7e /AK
parent	97425c7dfb8fd7728617601d1bff3203f409a7e8 (diff)
download	serenity-2164d8aae8ad5b7339cc88f288993fef06ad89f6.zip