diff options
Diffstat (limited to 'Userland')
-rw-r--r-- | Userland/Libraries/LibRegex/RegexByteCode.cpp | 6 | ||||
-rw-r--r-- | Userland/Libraries/LibRegex/RegexMatch.h | 102 | ||||
-rw-r--r-- | Userland/Libraries/LibRegex/RegexMatcher.cpp | 4 | ||||
-rw-r--r-- | Userland/Libraries/LibRegex/RegexParser.cpp | 31 |
4 files changed, 123 insertions, 20 deletions
diff --git a/Userland/Libraries/LibRegex/RegexByteCode.cpp b/Userland/Libraries/LibRegex/RegexByteCode.cpp index 677ad0cb0d..f5869acbdb 100644 --- a/Userland/Libraries/LibRegex/RegexByteCode.cpp +++ b/Userland/Libraries/LibRegex/RegexByteCode.cpp @@ -465,12 +465,13 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M return ExecutionResult::Failed_ExecuteLowPrioForks; Optional<String> str; + Vector<u16> utf16; Vector<u32> data; data.ensure_capacity(length); for (size_t i = offset; i < offset + length; ++i) data.unchecked_append(m_bytecode->at(i)); - auto view = input.view.construct_as_same(data, str); + auto view = input.view.construct_as_same(data, str, utf16); offset += length; if (!compare_string(input, state, view, had_zero_length_match)) return ExecutionResult::Failed_ExecuteLowPrioForks; @@ -553,7 +554,8 @@ ALWAYS_INLINE void OpCode_Compare::compare_char(MatchInput const& input, MatchSt auto input_view = input.view.substring_view(state.string_position, 1); Optional<String> str; - auto compare_view = input_view.construct_as_same({ &ch1, 1 }, str); + Vector<u16> utf16; + auto compare_view = input_view.construct_as_same({ &ch1, 1 }, str, utf16); bool equal; if (input.regex_options & AllFlags::Insensitive) equal = input_view.equals_ignoring_case(compare_view); diff --git a/Userland/Libraries/LibRegex/RegexMatch.h b/Userland/Libraries/LibRegex/RegexMatch.h index b58dc5e132..6bc58ad78f 100644 --- a/Userland/Libraries/LibRegex/RegexMatch.h +++ b/Userland/Libraries/LibRegex/RegexMatch.h @@ -14,6 +14,7 @@ #include <AK/String.h> #include <AK/StringBuilder.h> #include <AK/StringView.h> +#include <AK/Utf16View.h> #include <AK/Utf32View.h> #include <AK/Utf8View.h> #include <AK/Variant.h> @@ -43,6 +44,11 @@ public: { } + RegexStringView(Utf16View view) + : m_view(view) + { + } + RegexStringView(Utf8View view) : m_view(view) { @@ -58,11 +64,19 @@ public: return m_view.get<Utf32View>(); } + Utf16View const& u16_view() const + { + return m_view.get<Utf16View>(); + } + Utf8View const& u8_view() const { return m_view.get<Utf8View>(); } + bool unicode() const { return m_unicode; } + void set_unicode(bool unicode) { m_unicode = unicode; } + bool is_empty() const { return m_view.visit([](auto& view) { return view.is_empty(); }); @@ -75,12 +89,21 @@ public: size_t length() const { - return m_view.visit([](auto& view) { return view.length(); }); + if (unicode()) { + return m_view.visit( + [](Utf16View const& view) { return view.length_in_code_points(); }, + [](auto const& view) { return view.length(); }); + } + + return m_view.visit( + [](Utf16View const& view) { return view.length_in_code_units(); }, + [](Utf8View const& view) { return view.byte_length(); }, + [](auto const& view) { return view.length(); }); } - RegexStringView construct_as_same(Span<u32> data, Optional<String>& optional_string_storage) const + RegexStringView construct_as_same(Span<u32> data, Optional<String>& optional_string_storage, Vector<u16>& optional_utf16_storage) const { - return m_view.visit( + auto view = m_view.visit( [&]<typename T>(T const&) { StringBuilder builder; for (auto ch : data) @@ -90,7 +113,14 @@ public: }, [&](Utf32View) { return RegexStringView { Utf32View { data.data(), data.size() } }; + }, + [&](Utf16View) { + optional_utf16_storage = AK::utf32_to_utf16(Utf32View { data.data(), data.size() }); + return RegexStringView { Utf16View { optional_utf16_storage } }; }); + + view.set_unicode(unicode()); + return view; } Vector<RegexStringView> lines() const @@ -118,6 +148,21 @@ public: views.empend(view); return views; }, + [](Utf16View view) { + Vector<RegexStringView> views; + u16 newline = '\n'; + while (!view.is_empty()) { + auto position = AK::memmem_optional(view.data(), view.length_in_code_units() * sizeof(u16), &newline, sizeof(u16)); + if (!position.has_value()) + break; + auto offset = position.value() / sizeof(u16); + views.empend(view.substring_view(0, offset)); + view = view.substring_view(offset + 1, view.length_in_code_units() - offset - 1); + } + if (!view.is_empty()) + views.empend(view); + return views; + }, [](Utf8View& view) { Vector<RegexStringView> views; auto it = view.begin(); @@ -147,15 +192,26 @@ public: RegexStringView substring_view(size_t offset, size_t length) const { - return m_view.visit( - [&](auto view) { return RegexStringView { view.substring_view(offset, length) }; }, - [&](Utf8View const& view) { return RegexStringView { view.unicode_substring_view(offset, length) }; }); + if (unicode()) { + auto view = m_view.visit( + [&](auto view) { return RegexStringView { view.substring_view(offset, length) }; }, + [&](Utf16View const& view) { return RegexStringView { view.unicode_substring_view(offset, length) }; }, + [&](Utf8View const& view) { return RegexStringView { view.unicode_substring_view(offset, length) }; }); + + view.set_unicode(unicode()); + return view; + } + + auto view = m_view.visit([&](auto view) { return RegexStringView { view.substring_view(offset, length) }; }); + view.set_unicode(unicode()); + return view; } String to_string() const { return m_view.visit( [](StringView view) { return view.to_string(); }, + [](Utf16View view) { return view.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes); }, [](auto& view) { StringBuilder builder; for (auto it = view.begin(); it != view.end(); ++it) @@ -173,8 +229,8 @@ public: return 256u + ch; return ch; }, - [&](auto view) -> u32 { return view[index]; }, - [&](Utf8View& view) -> u32 { + [&](Utf32View& view) -> u32 { return view[index]; }, + [&](auto& view) -> u32 { size_t i = index; for (auto it = view.begin(); it != view.end(); ++it, --i) { if (i == 0) @@ -188,6 +244,7 @@ public: { return m_view.visit( [&](Utf32View) { return to_string() == cstring; }, + [&](Utf16View) { return to_string() == cstring; }, [&](Utf8View const& view) { return view.as_string() == cstring; }, [&](StringView view) { return view == cstring; }); } @@ -201,6 +258,7 @@ public: { return m_view.visit( [&](Utf32View) { return to_string() == string; }, + [&](Utf16View) { return to_string() == string; }, [&](Utf8View const& view) { return view.as_string() == string; }, [&](StringView view) { return view == string; }); } @@ -209,6 +267,7 @@ public: { return m_view.visit( [&](Utf32View) { return to_string() == string; }, + [&](Utf16View) { return to_string() == string; }, [&](Utf8View const& view) { return view.as_string() == string; }, [&](StringView view) { return view == string; }); } @@ -224,6 +283,7 @@ public: [&](Utf32View view) { return view.length() == other.length() && __builtin_memcmp(view.code_points(), other.code_points(), view.length() * sizeof(u32)) == 0; }, + [&](Utf16View) { return to_string() == RegexStringView { other }.to_string(); }, [&](Utf8View const& view) { return view.as_string() == RegexStringView { other }.to_string(); }, [&](StringView view) { return view == RegexStringView { other }.to_string(); }); } @@ -233,12 +293,25 @@ public: return !(*this == other); } + bool operator==(Utf16View const& other) const + { + return m_view.visit( + [&](Utf32View) { return to_string() == RegexStringView { other }.to_string(); }, + [&](Utf16View const& view) { return view == other; }, + [&](Utf8View const& view) { return view.as_string() == RegexStringView { other }.to_string(); }, + [&](StringView view) { return view == RegexStringView { other }.to_string(); }); + } + + bool operator!=(Utf16View const& other) const + { + return !(*this == other); + } + bool operator==(Utf8View const& other) const { return m_view.visit( - [&](Utf32View) { - return to_string() == other.as_string(); - }, + [&](Utf32View) { return to_string() == other.as_string(); }, + [&](Utf16View) { return to_string() == other.as_string(); }, [&](Utf8View const& view) { return view.as_string() == other.as_string(); }, [&](StringView view) { return other.as_string() == view; }); } @@ -271,6 +344,9 @@ public: [&](Utf32View) -> bool { TODO(); }, + [&](Utf16View) -> bool { + TODO(); + }, [&](Utf8View const& view) { return view.as_string().starts_with(str); }, [&](StringView view) { return view.starts_with(str); }); } @@ -289,6 +365,7 @@ public: } return true; }, + [&](Utf16View) -> bool { TODO(); }, [&](Utf8View const& view) { auto it = view.begin(); for (auto code_point : str) { @@ -304,7 +381,8 @@ public: } private: - Variant<StringView, Utf8View, Utf32View> m_view; + Variant<StringView, Utf8View, Utf16View, Utf32View> m_view; + bool m_unicode { false }; }; class Match final { diff --git a/Userland/Libraries/LibRegex/RegexMatcher.cpp b/Userland/Libraries/LibRegex/RegexMatcher.cpp index 60783b25f6..f4a848741a 100644 --- a/Userland/Libraries/LibRegex/RegexMatcher.cpp +++ b/Userland/Libraries/LibRegex/RegexMatcher.cpp @@ -84,6 +84,10 @@ RegexResult Matcher<Parser>::match(Vector<RegexStringView> const views, Optional output.operations = 0; size_t lines_to_skip = 0; + bool unicode = input.regex_options.has_flag_set(AllFlags::Unicode); + for (auto& view : views) + const_cast<RegexStringView&>(view).set_unicode(unicode); + if (input.regex_options.has_flag_set(AllFlags::Internal_Stateful)) { if (views.size() > 1 && input.start_offset > views.first().length()) { dbgln_if(REGEX_DEBUG, "Started with start={}, goff={}, skip={}", input.start_offset, input.global_offset, lines_to_skip); diff --git a/Userland/Libraries/LibRegex/RegexParser.cpp b/Userland/Libraries/LibRegex/RegexParser.cpp index 12c62fef78..07885173b6 100644 --- a/Userland/Libraries/LibRegex/RegexParser.cpp +++ b/Userland/Libraries/LibRegex/RegexParser.cpp @@ -10,6 +10,7 @@ #include <AK/String.h> #include <AK/StringBuilder.h> #include <AK/StringUtils.h> +#include <AK/Utf16View.h> namespace regex { @@ -1440,13 +1441,31 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini if (try_skip("u")) { if (auto code_point = read_digits(ReadDigitsInitialZeroState::Allow, true, 4); code_point.has_value()) { - // FIXME: The minimum length depends on the mode - should be utf8-length in u8 mode. + // In Unicode mode, we need to combine surrogate pairs into a single code point. But we also need to be + // rather forgiving if the surrogate pairs are invalid. So if a second code unit follows this code unit, + // but doesn't form a valid surrogate pair, insert bytecode for both code units individually. + Optional<u32> low_surrogate; + if (unicode && Utf16View::is_high_surrogate(*code_point) && try_skip("\\u")) { + low_surrogate = read_digits(ReadDigitsInitialZeroState::Allow, true, 4); + if (!low_surrogate.has_value()) { + set_error(Error::InvalidPattern); + return false; + } + + if (Utf16View::is_low_surrogate(*low_surrogate)) { + *code_point = Utf16View::decode_surrogate_pair(*code_point, *low_surrogate); + low_surrogate.clear(); + } + } + match_length_minimum += 1; - StringBuilder builder; - builder.append_code_point(code_point.value()); - // FIXME: This isn't actually correct for ECMAScript. - auto u8_encoded = builder.string_view(); - stack.insert_bytecode_compare_string(u8_encoded); + stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)code_point.value() } }); + + if (low_surrogate.has_value()) { + match_length_minimum += 1; + stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)low_surrogate.value() } }); + } + return true; } else if (!unicode) { // '\u' is allowed in non-unicode mode, just matches 'u'. |