summary | refs | log | tree | commit | diff
path: root/Userland
diff options
context:
space:
mode:
Diffstat (limited to 'Userland')
-rw-r--r-- Userland/Libraries/LibRegex/RegexByteCode.cpp | 6
-rw-r--r-- Userland/Libraries/LibRegex/RegexMatch.h | 102
-rw-r--r-- Userland/Libraries/LibRegex/RegexMatcher.cpp | 4
-rw-r--r-- Userland/Libraries/LibRegex/RegexParser.cpp | 31
4 files changed, 123 insertions, 20 deletions
diff --git a/Userland/Libraries/LibRegex/RegexByteCode.cpp b/Userland/Libraries/LibRegex/RegexByteCode.cpp
index 677ad0cb0d..f5869acbdb 100644
--- a/Userland/Libraries/LibRegex/RegexByteCode.cpp
+++ b/Userland/Libraries/LibRegex/RegexByteCode.cpp
@@ -465,12 +465,13 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
return ExecutionResult::Failed_ExecuteLowPrioForks;
Optional<String> str;
+ Vector<u16> utf16;
Vector<u32> data;
data.ensure_capacity(length);
for (size_t i = offset; i < offset + length; ++i)
data.unchecked_append(m_bytecode->at(i));
- auto view = input.view.construct_as_same(data, str);
+ auto view = input.view.construct_as_same(data, str, utf16);
offset += length;
if (!compare_string(input, state, view, had_zero_length_match))
return ExecutionResult::Failed_ExecuteLowPrioForks;
@@ -553,7 +554,8 @@ ALWAYS_INLINE void OpCode_Compare::compare_char(MatchInput const& input, MatchSt
auto input_view = input.view.substring_view(state.string_position, 1);
Optional<String> str;
- auto compare_view = input_view.construct_as_same({ &ch1, 1 }, str);
+ Vector<u16> utf16;
+ auto compare_view = input_view.construct_as_same({ &ch1, 1 }, str, utf16);
bool equal;
if (input.regex_options & AllFlags::Insensitive)
equal = input_view.equals_ignoring_case(compare_view);
diff --git a/Userland/Libraries/LibRegex/RegexMatch.h b/Userland/Libraries/LibRegex/RegexMatch.h
index b58dc5e132..6bc58ad78f 100644
--- a/Userland/Libraries/LibRegex/RegexMatch.h
+++ b/Userland/Libraries/LibRegex/RegexMatch.h
@@ -14,6 +14,7 @@
#include <AK/String.h>
#include <AK/StringBuilder.h>
#include <AK/StringView.h>
+#include <AK/Utf16View.h>
#include <AK/Utf32View.h>
#include <AK/Utf8View.h>
#include <AK/Variant.h>
@@ -43,6 +44,11 @@ public:
{
}
+ RegexStringView(Utf16View view)
+ : m_view(view)
+ {
+ }
+
RegexStringView(Utf8View view)
: m_view(view)
{
@@ -58,11 +64,19 @@ public:
return m_view.get<Utf32View>();
}
+ Utf16View const& u16_view() const
+ {
+ return m_view.get<Utf16View>();
+ }
+
Utf8View const& u8_view() const
{
return m_view.get<Utf8View>();
}
+ bool unicode() const { return m_unicode; }
+ void set_unicode(bool unicode) { m_unicode = unicode; }
+
bool is_empty() const
{
return m_view.visit([](auto& view) { return view.is_empty(); });
@@ -75,12 +89,21 @@ public:
size_t length() const
{
- return m_view.visit([](auto& view) { return view.length(); });
+ if (unicode()) {
+ return m_view.visit(
+ [](Utf16View const& view) { return view.length_in_code_points(); },
+ [](auto const& view) { return view.length(); });
+ }
+
+ return m_view.visit(
+ [](Utf16View const& view) { return view.length_in_code_units(); },
+ [](Utf8View const& view) { return view.byte_length(); },
+ [](auto const& view) { return view.length(); });
}
- RegexStringView construct_as_same(Span<u32> data, Optional<String>& optional_string_storage) const
+ RegexStringView construct_as_same(Span<u32> data, Optional<String>& optional_string_storage, Vector<u16>& optional_utf16_storage) const
{
- return m_view.visit(
+ auto view = m_view.visit(
[&]<typename T>(T const&) {
StringBuilder builder;
for (auto ch : data)
@@ -90,7 +113,14 @@ public:
},
[&](Utf32View) {
return RegexStringView { Utf32View { data.data(), data.size() } };
+ },
+ [&](Utf16View) {
+ optional_utf16_storage = AK::utf32_to_utf16(Utf32View { data.data(), data.size() });
+ return RegexStringView { Utf16View { optional_utf16_storage } };
});
+
+ view.set_unicode(unicode());
+ return view;
}
Vector<RegexStringView> lines() const
@@ -118,6 +148,21 @@ public:
views.empend(view);
return views;
},
+ [](Utf16View view) {
+ Vector<RegexStringView> views;
+ u16 newline = '\n';
+ while (!view.is_empty()) {
+ auto position = AK::memmem_optional(view.data(), view.length_in_code_units() * sizeof(u16), &newline, sizeof(u16));
+ if (!position.has_value())
+ break;
+ auto offset = position.value() / sizeof(u16);
+ views.empend(view.substring_view(0, offset));
+ view = view.substring_view(offset + 1, view.length_in_code_units() - offset - 1);
+ }
+ if (!view.is_empty())
+ views.empend(view);
+ return views;
+ },
[](Utf8View& view) {
Vector<RegexStringView> views;
auto it = view.begin();
@@ -147,15 +192,26 @@ public:
RegexStringView substring_view(size_t offset, size_t length) const
{
- return m_view.visit(
- [&](auto view) { return RegexStringView { view.substring_view(offset, length) }; },
- [&](Utf8View const& view) { return RegexStringView { view.unicode_substring_view(offset, length) }; });
+ if (unicode()) {
+ auto view = m_view.visit(
+ [&](auto view) { return RegexStringView { view.substring_view(offset, length) }; },
+ [&](Utf16View const& view) { return RegexStringView { view.unicode_substring_view(offset, length) }; },
+ [&](Utf8View const& view) { return RegexStringView { view.unicode_substring_view(offset, length) }; });
+
+ view.set_unicode(unicode());
+ return view;
+ }
+
+ auto view = m_view.visit([&](auto view) { return RegexStringView { view.substring_view(offset, length) }; });
+ view.set_unicode(unicode());
+ return view;
}
String to_string() const
{
return m_view.visit(
[](StringView view) { return view.to_string(); },
+ [](Utf16View view) { return view.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes); },
[](auto& view) {
StringBuilder builder;
for (auto it = view.begin(); it != view.end(); ++it)
@@ -173,8 +229,8 @@ public:
return 256u + ch;
return ch;
},
- [&](auto view) -> u32 { return view[index]; },
- [&](Utf8View& view) -> u32 {
+ [&](Utf32View& view) -> u32 { return view[index]; },
+ [&](auto& view) -> u32 {
size_t i = index;
for (auto it = view.begin(); it != view.end(); ++it, --i) {
if (i == 0)
@@ -188,6 +244,7 @@ public:
{
return m_view.visit(
[&](Utf32View) { return to_string() == cstring; },
+ [&](Utf16View) { return to_string() == cstring; },
[&](Utf8View const& view) { return view.as_string() == cstring; },
[&](StringView view) { return view == cstring; });
}
@@ -201,6 +258,7 @@ public:
{
return m_view.visit(
[&](Utf32View) { return to_string() == string; },
+ [&](Utf16View) { return to_string() == string; },
[&](Utf8View const& view) { return view.as_string() == string; },
[&](StringView view) { return view == string; });
}
@@ -209,6 +267,7 @@ public:
{
return m_view.visit(
[&](Utf32View) { return to_string() == string; },
+ [&](Utf16View) { return to_string() == string; },
[&](Utf8View const& view) { return view.as_string() == string; },
[&](StringView view) { return view == string; });
}
@@ -224,6 +283,7 @@ public:
[&](Utf32View view) {
return view.length() == other.length() && __builtin_memcmp(view.code_points(), other.code_points(), view.length() * sizeof(u32)) == 0;
},
+ [&](Utf16View) { return to_string() == RegexStringView { other }.to_string(); },
[&](Utf8View const& view) { return view.as_string() == RegexStringView { other }.to_string(); },
[&](StringView view) { return view == RegexStringView { other }.to_string(); });
}
@@ -233,12 +293,25 @@ public:
return !(*this == other);
}
+ bool operator==(Utf16View const& other) const
+ {
+ return m_view.visit(
+ [&](Utf32View) { return to_string() == RegexStringView { other }.to_string(); },
+ [&](Utf16View const& view) { return view == other; },
+ [&](Utf8View const& view) { return view.as_string() == RegexStringView { other }.to_string(); },
+ [&](StringView view) { return view == RegexStringView { other }.to_string(); });
+ }
+
+ bool operator!=(Utf16View const& other) const
+ {
+ return !(*this == other);
+ }
+
bool operator==(Utf8View const& other) const
{
return m_view.visit(
- [&](Utf32View) {
- return to_string() == other.as_string();
- },
+ [&](Utf32View) { return to_string() == other.as_string(); },
+ [&](Utf16View) { return to_string() == other.as_string(); },
[&](Utf8View const& view) { return view.as_string() == other.as_string(); },
[&](StringView view) { return other.as_string() == view; });
}
@@ -271,6 +344,9 @@ public:
[&](Utf32View) -> bool {
TODO();
},
+ [&](Utf16View) -> bool {
+ TODO();
+ },
[&](Utf8View const& view) { return view.as_string().starts_with(str); },
[&](StringView view) { return view.starts_with(str); });
}
@@ -289,6 +365,7 @@ public:
}
return true;
},
+ [&](Utf16View) -> bool { TODO(); },
[&](Utf8View const& view) {
auto it = view.begin();
for (auto code_point : str) {
@@ -304,7 +381,8 @@ public:
}
private:
- Variant<StringView, Utf8View, Utf32View> m_view;
+ Variant<StringView, Utf8View, Utf16View, Utf32View> m_view;
+ bool m_unicode { false };
};
class Match final {
diff --git a/Userland/Libraries/LibRegex/RegexMatcher.cpp b/Userland/Libraries/LibRegex/RegexMatcher.cpp
index 60783b25f6..f4a848741a 100644
--- a/Userland/Libraries/LibRegex/RegexMatcher.cpp
+++ b/Userland/Libraries/LibRegex/RegexMatcher.cpp
@@ -84,6 +84,10 @@ RegexResult Matcher<Parser>::match(Vector<RegexStringView> const views, Optional
output.operations = 0;
size_t lines_to_skip = 0;
+ bool unicode = input.regex_options.has_flag_set(AllFlags::Unicode);
+ for (auto& view : views)
+ const_cast<RegexStringView&>(view).set_unicode(unicode);
+
if (input.regex_options.has_flag_set(AllFlags::Internal_Stateful)) {
if (views.size() > 1 && input.start_offset > views.first().length()) {
dbgln_if(REGEX_DEBUG, "Started with start={}, goff={}, skip={}", input.start_offset, input.global_offset, lines_to_skip);
diff --git a/Userland/Libraries/LibRegex/RegexParser.cpp b/Userland/Libraries/LibRegex/RegexParser.cpp
index 12c62fef78..07885173b6 100644
--- a/Userland/Libraries/LibRegex/RegexParser.cpp
+++ b/Userland/Libraries/LibRegex/RegexParser.cpp
@@ -10,6 +10,7 @@
#include <AK/String.h>
#include <AK/StringBuilder.h>
#include <AK/StringUtils.h>
+#include <AK/Utf16View.h>
namespace regex {
@@ -1440,13 +1441,31 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini
if (try_skip("u")) {
if (auto code_point = read_digits(ReadDigitsInitialZeroState::Allow, true, 4); code_point.has_value()) {
- // FIXME: The minimum length depends on the mode - should be utf8-length in u8 mode.
+ // In Unicode mode, we need to combine surrogate pairs into a single code point. But we also need to be
+ // rather forgiving if the surrogate pairs are invalid. So if a second code unit follows this code unit,
+ // but doesn't form a valid surrogate pair, insert bytecode for both code units individually.
+ Optional<u32> low_surrogate;
+ if (unicode && Utf16View::is_high_surrogate(*code_point) && try_skip("\\u")) {
+ low_surrogate = read_digits(ReadDigitsInitialZeroState::Allow, true, 4);
+ if (!low_surrogate.has_value()) {
+ set_error(Error::InvalidPattern);
+ return false;
+ }
+
+ if (Utf16View::is_low_surrogate(*low_surrogate)) {
+ *code_point = Utf16View::decode_surrogate_pair(*code_point, *low_surrogate);
+ low_surrogate.clear();
+ }
+ }
+
match_length_minimum += 1;
- StringBuilder builder;
- builder.append_code_point(code_point.value());
- // FIXME: This isn't actually correct for ECMAScript.
- auto u8_encoded = builder.string_view();
- stack.insert_bytecode_compare_string(u8_encoded);
+ stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)code_point.value() } });
+
+ if (low_surrogate.has_value()) {
+ match_length_minimum += 1;
+ stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)low_surrogate.value() } });
+ }
+
return true;
} else if (!unicode) {
// '\u' is allowed in non-unicode mode, just matches 'u'.