summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: AnotherTest <ali.mpfard@gmail.com> 2020-12-06 17:04:28 +0330
committer: Andreas Kling <kling@serenityos.org> 2020-12-06 15:38:40 +0100
commit: 765d2977bcc6459c7f3745f2c96d908bae265bff (patch)
tree: 7b0088c234805f0a92e0cfe300163642217621af
parent: 86811683b071415c47668781ab5fd7241f85ea55 (diff)
download: serenity-765d2977bcc6459c7f3745f2c96d908bae265bff.zip
LibRegex: Add basic support for unicode escapes in ECMA262Parser
This parses unicode escapes (and matches them only for utf8 strings).
-rw-r--r-- Libraries/LibRegex/RegexParser.cpp 32
-rw-r--r-- Libraries/LibRegex/Tests/Regex.cpp 3
2 files changed, 31 insertions, 4 deletions
diff --git a/Libraries/LibRegex/RegexParser.cpp b/Libraries/LibRegex/RegexParser.cpp
index ca86012f53..6015212123 100644
--- a/Libraries/LibRegex/RegexParser.cpp
+++ b/Libraries/LibRegex/RegexParser.cpp
@@ -1066,8 +1066,24 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini
}
if (try_skip("u")) {
- // FIXME: Implement this path, unicode escape sequence.
- TODO();
+ if (auto code_point = read_digits(ReadDigitsInitialZeroState::Allow, ReadDigitFollowPolicy::Any, true, 4); code_point.has_value()) {
+ // FIXME: The minimum length depends on the mode - should be utf8-length in u8 mode.
+ match_length_minimum += 1;
+ StringBuilder builder;
+ builder.append_code_point(code_point.value());
+ // FIXME: This isn't actually correct for ECMAScript.
+ auto u8_encoded = builder.string_view();
+ stack.insert_bytecode_compare_string(u8_encoded);
+ return true;
+ } else if (!unicode) {
+ // '\u' is allowed in non-unicode mode, just matches 'u'.
+ match_length_minimum += 1;
+ stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'u' } });
+ return true;
+ } else {
+ set_error(Error::InvalidPattern);
+ return false;
+ }
}
// IdentityEscape
@@ -1261,8 +1277,16 @@ bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&
}
if (try_skip("u")) {
- // FIXME: Implement this path, unicode escape sequence.
- TODO();
+ if (auto code_point = read_digits(ReadDigitsInitialZeroState::Allow, ReadDigitFollowPolicy::Any, true, 4); code_point.has_value()) {
+ // FIXME: While codepoint ranges are supported, codepoint matches as "Char" are not!
+ return { { .code_point = code_point.value(), .is_character_class = false } };
+ } else if (!unicode) {
+ // '\u' is allowed in non-unicode mode, just matches 'u'.
+ return { { .code_point = 'u', .is_character_class = false } };
+ } else {
+ set_error(Error::InvalidPattern);
+ return {};
+ }
}
if (unicode) {
diff --git a/Libraries/LibRegex/Tests/Regex.cpp b/Libraries/LibRegex/Tests/Regex.cpp
index c4aa8c8d7e..d9c5f7ce31 100644
--- a/Libraries/LibRegex/Tests/Regex.cpp
+++ b/Libraries/LibRegex/Tests/Regex.cpp
@@ -477,6 +477,7 @@ TEST_CASE(ECMA262_parse)
struct _test {
const char* pattern;
regex::Error expected_error { regex::Error::NoError };
+ regex::ECMAScriptFlags flags {};
};
constexpr _test tests[] {
@@ -497,6 +498,8 @@ TEST_CASE(ECMA262_parse)
{ "\\x" }, // Even invalid escapes are allowed if ~unicode.
{ "\\", regex::Error::InvalidTrailingEscape },
{ "(?", regex::Error::InvalidCaptureGroup },
+ { "\\u1234", regex::Error::NoError, regex::ECMAScriptFlags::Unicode },
+ { "[\\u1234]", regex::Error::NoError, regex::ECMAScriptFlags::Unicode },
};
for (auto& test : tests) {