summaryrefslogtreecommitdiff
path: root/Userland/Libraries/LibRegex
diff options
context:
space:
mode:
author Timothy Flynn <trflynn89@pm.me> 2021-08-04 07:26:25 -0400
committer Linus Groh <mail@linusgroh.de> 2021-08-04 13:50:32 +0100
commit 484ccfadc366545d6969947a6d9fb48cb88be3cf (patch)
tree 7d51794bc1bbbc116e4951c6d567622c200dd8f2 /Userland/Libraries/LibRegex
parent 5edd4584206405a300209592ceb3edf65fed34e7 (diff)
download serenity-484ccfadc366545d6969947a6d9fb48cb88be3cf.zip
LibRegex: Support property escapes of Unicode script extensions
Diffstat (limited to 'Userland/Libraries/LibRegex')
-rw-r--r-- Userland/Libraries/LibRegex/RegexByteCode.cpp 20
-rw-r--r-- Userland/Libraries/LibRegex/RegexByteCode.h 2
-rw-r--r-- Userland/Libraries/LibRegex/RegexParser.cpp 24
-rw-r--r-- Userland/Libraries/LibRegex/RegexParser.h 6
4 files changed, 45 insertions, 7 deletions
diff --git a/Userland/Libraries/LibRegex/RegexByteCode.cpp b/Userland/Libraries/LibRegex/RegexByteCode.cpp
index e9a6deab02..7cad14ba92 100644
--- a/Userland/Libraries/LibRegex/RegexByteCode.cpp
+++ b/Userland/Libraries/LibRegex/RegexByteCode.cpp
@@ -573,6 +573,10 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
auto script = static_cast<Unicode::Script>(m_bytecode->at(offset++));
compare_script(input, state, script, current_inversion_state(), inverse_matched);
+ } else if (compare_type == CharacterCompareType::ScriptExtension) {
+ auto script = static_cast<Unicode::Script>(m_bytecode->at(offset++));
+ compare_script_extension(input, state, script, current_inversion_state(), inverse_matched);
+
} else {
warnln("Undefined comparison: {}", (int)compare_type);
VERIFY_NOT_REACHED();
@@ -810,6 +814,22 @@ ALWAYS_INLINE void OpCode_Compare::compare_script(MatchInput const& input, Match
}
}
+ALWAYS_INLINE void OpCode_Compare::compare_script_extension(MatchInput const& input, MatchState& state, Unicode::Script script, bool inverse, bool& inverse_matched) // Tests the current code point against `script` via Unicode::code_point_has_script_extension.
+{
+ if (state.string_position == input.view.length()) // Position equals the view length: nothing to compare, leave state untouched.
+ return;
+
+ u32 code_point = input.view[state.string_position_in_code_units]; // NOTE(review): the guard above checks string_position but this read indexes by string_position_in_code_units - confirm both agree for this view.
+ bool equal = Unicode::code_point_has_script_extension(code_point, script);
+
+ if (equal) { // On a miss neither inverse_matched nor the string position is modified.
+ if (inverse)
+ inverse_matched = true; // Inverted compare: only record the hit; the position is not advanced here.
+ else
+ advance_string_position(state, input.view, code_point); // Non-inverted hit: advance the match position via advance_string_position.
+ }
+}
+
String const OpCode_Compare::arguments_string() const
{
return String::formatted("argc={}, args={} ", arguments_count(), arguments_size());
diff --git a/Userland/Libraries/LibRegex/RegexByteCode.h b/Userland/Libraries/LibRegex/RegexByteCode.h
index 51c08bc84c..c1c7af235b 100644
--- a/Userland/Libraries/LibRegex/RegexByteCode.h
+++ b/Userland/Libraries/LibRegex/RegexByteCode.h
@@ -69,6 +69,7 @@ enum class OpCodeId : ByteCodeValueType {
__ENUMERATE_CHARACTER_COMPARE_TYPE(Property) \
__ENUMERATE_CHARACTER_COMPARE_TYPE(GeneralCategory) \
__ENUMERATE_CHARACTER_COMPARE_TYPE(Script) \
+ __ENUMERATE_CHARACTER_COMPARE_TYPE(ScriptExtension) \
__ENUMERATE_CHARACTER_COMPARE_TYPE(RangeExpressionDummy)
enum class CharacterCompareType : ByteCodeValueType {
@@ -729,6 +730,7 @@ private:
ALWAYS_INLINE static void compare_property(MatchInput const& input, MatchState& state, Unicode::Property property, bool inverse, bool& inverse_matched);
ALWAYS_INLINE static void compare_general_category(MatchInput const& input, MatchState& state, Unicode::GeneralCategory general_category, bool inverse, bool& inverse_matched);
ALWAYS_INLINE static void compare_script(MatchInput const& input, MatchState& state, Unicode::Script script, bool inverse, bool& inverse_matched);
+ ALWAYS_INLINE static void compare_script_extension(MatchInput const& input, MatchState& state, Unicode::Script script, bool inverse, bool& inverse_matched);
};
template<typename T>
diff --git a/Userland/Libraries/LibRegex/RegexParser.cpp b/Userland/Libraries/LibRegex/RegexParser.cpp
index 44c264c619..6fe6519dde 100644
--- a/Userland/Libraries/LibRegex/RegexParser.cpp
+++ b/Userland/Libraries/LibRegex/RegexParser.cpp
@@ -1556,8 +1556,11 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini
[&](Unicode::GeneralCategory general_category) {
compares.empend(CompareTypeAndValuePair { CharacterCompareType::GeneralCategory, (ByteCodeValueType)general_category });
},
- [&](Unicode::Script script) {
- compares.empend(CompareTypeAndValuePair { CharacterCompareType::Script, (ByteCodeValueType)script });
+ [&](Script script) {
+ if (script.is_extension)
+ compares.empend(CompareTypeAndValuePair { CharacterCompareType::ScriptExtension, (ByteCodeValueType)script.script });
+ else
+ compares.empend(CompareTypeAndValuePair { CharacterCompareType::Script, (ByteCodeValueType)script.script });
});
stack.insert_bytecode_compare_values(move(compares));
match_length_minimum += 1;
@@ -1716,6 +1719,7 @@ struct CharClassRangeElement {
bool is_property { false };
bool is_general_category { false };
bool is_script { false };
+ bool is_script_extension { false };
};
bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>& ranges, bool unicode)
@@ -1810,8 +1814,11 @@ bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&
[&](Unicode::GeneralCategory general_category) {
return CharClassRangeElement { .general_category = general_category, .is_negated = negated, .is_character_class = true, .is_general_category = true };
},
- [&](Unicode::Script script) {
- return CharClassRangeElement { .script = script, .is_negated = negated, .is_character_class = true, .is_script = true };
+ [&](Script script) {
+ if (script.is_extension)
+ return CharClassRangeElement { .script = script.script, .is_negated = negated, .is_character_class = true, .is_script_extension = true };
+ else
+ return CharClassRangeElement { .script = script.script, .is_negated = negated, .is_character_class = true, .is_script = true };
});
}
}
@@ -1861,6 +1868,8 @@ bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&
ranges.empend(CompareTypeAndValuePair { CharacterCompareType::GeneralCategory, (ByteCodeValueType)(atom.general_category) });
else if (atom.is_script)
ranges.empend(CompareTypeAndValuePair { CharacterCompareType::Script, (ByteCodeValueType)(atom.script) });
+ else if (atom.is_script_extension)
+ ranges.empend(CompareTypeAndValuePair { CharacterCompareType::ScriptExtension, (ByteCodeValueType)(atom.script) });
else
ranges.empend(CompareTypeAndValuePair { CharacterCompareType::CharClass, (ByteCodeValueType)atom.character_class });
} else {
@@ -1960,7 +1969,7 @@ bool ECMA262Parser::parse_unicode_property_escape(PropertyEscape& property, bool
return true;
},
[](Unicode::GeneralCategory) { return true; },
- [](Unicode::Script) { return true; });
+ [](Script) { return true; });
}
StringView ECMA262Parser::read_capture_group_specifier(bool take_starting_angle_bracket)
@@ -2026,7 +2035,10 @@ Optional<ECMA262Parser::PropertyEscape> ECMA262Parser::read_unicode_property_esc
return { *general_category };
} else if ((property_type == "Script"sv) || (property_type == "sc"sv)) {
if (auto script = Unicode::script_from_string(property_name); script.has_value())
- return { *script };
+ return Script { *script, false };
+ } else if ((property_type == "Script_Extensions"sv) || (property_type == "scx"sv)) {
+ if (auto script = Unicode::script_from_string(property_name); script.has_value())
+ return Script { *script, true };
}
return {};
diff --git a/Userland/Libraries/LibRegex/RegexParser.h b/Userland/Libraries/LibRegex/RegexParser.h
index e07f36627f..ec5cb12f1c 100644
--- a/Userland/Libraries/LibRegex/RegexParser.h
+++ b/Userland/Libraries/LibRegex/RegexParser.h
@@ -214,7 +214,11 @@ private:
Optional<unsigned> read_digits(ReadDigitsInitialZeroState initial_zero = ReadDigitsInitialZeroState::Allow, bool hex = false, int max_count = -1);
StringView read_capture_group_specifier(bool take_starting_angle_bracket = false);
- using PropertyEscape = Variant<Unicode::Property, Unicode::GeneralCategory, Unicode::Script>;
+ struct Script { // Parsed script property escape: a Unicode script plus whether the escape named Script or Script_Extensions.
+ Unicode::Script script {}; // Script value resolved by Unicode::script_from_string() from the property name.
+ bool is_extension { false }; // true for "Script_Extensions"/"scx" escapes, false for "Script"/"sc".
+ };
+ using PropertyEscape = Variant<Unicode::Property, Unicode::GeneralCategory, Script>;
Optional<PropertyEscape> read_unicode_property_escape();
bool parse_pattern(ByteCode&, size_t&, bool unicode, bool named);