diff options
author | Timothy Flynn <trflynn89@pm.me> | 2021-08-04 07:26:25 -0400 |
---|---|---|
committer | Linus Groh <mail@linusgroh.de> | 2021-08-04 13:50:32 +0100 |
commit | 484ccfadc366545d6969947a6d9fb48cb88be3cf (patch) | |
tree | 7d51794bc1bbbc116e4951c6d567622c200dd8f2 /Userland | |
parent | 5edd4584206405a300209592ceb3edf65fed34e7 (diff) | |
download | serenity-484ccfadc366545d6969947a6d9fb48cb88be3cf.zip |
LibRegex: Support property escapes of Unicode script extensions
Diffstat (limited to 'Userland')
-rw-r--r-- | Userland/Libraries/LibRegex/RegexByteCode.cpp | 20 | ||||
-rw-r--r-- | Userland/Libraries/LibRegex/RegexByteCode.h | 2 | ||||
-rw-r--r-- | Userland/Libraries/LibRegex/RegexParser.cpp | 24 | ||||
-rw-r--r-- | Userland/Libraries/LibRegex/RegexParser.h | 6 |
4 files changed, 45 insertions, 7 deletions
diff --git a/Userland/Libraries/LibRegex/RegexByteCode.cpp b/Userland/Libraries/LibRegex/RegexByteCode.cpp
index e9a6deab02..7cad14ba92 100644
--- a/Userland/Libraries/LibRegex/RegexByteCode.cpp
+++ b/Userland/Libraries/LibRegex/RegexByteCode.cpp
@@ -573,6 +573,10 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
             auto script = static_cast<Unicode::Script>(m_bytecode->at(offset++));
             compare_script(input, state, script, current_inversion_state(), inverse_matched);
 
+        } else if (compare_type == CharacterCompareType::ScriptExtension) {
+            auto script = static_cast<Unicode::Script>(m_bytecode->at(offset++));
+            compare_script_extension(input, state, script, current_inversion_state(), inverse_matched);
+
         } else {
             warnln("Undefined comparison: {}", (int)compare_type);
             VERIFY_NOT_REACHED();
@@ -810,6 +814,22 @@ ALWAYS_INLINE void OpCode_Compare::compare_script(MatchInput const& input, Match
     }
 }
 
+ALWAYS_INLINE void OpCode_Compare::compare_script_extension(MatchInput const& input, MatchState& state, Unicode::Script script, bool inverse, bool& inverse_matched)
+{
+    if (state.string_position == input.view.length())
+        return;
+
+    u32 code_point = input.view[state.string_position_in_code_units];
+    bool equal = Unicode::code_point_has_script_extension(code_point, script);
+
+    if (equal) {
+        if (inverse)
+            inverse_matched = true;
+        else
+            advance_string_position(state, input.view, code_point);
+    }
+}
+
 String const OpCode_Compare::arguments_string() const
 {
     return String::formatted("argc={}, args={} ", arguments_count(), arguments_size());
diff --git a/Userland/Libraries/LibRegex/RegexByteCode.h b/Userland/Libraries/LibRegex/RegexByteCode.h
index 51c08bc84c..c1c7af235b 100644
--- a/Userland/Libraries/LibRegex/RegexByteCode.h
+++ b/Userland/Libraries/LibRegex/RegexByteCode.h
@@ -69,6 +69,7 @@ enum class OpCodeId : ByteCodeValueType {
     __ENUMERATE_CHARACTER_COMPARE_TYPE(Property) \
     __ENUMERATE_CHARACTER_COMPARE_TYPE(GeneralCategory) \
     __ENUMERATE_CHARACTER_COMPARE_TYPE(Script) \
+    __ENUMERATE_CHARACTER_COMPARE_TYPE(ScriptExtension) \
     __ENUMERATE_CHARACTER_COMPARE_TYPE(RangeExpressionDummy)
 
 enum class CharacterCompareType : ByteCodeValueType {
@@ -729,6 +730,7 @@ private:
     ALWAYS_INLINE static void compare_property(MatchInput const& input, MatchState& state, Unicode::Property property, bool inverse, bool& inverse_matched);
     ALWAYS_INLINE static void compare_general_category(MatchInput const& input, MatchState& state, Unicode::GeneralCategory general_category, bool inverse, bool& inverse_matched);
     ALWAYS_INLINE static void compare_script(MatchInput const& input, MatchState& state, Unicode::Script script, bool inverse, bool& inverse_matched);
+    ALWAYS_INLINE static void compare_script_extension(MatchInput const& input, MatchState& state, Unicode::Script script, bool inverse, bool& inverse_matched);
 };
 
 template<typename T>
diff --git a/Userland/Libraries/LibRegex/RegexParser.cpp b/Userland/Libraries/LibRegex/RegexParser.cpp
index 44c264c619..6fe6519dde 100644
--- a/Userland/Libraries/LibRegex/RegexParser.cpp
+++ b/Userland/Libraries/LibRegex/RegexParser.cpp
@@ -1556,8 +1556,11 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini
                 [&](Unicode::GeneralCategory general_category) {
                     compares.empend(CompareTypeAndValuePair { CharacterCompareType::GeneralCategory, (ByteCodeValueType)general_category });
                 },
-                [&](Unicode::Script script) {
-                    compares.empend(CompareTypeAndValuePair { CharacterCompareType::Script, (ByteCodeValueType)script });
+                [&](Script script) {
+                    if (script.is_extension)
+                        compares.empend(CompareTypeAndValuePair { CharacterCompareType::ScriptExtension, (ByteCodeValueType)script.script });
+                    else
+                        compares.empend(CompareTypeAndValuePair { CharacterCompareType::Script, (ByteCodeValueType)script.script });
                 });
             stack.insert_bytecode_compare_values(move(compares));
             match_length_minimum += 1;
@@ -1716,6 +1719,7 @@ struct CharClassRangeElement {
     bool is_property { false };
     bool is_general_category { false };
     bool is_script { false };
+    bool is_script_extension { false };
 };
 
 bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>& ranges, bool unicode)
@@ -1810,8 +1814,11 @@ bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&
                             [&](Unicode::GeneralCategory general_category) {
                                 return CharClassRangeElement { .general_category = general_category, .is_negated = negated, .is_character_class = true, .is_general_category = true };
                             },
-                            [&](Unicode::Script script) {
-                                return CharClassRangeElement { .script = script, .is_negated = negated, .is_character_class = true, .is_script = true };
+                            [&](Script script) {
+                                if (script.is_extension)
+                                    return CharClassRangeElement { .script = script.script, .is_negated = negated, .is_character_class = true, .is_script_extension = true };
+                                else
+                                    return CharClassRangeElement { .script = script.script, .is_negated = negated, .is_character_class = true, .is_script = true };
                             });
                     }
                 }
@@ -1861,6 +1868,8 @@ bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&
                     ranges.empend(CompareTypeAndValuePair { CharacterCompareType::GeneralCategory, (ByteCodeValueType)(atom.general_category) });
                 else if (atom.is_script)
                     ranges.empend(CompareTypeAndValuePair { CharacterCompareType::Script, (ByteCodeValueType)(atom.script) });
+                else if (atom.is_script_extension)
+                    ranges.empend(CompareTypeAndValuePair { CharacterCompareType::ScriptExtension, (ByteCodeValueType)(atom.script) });
                 else
                     ranges.empend(CompareTypeAndValuePair { CharacterCompareType::CharClass, (ByteCodeValueType)atom.character_class });
             } else {
@@ -1960,7 +1969,7 @@ bool ECMA262Parser::parse_unicode_property_escape(PropertyEscape& property, bool
             return true;
         },
         [](Unicode::GeneralCategory) { return true; },
-        [](Unicode::Script) { return true; });
+        [](Script) { return true; });
 }
 
 StringView ECMA262Parser::read_capture_group_specifier(bool take_starting_angle_bracket)
@@ -2026,7 +2035,10 @@ Optional<ECMA262Parser::PropertyEscape> ECMA262Parser::read_unicode_property_esc
             return { *general_category };
     } else if ((property_type == "Script"sv) || (property_type == "sc"sv)) {
         if (auto script = Unicode::script_from_string(property_name); script.has_value())
-            return { *script };
+            return Script { *script, false };
+    } else if ((property_type == "Script_Extensions"sv) || (property_type == "scx"sv)) {
+        if (auto script = Unicode::script_from_string(property_name); script.has_value())
+            return Script { *script, true };
     }
 
     return {};
diff --git a/Userland/Libraries/LibRegex/RegexParser.h b/Userland/Libraries/LibRegex/RegexParser.h
index e07f36627f..ec5cb12f1c 100644
--- a/Userland/Libraries/LibRegex/RegexParser.h
+++ b/Userland/Libraries/LibRegex/RegexParser.h
@@ -214,7 +214,11 @@ private:
     Optional<unsigned> read_digits(ReadDigitsInitialZeroState initial_zero = ReadDigitsInitialZeroState::Allow, bool hex = false, int max_count = -1);
     StringView read_capture_group_specifier(bool take_starting_angle_bracket = false);
 
-    using PropertyEscape = Variant<Unicode::Property, Unicode::GeneralCategory, Unicode::Script>;
+    struct Script {
+        Unicode::Script script {};
+        bool is_extension { false };
+    };
+    using PropertyEscape = Variant<Unicode::Property, Unicode::GeneralCategory, Script>;
     Optional<PropertyEscape> read_unicode_property_escape();
 
     bool parse_pattern(ByteCode&, size_t&, bool unicode, bool named);