diff options
author | Timothy Flynn <trflynn89@pm.me> | 2021-07-31 17:46:05 -0400 |
---|---|---|
committer | Ali Mohammad Pur <Ali.mpfard@gmail.com> | 2021-08-02 21:02:09 +0430 |
commit | 1e10d6d7ce70dafd638747fadc6874a47d0448db (patch) | |
tree | ca46edbf4c9e18c6a600bc7649f59a2da32b5327 /Userland/Libraries | |
parent | 5de6d3dd9050fca8c68851d125efb4737aba18cd (diff) | |
download | serenity-1e10d6d7ce70dafd638747fadc6874a47d0448db.zip |
LibRegex: Support property escapes of Unicode General Categories
This changes LibRegex to parse the property escape as a Variant of
Unicode Property & General Category values. A byte code instruction is
added to perform matching based on General Category values.
Diffstat (limited to 'Userland/Libraries')
-rw-r--r-- | Userland/Libraries/LibRegex/RegexByteCode.cpp | 20 | ||||
-rw-r--r-- | Userland/Libraries/LibRegex/RegexByteCode.h | 2 | ||||
-rw-r--r-- | Userland/Libraries/LibRegex/RegexParser.cpp | 62 | ||||
-rw-r--r-- | Userland/Libraries/LibRegex/RegexParser.h | 6 |
4 files changed, 71 insertions, 19 deletions
diff --git a/Userland/Libraries/LibRegex/RegexByteCode.cpp b/Userland/Libraries/LibRegex/RegexByteCode.cpp
index 329afb1456..20a65b3d0c 100644
--- a/Userland/Libraries/LibRegex/RegexByteCode.cpp
+++ b/Userland/Libraries/LibRegex/RegexByteCode.cpp
@@ -537,6 +537,10 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
             auto property = static_cast<Unicode::Property>(m_bytecode->at(offset++));
             compare_property(input, state, property, current_inversion_state(), inverse_matched);
 
+        } else if (compare_type == CharacterCompareType::GeneralCategory) {
+            auto general_category = static_cast<Unicode::GeneralCategory>(m_bytecode->at(offset++));
+            compare_general_category(input, state, general_category, current_inversion_state(), inverse_matched);
+
         } else {
             warnln("Undefined comparison: {}", (int)compare_type);
             VERIFY_NOT_REACHED();
@@ -742,6 +746,22 @@ ALWAYS_INLINE void OpCode_Compare::compare_property(MatchInput const& input, Mat
     }
 }
 
+ALWAYS_INLINE void OpCode_Compare::compare_general_category(MatchInput const& input, MatchState& state, Unicode::GeneralCategory general_category, bool inverse, bool& inverse_matched)
+{
+    if (state.string_position == input.view.length())
+        return;
+
+    u32 code_point = input.view[state.string_position];
+    bool equal = Unicode::code_point_has_general_category(code_point, general_category);
+
+    if (equal) {
+        if (inverse)
+            inverse_matched = true;
+        else
+            ++state.string_position;
+    }
+}
+
 String const OpCode_Compare::arguments_string() const
 {
     return String::formatted("argc={}, args={} ", arguments_count(), arguments_size());
diff --git a/Userland/Libraries/LibRegex/RegexByteCode.h b/Userland/Libraries/LibRegex/RegexByteCode.h
index f86143fd71..ccf83e0c19 100644
--- a/Userland/Libraries/LibRegex/RegexByteCode.h
+++ b/Userland/Libraries/LibRegex/RegexByteCode.h
@@ -67,6 +67,7 @@ enum class OpCodeId : ByteCodeValueType {
     __ENUMERATE_CHARACTER_COMPARE_TYPE(Reference) \
     __ENUMERATE_CHARACTER_COMPARE_TYPE(NamedReference) \
     __ENUMERATE_CHARACTER_COMPARE_TYPE(Property) \
+    __ENUMERATE_CHARACTER_COMPARE_TYPE(GeneralCategory) \
     __ENUMERATE_CHARACTER_COMPARE_TYPE(RangeExpressionDummy)
 
 enum class CharacterCompareType : ByteCodeValueType {
@@ -725,6 +726,7 @@ private:
     ALWAYS_INLINE static void compare_character_class(MatchInput const& input, MatchState& state, CharClass character_class, u32 ch, bool inverse, bool& inverse_matched);
     ALWAYS_INLINE static void compare_character_range(MatchInput const& input, MatchState& state, u32 from, u32 to, u32 ch, bool inverse, bool& inverse_matched);
     ALWAYS_INLINE static void compare_property(MatchInput const& input, MatchState& state, Unicode::Property property, bool inverse, bool& inverse_matched);
+    ALWAYS_INLINE static void compare_general_category(MatchInput const& input, MatchState& state, Unicode::GeneralCategory general_category, bool inverse, bool& inverse_matched);
 };
 
 template<typename T>
diff --git a/Userland/Libraries/LibRegex/RegexParser.cpp b/Userland/Libraries/LibRegex/RegexParser.cpp
index a5574f11c4..fc433444a6 100644
--- a/Userland/Libraries/LibRegex/RegexParser.cpp
+++ b/Userland/Libraries/LibRegex/RegexParser.cpp
@@ -1542,13 +1542,19 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini
     }
 
     if (unicode) {
-        Unicode::Property property {};
+        PropertyEscape property {};
         bool negated = false;
 
         if (parse_unicode_property_escape(property, negated)) {
             if (negated)
                 stack.insert_bytecode_compare_values({ { CharacterCompareType::Inverse, 0 } });
-            stack.insert_bytecode_compare_values({ { CharacterCompareType::Property, (ByteCodeValueType)(property) } });
+            property.visit(
+                [&](Unicode::Property property) {
+                    stack.insert_bytecode_compare_values({ { CharacterCompareType::Property, (ByteCodeValueType)(property) } });
+                },
+                [&](Unicode::GeneralCategory general_category) {
+                    stack.insert_bytecode_compare_values({ { CharacterCompareType::GeneralCategory, (ByteCodeValueType)(general_category) } });
+                });
             return true;
         }
     }
@@ -1695,11 +1701,13 @@ struct CharClassRangeElement {
         CharClass character_class;
         u32 code_point { 0 };
         Unicode::Property property;
+        Unicode::GeneralCategory general_category;
     };
 
     bool is_negated { false };
     bool is_character_class { false };
-    bool is_property_escape { false };
+    bool is_property { false };
+    bool is_general_category { false };
 };
 
 bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>& ranges, bool unicode)
@@ -1784,10 +1792,17 @@ bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&
                 if (try_skip("-"))
                     return { CharClassRangeElement { .code_point = '-', .is_character_class = false } };
 
-                Unicode::Property property {};
+                PropertyEscape property {};
                 bool negated = false;
-                if (parse_unicode_property_escape(property, negated))
-                    return { CharClassRangeElement { .property = property, .is_negated = negated, .is_character_class = true, .is_property_escape = true } };
+                if (parse_unicode_property_escape(property, negated)) {
+                    return property.visit(
+                        [&](Unicode::Property property) {
+                            return CharClassRangeElement { .property = property, .is_negated = negated, .is_character_class = true, .is_property = true };
+                        },
+                        [&](Unicode::GeneralCategory general_category) {
+                            return CharClassRangeElement { .general_category = general_category, .is_negated = negated, .is_character_class = true, .is_general_category = true };
+                        });
+                }
             }
 
             if (try_skip("d"))
@@ -1828,8 +1843,11 @@ bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&
         if (atom.is_character_class) {
             if (atom.is_negated)
                 ranges.empend(CompareTypeAndValuePair { CharacterCompareType::TemporaryInverse, 0 });
-            if (atom.is_property_escape)
+
+            if (atom.is_property)
                 ranges.empend(CompareTypeAndValuePair { CharacterCompareType::Property, (ByteCodeValueType)(atom.property) });
+            else if (atom.is_general_category)
+                ranges.empend(CompareTypeAndValuePair { CharacterCompareType::GeneralCategory, (ByteCodeValueType)(atom.general_category) });
             else
                 ranges.empend(CompareTypeAndValuePair { CharacterCompareType::CharClass, (ByteCodeValueType)atom.character_class });
         } else {
@@ -1901,7 +1919,7 @@ bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&
     return true;
 }
 
-bool ECMA262Parser::parse_unicode_property_escape(Unicode::Property& property, bool& negated)
+bool ECMA262Parser::parse_unicode_property_escape(PropertyEscape& property, bool& negated)
 {
     negated = false;
 
@@ -1918,13 +1936,19 @@ bool ECMA262Parser::parse_unicode_property_escape(Unicode::Property& property, b
         return false;
     }
 
-    if (!Unicode::is_ecma262_property(*parsed_property)) {
-        set_error(Error::InvalidNameForProperty);
-        return false;
-    }
+    property = move(*parsed_property);
 
-    property = *parsed_property;
-    return true;
+    return property.visit(
+        [this](Unicode::Property property) {
+            if (!Unicode::is_ecma262_property(property)) {
+                set_error(Error::InvalidNameForProperty);
+                return false;
+            }
+            return true;
+        },
+        [](Unicode::GeneralCategory) {
+            return true;
+        });
 }
 
 StringView ECMA262Parser::read_capture_group_specifier(bool take_starting_angle_bracket)
@@ -1948,7 +1972,7 @@ StringView ECMA262Parser::read_capture_group_specifier(bool take_starting_angle_
     return name;
 }
 
-Optional<Unicode::Property> ECMA262Parser::read_unicode_property_escape()
+Optional<ECMA262Parser::PropertyEscape> ECMA262Parser::read_unicode_property_escape()
 {
     consume(TokenType::LeftCurly, Error::InvalidPattern);
 
@@ -1960,10 +1984,14 @@ Optional<Unicode::Property> ECMA262Parser::read_unicode_property_escape()
         offset += consume().value().length();
     }
 
+    StringView property_name { start_token.value().characters_without_null_termination(), offset };
     consume(TokenType::RightCurly, Error::InvalidPattern);
 
-    StringView property_name { start_token.value().characters_without_null_termination(), offset };
-    return Unicode::property_from_string(property_name);
+    if (auto property = Unicode::property_from_string(property_name); property.has_value())
+        return { *property };
+    if (auto general_category = Unicode::general_category_from_string(property_name); general_category.has_value())
+        return { *general_category };
+    return {};
 }
 
 bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named)
diff --git a/Userland/Libraries/LibRegex/RegexParser.h b/Userland/Libraries/LibRegex/RegexParser.h
index a48ad90c4b..4693bbe0cf 100644
--- a/Userland/Libraries/LibRegex/RegexParser.h
+++ b/Userland/Libraries/LibRegex/RegexParser.h
@@ -213,7 +213,9 @@ private:
     StringView read_digits_as_string(ReadDigitsInitialZeroState initial_zero = ReadDigitsInitialZeroState::Allow, bool hex = false, int max_count = -1);
     Optional<unsigned> read_digits(ReadDigitsInitialZeroState initial_zero = ReadDigitsInitialZeroState::Allow, bool hex = false, int max_count = -1);
     StringView read_capture_group_specifier(bool take_starting_angle_bracket = false);
-    Optional<Unicode::Property> read_unicode_property_escape();
+
+    using PropertyEscape = Variant<Unicode::Property, Unicode::GeneralCategory>;
+    Optional<PropertyEscape> read_unicode_property_escape();
 
     bool parse_pattern(ByteCode&, size_t&, bool unicode, bool named);
     bool parse_disjunction(ByteCode&, size_t&, bool unicode, bool named);
@@ -227,7 +229,7 @@ private:
     bool parse_capture_group(ByteCode&, size_t&, bool unicode, bool named);
     Optional<CharClass> parse_character_class_escape(bool& out_inverse, bool expect_backslash = false);
     bool parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&, bool unicode);
-    bool parse_unicode_property_escape(Unicode::Property& property, bool& negated);
+    bool parse_unicode_property_escape(PropertyEscape& property, bool& negated);
 
     // Used only by B.1.4, Regular Expression Patterns (Extended for use in browsers)
     bool parse_quantifiable_assertion(ByteCode&, size_t&, bool named);