LibRegex: Support property escapes of Unicode script extensions

author: Timothy Flynn <trflynn89@pm.me> 2021-08-04 07:26:25 -0400
committer: Linus Groh <mail@linusgroh.de> 2021-08-04 13:50:32 +0100
commit: 484ccfadc366545d6969947a6d9fb48cb88be3cf (patch)
tree: 7d51794bc1bbbc116e4951c6d567622c200dd8f2 /Userland
parent: 5edd4584206405a300209592ceb3edf65fed34e7 (diff)
download: serenity-484ccfadc366545d6969947a6d9fb48cb88be3cf.zip
4 files changed, 45 insertions, 7 deletions
diff --git a/Userland/Libraries/LibRegex/RegexByteCode.cpp b/Userland/Libraries/LibRegex/RegexByteCode.cpp
index e9a6deab02..7cad14ba92 100644
--- a/Userland/Libraries/LibRegex/RegexByteCode.cpp
+++ b/Userland/Libraries/LibRegex/RegexByteCode.cpp
@@ -573,6 +573,10 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
             auto script = static_cast<Unicode::Script>(m_bytecode->at(offset++));
             compare_script(input, state, script, current_inversion_state(), inverse_matched);
 
+        } else if (compare_type == CharacterCompareType::ScriptExtension) {
+            auto script = static_cast<Unicode::Script>(m_bytecode->at(offset++));
+            compare_script_extension(input, state, script, current_inversion_state(), inverse_matched);
+
         } else {
             warnln("Undefined comparison: {}", (int)compare_type);
             VERIFY_NOT_REACHED();
@@ -810,6 +814,22 @@ ALWAYS_INLINE void OpCode_Compare::compare_script(MatchInput const& input, Match
     }
 }
 
+ALWAYS_INLINE void OpCode_Compare::compare_script_extension(MatchInput const& input, MatchState& state, Unicode::Script script, bool inverse, bool& inverse_matched)
+{
+    // Stop once string_position has reached the length of the view; there is nothing to read.
+    if (state.string_position == input.view.length())
+        return;
+
+    u32 candidate = input.view[state.string_position_in_code_units];
+    if (!Unicode::code_point_has_script_extension(candidate, script))
+        return;
+
+    if (inverse)
+        inverse_matched = true; // Inverted compare: flag the hit, leave the position untouched.
+    else
+        advance_string_position(state, input.view, candidate);
+}
+
 String const OpCode_Compare::arguments_string() const
 {
     return String::formatted("argc={}, args={} ", arguments_count(), arguments_size());
diff --git a/Userland/Libraries/LibRegex/RegexByteCode.h b/Userland/Libraries/LibRegex/RegexByteCode.h
index 51c08bc84c..c1c7af235b 100644
--- a/Userland/Libraries/LibRegex/RegexByteCode.h
+++ b/Userland/Libraries/LibRegex/RegexByteCode.h
@@ -69,6 +69,7 @@ enum class OpCodeId : ByteCodeValueType {
     __ENUMERATE_CHARACTER_COMPARE_TYPE(Property)         \
     __ENUMERATE_CHARACTER_COMPARE_TYPE(GeneralCategory)  \
     __ENUMERATE_CHARACTER_COMPARE_TYPE(Script)           \
+    __ENUMERATE_CHARACTER_COMPARE_TYPE(ScriptExtension)  \
     __ENUMERATE_CHARACTER_COMPARE_TYPE(RangeExpressionDummy)
 
 enum class CharacterCompareType : ByteCodeValueType {
@@ -729,6 +730,7 @@ private:
     ALWAYS_INLINE static void compare_property(MatchInput const& input, MatchState& state, Unicode::Property property, bool inverse, bool& inverse_matched);
     ALWAYS_INLINE static void compare_general_category(MatchInput const& input, MatchState& state, Unicode::GeneralCategory general_category, bool inverse, bool& inverse_matched);
     ALWAYS_INLINE static void compare_script(MatchInput const& input, MatchState& state, Unicode::Script script, bool inverse, bool& inverse_matched);
+    ALWAYS_INLINE static void compare_script_extension(MatchInput const& input, MatchState& state, Unicode::Script script, bool inverse, bool& inverse_matched);
 };
 
 template<typename T>
diff --git a/Userland/Libraries/LibRegex/RegexParser.cpp b/Userland/Libraries/LibRegex/RegexParser.cpp
index 44c264c619..6fe6519dde 100644
--- a/Userland/Libraries/LibRegex/RegexParser.cpp
+++ b/Userland/Libraries/LibRegex/RegexParser.cpp
@@ -1556,8 +1556,11 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini
                 [&](Unicode::GeneralCategory general_category) {
                     compares.empend(CompareTypeAndValuePair { CharacterCompareType::GeneralCategory, (ByteCodeValueType)general_category });
                 },
-                [&](Unicode::Script script) {
-                    compares.empend(CompareTypeAndValuePair { CharacterCompareType::Script, (ByteCodeValueType)script });
+                [&](Script script) {
+                    if (script.is_extension)
+                        compares.empend(CompareTypeAndValuePair { CharacterCompareType::ScriptExtension, (ByteCodeValueType)script.script });
+                    else
+                        compares.empend(CompareTypeAndValuePair { CharacterCompareType::Script, (ByteCodeValueType)script.script });
                 });
             stack.insert_bytecode_compare_values(move(compares));
             match_length_minimum += 1;
@@ -1716,6 +1719,7 @@ struct CharClassRangeElement {
     bool is_property { false };
     bool is_general_category { false };
     bool is_script { false };
+    bool is_script_extension { false };
 };
 
 bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>& ranges, bool unicode)
@@ -1810,8 +1814,11 @@ bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&
                         [&](Unicode::GeneralCategory general_category) {
                             return CharClassRangeElement { .general_category = general_category, .is_negated = negated, .is_character_class = true, .is_general_category = true };
                         },
-                        [&](Unicode::Script script) {
-                            return CharClassRangeElement { .script = script, .is_negated = negated, .is_character_class = true, .is_script = true };
+                        [&](Script script) {
+                            if (script.is_extension)
+                                return CharClassRangeElement { .script = script.script, .is_negated = negated, .is_character_class = true, .is_script_extension = true };
+                            else
+                                return CharClassRangeElement { .script = script.script, .is_negated = negated, .is_character_class = true, .is_script = true };
                         });
                 }
             }
@@ -1861,6 +1868,8 @@ bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&
                 ranges.empend(CompareTypeAndValuePair { CharacterCompareType::GeneralCategory, (ByteCodeValueType)(atom.general_category) });
             else if (atom.is_script)
                 ranges.empend(CompareTypeAndValuePair { CharacterCompareType::Script, (ByteCodeValueType)(atom.script) });
+            else if (atom.is_script_extension)
+                ranges.empend(CompareTypeAndValuePair { CharacterCompareType::ScriptExtension, (ByteCodeValueType)(atom.script) });
             else
                 ranges.empend(CompareTypeAndValuePair { CharacterCompareType::CharClass, (ByteCodeValueType)atom.character_class });
         } else {
@@ -1960,7 +1969,7 @@ bool ECMA262Parser::parse_unicode_property_escape(PropertyEscape& property, bool
             return true;
         },
         [](Unicode::GeneralCategory) { return true; },
-        [](Unicode::Script) { return true; });
+        [](Script) { return true; });
 }
 
 StringView ECMA262Parser::read_capture_group_specifier(bool take_starting_angle_bracket)
@@ -2026,7 +2035,10 @@ Optional<ECMA262Parser::PropertyEscape> ECMA262Parser::read_unicode_property_esc
             return { *general_category };
     } else if ((property_type == "Script"sv) || (property_type == "sc"sv)) {
         if (auto script = Unicode::script_from_string(property_name); script.has_value())
-            return { *script };
+            return Script { *script, false };
+    } else if ((property_type == "Script_Extensions"sv) || (property_type == "scx"sv)) {
+        if (auto script = Unicode::script_from_string(property_name); script.has_value())
+            return Script { *script, true };
     }
 
     return {};
diff --git a/Userland/Libraries/LibRegex/RegexParser.h b/Userland/Libraries/LibRegex/RegexParser.h
index e07f36627f..ec5cb12f1c 100644
--- a/Userland/Libraries/LibRegex/RegexParser.h
+++ b/Userland/Libraries/LibRegex/RegexParser.h
@@ -214,7 +214,11 @@ private:
     Optional<unsigned> read_digits(ReadDigitsInitialZeroState initial_zero = ReadDigitsInitialZeroState::Allow, bool hex = false, int max_count = -1);
     StringView read_capture_group_specifier(bool take_starting_angle_bracket = false);
 
-    using PropertyEscape = Variant<Unicode::Property, Unicode::GeneralCategory, Unicode::Script>;
+    struct Script { // A script named by a Unicode property escape, tagged with the property it was named through.
+        Unicode::Script script {}; // The script resolved from the escape's property value.
+        bool is_extension { false }; // true for Script_Extensions/scx, false for Script/sc.
+    };
+    using PropertyEscape = Variant<Unicode::Property, Unicode::GeneralCategory, Script>;
     Optional<PropertyEscape> read_unicode_property_escape();
 
     bool parse_pattern(ByteCode&, size_t&, bool unicode, bool named);
author	Timothy Flynn <trflynn89@pm.me>	2021-08-04 07:26:25 -0400
committer	Linus Groh <mail@linusgroh.de>	2021-08-04 13:50:32 +0100
commit	484ccfadc366545d6969947a6d9fb48cb88be3cf (patch)
tree	7d51794bc1bbbc116e4951c6d567622c200dd8f2 /Userland
parent	5edd4584206405a300209592ceb3edf65fed34e7 (diff)
download	serenity-484ccfadc366545d6969947a6d9fb48cb88be3cf.zip