summary refs log tree commit diff
path: root/Userland/Libraries
diff options
context:
space:
mode:
Diffstat (limited to 'Userland/Libraries')
-rw-r--r-- Userland/Libraries/LibRegex/RegexByteCode.cpp 20
-rw-r--r-- Userland/Libraries/LibRegex/RegexByteCode.h 2
-rw-r--r-- Userland/Libraries/LibRegex/RegexParser.cpp 62
-rw-r--r-- Userland/Libraries/LibRegex/RegexParser.h 6
4 files changed, 71 insertions, 19 deletions
diff --git a/Userland/Libraries/LibRegex/RegexByteCode.cpp b/Userland/Libraries/LibRegex/RegexByteCode.cpp
index 329afb1456..20a65b3d0c 100644
--- a/Userland/Libraries/LibRegex/RegexByteCode.cpp
+++ b/Userland/Libraries/LibRegex/RegexByteCode.cpp
@@ -537,6 +537,10 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
auto property = static_cast<Unicode::Property>(m_bytecode->at(offset++));
compare_property(input, state, property, current_inversion_state(), inverse_matched);
+ } else if (compare_type == CharacterCompareType::GeneralCategory) {
+ auto general_category = static_cast<Unicode::GeneralCategory>(m_bytecode->at(offset++));
+ compare_general_category(input, state, general_category, current_inversion_state(), inverse_matched);
+
} else {
warnln("Undefined comparison: {}", (int)compare_type);
VERIFY_NOT_REACHED();
@@ -742,6 +746,22 @@ ALWAYS_INLINE void OpCode_Compare::compare_property(MatchInput const& input, Mat
}
}
+ALWAYS_INLINE void OpCode_Compare::compare_general_category(MatchInput const& input, MatchState& state, Unicode::GeneralCategory general_category, bool inverse, bool& inverse_matched)
+{ // Tests the element at state.string_position against general_category; a hit either advances the position or, when inverse is set, sets inverse_matched. A miss changes nothing.
+ if (state.string_position == input.view.length()) // Position is at the end of the view: nothing to test, so return with state and inverse_matched untouched.
+ return;
+
+ u32 code_point = input.view[state.string_position]; // Element at the current match position, read as a 32-bit value.
+ bool equal = Unicode::code_point_has_general_category(code_point, general_category); // Membership test is delegated to the Unicode helper.
+
+ if (equal) {
+ if (inverse)
+ inverse_matched = true; // Inverted compare: report the hit through the out-parameter and do not advance.
+ else
+ ++state.string_position; // Plain compare: step past the matched element.
+ }
+}
+
String const OpCode_Compare::arguments_string() const
{
return String::formatted("argc={}, args={} ", arguments_count(), arguments_size());
diff --git a/Userland/Libraries/LibRegex/RegexByteCode.h b/Userland/Libraries/LibRegex/RegexByteCode.h
index f86143fd71..ccf83e0c19 100644
--- a/Userland/Libraries/LibRegex/RegexByteCode.h
+++ b/Userland/Libraries/LibRegex/RegexByteCode.h
@@ -67,6 +67,7 @@ enum class OpCodeId : ByteCodeValueType {
__ENUMERATE_CHARACTER_COMPARE_TYPE(Reference) \
__ENUMERATE_CHARACTER_COMPARE_TYPE(NamedReference) \
__ENUMERATE_CHARACTER_COMPARE_TYPE(Property) \
+ __ENUMERATE_CHARACTER_COMPARE_TYPE(GeneralCategory) \
__ENUMERATE_CHARACTER_COMPARE_TYPE(RangeExpressionDummy)
enum class CharacterCompareType : ByteCodeValueType {
@@ -725,6 +726,7 @@ private:
ALWAYS_INLINE static void compare_character_class(MatchInput const& input, MatchState& state, CharClass character_class, u32 ch, bool inverse, bool& inverse_matched);
ALWAYS_INLINE static void compare_character_range(MatchInput const& input, MatchState& state, u32 from, u32 to, u32 ch, bool inverse, bool& inverse_matched);
ALWAYS_INLINE static void compare_property(MatchInput const& input, MatchState& state, Unicode::Property property, bool inverse, bool& inverse_matched);
+ ALWAYS_INLINE static void compare_general_category(MatchInput const& input, MatchState& state, Unicode::GeneralCategory general_category, bool inverse, bool& inverse_matched);
};
template<typename T>
diff --git a/Userland/Libraries/LibRegex/RegexParser.cpp b/Userland/Libraries/LibRegex/RegexParser.cpp
index a5574f11c4..fc433444a6 100644
--- a/Userland/Libraries/LibRegex/RegexParser.cpp
+++ b/Userland/Libraries/LibRegex/RegexParser.cpp
@@ -1542,13 +1542,19 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini
}
if (unicode) {
- Unicode::Property property {};
+ PropertyEscape property {};
bool negated = false;
if (parse_unicode_property_escape(property, negated)) {
if (negated)
stack.insert_bytecode_compare_values({ { CharacterCompareType::Inverse, 0 } });
- stack.insert_bytecode_compare_values({ { CharacterCompareType::Property, (ByteCodeValueType)(property) } });
+ property.visit(
+ [&](Unicode::Property property) {
+ stack.insert_bytecode_compare_values({ { CharacterCompareType::Property, (ByteCodeValueType)(property) } });
+ },
+ [&](Unicode::GeneralCategory general_category) {
+ stack.insert_bytecode_compare_values({ { CharacterCompareType::GeneralCategory, (ByteCodeValueType)(general_category) } });
+ });
return true;
}
}
@@ -1695,11 +1701,13 @@ struct CharClassRangeElement {
CharClass character_class;
u32 code_point { 0 };
Unicode::Property property;
+ Unicode::GeneralCategory general_category;
};
bool is_negated { false };
bool is_character_class { false };
- bool is_property_escape { false };
+ bool is_property { false };
+ bool is_general_category { false };
};
bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>& ranges, bool unicode)
@@ -1784,10 +1792,17 @@ bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&
if (try_skip("-"))
return { CharClassRangeElement { .code_point = '-', .is_character_class = false } };
- Unicode::Property property {};
+ PropertyEscape property {};
bool negated = false;
- if (parse_unicode_property_escape(property, negated))
- return { CharClassRangeElement { .property = property, .is_negated = negated, .is_character_class = true, .is_property_escape = true } };
+ if (parse_unicode_property_escape(property, negated)) {
+ return property.visit(
+ [&](Unicode::Property property) {
+ return CharClassRangeElement { .property = property, .is_negated = negated, .is_character_class = true, .is_property = true };
+ },
+ [&](Unicode::GeneralCategory general_category) {
+ return CharClassRangeElement { .general_category = general_category, .is_negated = negated, .is_character_class = true, .is_general_category = true };
+ });
+ }
}
if (try_skip("d"))
@@ -1828,8 +1843,11 @@ bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&
if (atom.is_character_class) {
if (atom.is_negated)
ranges.empend(CompareTypeAndValuePair { CharacterCompareType::TemporaryInverse, 0 });
- if (atom.is_property_escape)
+
+ if (atom.is_property)
ranges.empend(CompareTypeAndValuePair { CharacterCompareType::Property, (ByteCodeValueType)(atom.property) });
+ else if (atom.is_general_category)
+ ranges.empend(CompareTypeAndValuePair { CharacterCompareType::GeneralCategory, (ByteCodeValueType)(atom.general_category) });
else
ranges.empend(CompareTypeAndValuePair { CharacterCompareType::CharClass, (ByteCodeValueType)atom.character_class });
} else {
@@ -1901,7 +1919,7 @@ bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&
return true;
}
-bool ECMA262Parser::parse_unicode_property_escape(Unicode::Property& property, bool& negated)
+bool ECMA262Parser::parse_unicode_property_escape(PropertyEscape& property, bool& negated)
{
negated = false;
@@ -1918,13 +1936,19 @@ bool ECMA262Parser::parse_unicode_property_escape(Unicode::Property& property, b
return false;
}
- if (!Unicode::is_ecma262_property(*parsed_property)) {
- set_error(Error::InvalidNameForProperty);
- return false;
- }
+ property = move(*parsed_property);
- property = *parsed_property;
- return true;
+ return property.visit(
+ [this](Unicode::Property property) {
+ if (!Unicode::is_ecma262_property(property)) {
+ set_error(Error::InvalidNameForProperty);
+ return false;
+ }
+ return true;
+ },
+ [](Unicode::GeneralCategory) {
+ return true;
+ });
}
StringView ECMA262Parser::read_capture_group_specifier(bool take_starting_angle_bracket)
@@ -1948,7 +1972,7 @@ StringView ECMA262Parser::read_capture_group_specifier(bool take_starting_angle_
return name;
}
-Optional<Unicode::Property> ECMA262Parser::read_unicode_property_escape()
+Optional<ECMA262Parser::PropertyEscape> ECMA262Parser::read_unicode_property_escape()
{
consume(TokenType::LeftCurly, Error::InvalidPattern);
@@ -1960,10 +1984,14 @@ Optional<Unicode::Property> ECMA262Parser::read_unicode_property_escape()
offset += consume().value().length();
}
+ StringView property_name { start_token.value().characters_without_null_termination(), offset };
consume(TokenType::RightCurly, Error::InvalidPattern);
- StringView property_name { start_token.value().characters_without_null_termination(), offset };
- return Unicode::property_from_string(property_name);
+ if (auto property = Unicode::property_from_string(property_name); property.has_value())
+ return { *property };
+ if (auto general_category = Unicode::general_category_from_string(property_name); general_category.has_value())
+ return { *general_category };
+ return {};
}
bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named)
diff --git a/Userland/Libraries/LibRegex/RegexParser.h b/Userland/Libraries/LibRegex/RegexParser.h
index a48ad90c4b..4693bbe0cf 100644
--- a/Userland/Libraries/LibRegex/RegexParser.h
+++ b/Userland/Libraries/LibRegex/RegexParser.h
@@ -213,7 +213,9 @@ private:
StringView read_digits_as_string(ReadDigitsInitialZeroState initial_zero = ReadDigitsInitialZeroState::Allow, bool hex = false, int max_count = -1);
Optional<unsigned> read_digits(ReadDigitsInitialZeroState initial_zero = ReadDigitsInitialZeroState::Allow, bool hex = false, int max_count = -1);
StringView read_capture_group_specifier(bool take_starting_angle_bracket = false);
- Optional<Unicode::Property> read_unicode_property_escape();
+
+ using PropertyEscape = Variant<Unicode::Property, Unicode::GeneralCategory>;
+ Optional<PropertyEscape> read_unicode_property_escape();
bool parse_pattern(ByteCode&, size_t&, bool unicode, bool named);
bool parse_disjunction(ByteCode&, size_t&, bool unicode, bool named);
@@ -227,7 +229,7 @@ private:
bool parse_capture_group(ByteCode&, size_t&, bool unicode, bool named);
Optional<CharClass> parse_character_class_escape(bool& out_inverse, bool expect_backslash = false);
bool parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&, bool unicode);
- bool parse_unicode_property_escape(Unicode::Property& property, bool& negated);
+ bool parse_unicode_property_escape(PropertyEscape& property, bool& negated);
// Used only by B.1.4, Regular Expression Patterns (Extended for use in browsers)
bool parse_quantifiable_assertion(ByteCode&, size_t&, bool named);