diff options
-rw-r--r-- | Tests/LibRegex/Regex.cpp | 2 | ||||
-rw-r--r-- | Userland/Libraries/LibC/regex.h | 2 | ||||
-rw-r--r-- | Userland/Libraries/LibRegex/RegexByteCode.cpp | 41 | ||||
-rw-r--r-- | Userland/Libraries/LibRegex/RegexMatcher.cpp | 10 | ||||
-rw-r--r-- | Userland/Libraries/LibRegex/RegexOptions.h | 3 | ||||
-rw-r--r-- | Userland/Libraries/LibRegex/RegexParser.cpp | 3 | ||||
-rw-r--r-- | Userland/Libraries/LibRegex/RegexParser.h | 14 |
7 files changed, 55 insertions, 20 deletions
diff --git a/Tests/LibRegex/Regex.cpp b/Tests/LibRegex/Regex.cpp index c9bb6e1622..f8cc70dcd3 100644 --- a/Tests/LibRegex/Regex.cpp +++ b/Tests/LibRegex/Regex.cpp @@ -684,6 +684,8 @@ TEST_CASE(ECMA262_match) { "(\0|a)"sv, "a"sv, true }, // #9686, Should allow null bytes in pattern { "(.*?)a(?!(a+)b\\2c)\\2(.*)"sv, "baaabaac"sv, true }, // #6042, Groups inside lookarounds may be referenced outside, but their contents appear empty if the pattern in the lookaround fails. { "a|$"sv, "x"sv, true, (ECMAScriptFlags)regex::AllFlags::Global }, // #11940, Global (not the 'g' flag) regexps should attempt to match the zero-length end of the string too. + { "foo\nbar"sv, "foo\nbar"sv, true }, // #12126, ECMA262 regexp should match literal newlines without the 's' flag. + { "foo[^]bar"sv, "foo\nbar"sv, true }, // #12126, ECMA262 regexp should match newline with [^]. }; // clang-format on diff --git a/Userland/Libraries/LibC/regex.h b/Userland/Libraries/LibC/regex.h index 3697286de6..a01ea1d3c4 100644 --- a/Userland/Libraries/LibC/regex.h +++ b/Userland/Libraries/LibC/regex.h @@ -83,6 +83,7 @@ enum __RegexAllFlags { __Regex_SkipTrimEmptyMatches = __Regex_Global << 13, // Do not remove empty capture group results. __Regex_Internal_Stateful = __Regex_Global << 14, // Internal flag; enables stateful matches. __Regex_Internal_BrowserExtended = __Regex_Global << 15, // Internal flag; enable browser-specific ECMA262 extensions. + __Regex_Internal_ConsiderNewline = __Regex_Global << 16, // Internal flag; allow matchers to consider newlines as line separators. __Regex_Last = __Regex_SkipTrimEmptyMatches }; @@ -97,7 +98,6 @@ enum __RegexAllFlags { #define REG_NOTBOL __Regex_MatchNotBeginOfLine // The circumflex character (^), when taken as a special character, will not match the beginning of string. #define REG_NOTEOL __Regex_MatchNotEndOfLine // The dollar sign ($), when taken as a special character, will not match the end of string. -//static_assert (sizeof(FlagsUnderlyingType) * 8 >= regex::POSIXFlags::Last << 1), "flags type too small") #define REG_SEARCH __Regex_Last << 1 int regcomp(regex_t*, const char*, int); diff --git a/Userland/Libraries/LibRegex/RegexByteCode.cpp b/Userland/Libraries/LibRegex/RegexByteCode.cpp index e0e6cbe175..eb9b19dbc9 100644 --- a/Userland/Libraries/LibRegex/RegexByteCode.cpp +++ b/Userland/Libraries/LibRegex/RegexByteCode.cpp @@ -273,12 +273,23 @@ ALWAYS_INLINE ExecutionResult OpCode_ForkReplaceStay::execute(MatchInput const& ALWAYS_INLINE ExecutionResult OpCode_CheckBegin::execute(MatchInput const& input, MatchState& state) const { - if (0 == state.string_position && (input.regex_options & AllFlags::MatchNotBeginOfLine)) + auto is_at_line_boundary = [&] { + if (state.string_position == 0) + return true; + + if (input.regex_options.has_flag_set(AllFlags::Multiline) && input.regex_options.has_flag_set(AllFlags::Internal_ConsiderNewline)) { + auto input_view = input.view.substring_view(state.string_position - 1, 1)[0]; + return input_view == '\n'; + } + + return false; + }(); + if (is_at_line_boundary && (input.regex_options & AllFlags::MatchNotBeginOfLine)) return ExecutionResult::Failed_ExecuteLowPrioForks; - if ((0 == state.string_position && !(input.regex_options & AllFlags::MatchNotBeginOfLine)) - || (0 != state.string_position && (input.regex_options & AllFlags::MatchNotBeginOfLine)) - || (0 == state.string_position && (input.regex_options & AllFlags::Global))) + if ((is_at_line_boundary && !(input.regex_options & AllFlags::MatchNotBeginOfLine)) + || (!is_at_line_boundary && (input.regex_options & AllFlags::MatchNotBeginOfLine)) + || (is_at_line_boundary && (input.regex_options & AllFlags::Global))) return ExecutionResult::Continue; return ExecutionResult::Failed_ExecuteLowPrioForks; @@ -315,11 +326,22 @@ ALWAYS_INLINE ExecutionResult OpCode_CheckBoundary::execute(MatchInput const& in ALWAYS_INLINE ExecutionResult OpCode_CheckEnd::execute(MatchInput const& input, MatchState& state) const { - if (state.string_position == input.view.length() && (input.regex_options & AllFlags::MatchNotEndOfLine)) + auto is_at_line_boundary = [&] { + if (state.string_position == input.view.length()) + return true; + + if (input.regex_options.has_flag_set(AllFlags::Multiline) && input.regex_options.has_flag_set(AllFlags::Internal_ConsiderNewline)) { + auto input_view = input.view.substring_view(state.string_position, 1)[0]; + return input_view == '\n'; + } + + return false; + }(); + if (is_at_line_boundary && (input.regex_options & AllFlags::MatchNotEndOfLine)) return ExecutionResult::Failed_ExecuteLowPrioForks; - if ((state.string_position == input.view.length() && !(input.regex_options & AllFlags::MatchNotEndOfLine)) - || (state.string_position != input.view.length() && (input.regex_options & AllFlags::MatchNotEndOfLine || input.regex_options & AllFlags::MatchNotBeginOfLine))) + if ((is_at_line_boundary && !(input.regex_options & AllFlags::MatchNotEndOfLine)) + || (!is_at_line_boundary && (input.regex_options & AllFlags::MatchNotEndOfLine || input.regex_options & AllFlags::MatchNotBeginOfLine))) return ExecutionResult::Continue; return ExecutionResult::Failed_ExecuteLowPrioForks; @@ -461,8 +483,9 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M if (input.view.length() <= state.string_position) return ExecutionResult::Failed_ExecuteLowPrioForks; - VERIFY(!current_inversion_state()); - advance_string_position(state, input.view); + auto input_view = input.view.substring_view(state.string_position, 1)[0]; + if (input_view != '\n' || (input.regex_options.has_flag_set(AllFlags::SingleLine) && input.regex_options.has_flag_set(AllFlags::Internal_ConsiderNewline))) + advance_string_position(state, input.view, input_view); } else if (compare_type == CharacterCompareType::String) { VERIFY(!current_inversion_state()); diff --git a/Userland/Libraries/LibRegex/RegexMatcher.cpp b/Userland/Libraries/LibRegex/RegexMatcher.cpp index 16fa1b4ec1..ddd8522f69 100644 --- a/Userland/Libraries/LibRegex/RegexMatcher.cpp +++ b/Userland/Libraries/LibRegex/RegexMatcher.cpp @@ -41,7 +41,7 @@ Regex<Parser>::Regex(String pattern, typename ParserTraits<Parser>::OptionsType run_optimization_passes(); if (parser_result.error == regex::Error::NoError) - matcher = make<Matcher<Parser>>(this, regex_options); + matcher = make<Matcher<Parser>>(this, static_cast<decltype(regex_options.value())>(parser_result.options.value())); } template<class Parser> @@ -51,7 +51,7 @@ Regex<Parser>::Regex(regex::Parser::Result parse_result, String pattern, typenam { run_optimization_passes(); if (parser_result.error == regex::Error::NoError) - matcher = make<Matcher<Parser>>(this, regex_options); + matcher = make<Matcher<Parser>>(this, regex_options | static_cast<decltype(regex_options.value())>(parse_result.options.value())); } template<class Parser> @@ -104,8 +104,10 @@ RegexResult Matcher<Parser>::match(RegexStringView view, Optional<typename Parse { AllOptions options = m_regex_options | regex_options.value_or({}).value(); - if (options.has_flag_set(AllFlags::Multiline)) - return match(view.lines(), regex_options); // FIXME: how do we know, which line ending a line has (1char or 2char)? This is needed to get the correct match offsets from start of string... + if constexpr (!IsSame<Parser, ECMA262>) { + if (options.has_flag_set(AllFlags::Multiline)) + return match(view.lines(), regex_options); // FIXME: how do we know, which line ending a line has (1char or 2char)? This is needed to get the correct match offsets from start of string... + } Vector<RegexStringView> views; views.append(view); diff --git a/Userland/Libraries/LibRegex/RegexOptions.h b/Userland/Libraries/LibRegex/RegexOptions.h index 24dba533ac..c9a3533c5b 100644 --- a/Userland/Libraries/LibRegex/RegexOptions.h +++ b/Userland/Libraries/LibRegex/RegexOptions.h @@ -16,7 +16,7 @@ namespace regex { -using FlagsUnderlyingType = u16; +using FlagsUnderlyingType = u32; enum class AllFlags { Global = __Regex_Global, // All matches (don't return after first match) @@ -35,6 +35,7 @@ enum class AllFlags { SkipTrimEmptyMatches = __Regex_SkipTrimEmptyMatches, // Do not remove empty capture group results. Internal_Stateful = __Regex_Internal_Stateful, // Make global matches match one result at a time, and further match() calls on the same instance continue where the previous one left off. Internal_BrowserExtended = __Regex_Internal_BrowserExtended, // Only for ECMA262, Enable the behaviors defined in section B.1.4. of the ECMA262 spec. + Internal_ConsiderNewline = __Regex_Internal_ConsiderNewline, // Only for ECMA262, Allow multiline matches to consider newlines as line boundaries. Last = Internal_BrowserExtended, }; diff --git a/Userland/Libraries/LibRegex/RegexParser.cpp b/Userland/Libraries/LibRegex/RegexParser.cpp index 61b5ff1843..564f57e612 100644 --- a/Userland/Libraries/LibRegex/RegexParser.cpp +++ b/Userland/Libraries/LibRegex/RegexParser.cpp @@ -194,7 +194,8 @@ Parser::Result Parser::parse(Optional<AllOptions> regex_options) move(m_parser_state.match_length_minimum), move(m_parser_state.error), move(m_parser_state.error_token), - m_parser_state.named_capture_groups.keys() + m_parser_state.named_capture_groups.keys(), + m_parser_state.regex_options, }; } diff --git a/Userland/Libraries/LibRegex/RegexParser.h b/Userland/Libraries/LibRegex/RegexParser.h index b7b3691835..3610d3c72a 100644 --- a/Userland/Libraries/LibRegex/RegexParser.h +++ b/Userland/Libraries/LibRegex/RegexParser.h @@ -54,6 +54,7 @@ public: Error error; Token error_token; Vector<FlyString> capture_groups; + AllOptions options; }; explicit Parser(Lexer& lexer) @@ -71,6 +72,7 @@ public: Result parse(Optional<AllOptions> regex_options = {}); bool has_error() const { return m_parser_state.error != Error::NoError; } Error error() const { return m_parser_state.error; } + AllOptions options() const { return m_parser_state.regex_options; } protected: virtual bool parse_internal(ByteCode&, size_t& match_length_minimum) = 0; @@ -170,14 +172,16 @@ private: }; class PosixExtendedParser final : public AbstractPosixParser { + constexpr static auto default_options = static_cast<PosixFlags>(AllFlags::SingleLine) | static_cast<PosixFlags>(AllFlags::Internal_ConsiderNewline); + public: explicit PosixExtendedParser(Lexer& lexer) - : AbstractPosixParser(lexer) + : AbstractPosixParser(lexer, default_options) { } PosixExtendedParser(Lexer& lexer, Optional<typename ParserTraits<PosixExtendedParser>::OptionsType> regex_options) - : AbstractPosixParser(lexer, regex_options.value_or({})) + : AbstractPosixParser(lexer, regex_options.value_or({}) | default_options.value()) { } @@ -195,15 +199,17 @@ private: }; class ECMA262Parser final : public Parser { + constexpr static ECMAScriptOptions default_options = static_cast<ECMAScriptFlags>(AllFlags::Internal_ConsiderNewline); + public: explicit ECMA262Parser(Lexer& lexer) - : Parser(lexer) + : Parser(lexer, default_options) { m_capture_groups_in_scope.empend(); } ECMA262Parser(Lexer& lexer, Optional<typename ParserTraits<ECMA262Parser>::OptionsType> regex_options) - : Parser(lexer, regex_options.value_or({})) + : Parser(lexer, regex_options.value_or({}) | default_options.value()) { m_should_use_browser_extended_grammar = regex_options.has_value() && regex_options->has_flag_set(ECMAScriptFlags::BrowserExtended); m_capture_groups_in_scope.empend(); |