summary | refs | log | tree | commit | diff
path: root/Userland/Libraries
diff options
context:
space:
mode:
author: Ali Mohammad Pur <ali.mpfard@gmail.com> 2022-01-25 13:30:27 +0330
committer: Ali Mohammad Pur <Ali.mpfard@gmail.com> 2022-01-26 00:53:09 +0330
commit: 5fac41f733ace1135412c00327d8cc8d21c342ff (patch)
tree: 921378a50b3200ceb75839dadecea9a70f8febe9 /Userland/Libraries
parent: 98183ef572298e5541b1cdabc2021c1cd24f3373 (diff)
download: serenity-5fac41f733ace1135412c00327d8cc8d21c342ff.zip
LibRegex: Implement ECMA262 multiline matching without splitting lines
As ECMA262 regexes allow `[^]` and literal newlines to match newlines in the input string, we shouldn't split the input string into lines, but rather simply make boundaries and catch-all patterns capable of checking for these conditions specifically.
Diffstat (limited to 'Userland/Libraries')
-rw-r--r-- Userland/Libraries/LibC/regex.h | 2
-rw-r--r-- Userland/Libraries/LibRegex/RegexByteCode.cpp | 41
-rw-r--r-- Userland/Libraries/LibRegex/RegexMatcher.cpp | 10
-rw-r--r-- Userland/Libraries/LibRegex/RegexOptions.h | 3
-rw-r--r-- Userland/Libraries/LibRegex/RegexParser.cpp | 3
-rw-r--r-- Userland/Libraries/LibRegex/RegexParser.h | 14
6 files changed, 53 insertions, 20 deletions
diff --git a/Userland/Libraries/LibC/regex.h b/Userland/Libraries/LibC/regex.h
index 3697286de6..a01ea1d3c4 100644
--- a/Userland/Libraries/LibC/regex.h
+++ b/Userland/Libraries/LibC/regex.h
@@ -83,6 +83,7 @@ enum __RegexAllFlags {
__Regex_SkipTrimEmptyMatches = __Regex_Global << 13, // Do not remove empty capture group results.
__Regex_Internal_Stateful = __Regex_Global << 14, // Internal flag; enables stateful matches.
__Regex_Internal_BrowserExtended = __Regex_Global << 15, // Internal flag; enable browser-specific ECMA262 extensions.
+ __Regex_Internal_ConsiderNewline = __Regex_Global << 16, // Internal flag; allow matchers to consider newlines as line separators.
__Regex_Last = __Regex_SkipTrimEmptyMatches
};
@@ -97,7 +98,6 @@ enum __RegexAllFlags {
#define REG_NOTBOL __Regex_MatchNotBeginOfLine // The circumflex character (^), when taken as a special character, will not match the beginning of string.
#define REG_NOTEOL __Regex_MatchNotEndOfLine // The dollar sign ($), when taken as a special character, will not match the end of string.
-//static_assert (sizeof(FlagsUnderlyingType) * 8 >= regex::POSIXFlags::Last << 1), "flags type too small")
#define REG_SEARCH __Regex_Last << 1
int regcomp(regex_t*, const char*, int);
diff --git a/Userland/Libraries/LibRegex/RegexByteCode.cpp b/Userland/Libraries/LibRegex/RegexByteCode.cpp
index e0e6cbe175..eb9b19dbc9 100644
--- a/Userland/Libraries/LibRegex/RegexByteCode.cpp
+++ b/Userland/Libraries/LibRegex/RegexByteCode.cpp
@@ -273,12 +273,23 @@ ALWAYS_INLINE ExecutionResult OpCode_ForkReplaceStay::execute(MatchInput const&
ALWAYS_INLINE ExecutionResult OpCode_CheckBegin::execute(MatchInput const& input, MatchState& state) const
{
- if (0 == state.string_position && (input.regex_options & AllFlags::MatchNotBeginOfLine))
+ auto is_at_line_boundary = [&] {
+ if (state.string_position == 0)
+ return true;
+
+ if (input.regex_options.has_flag_set(AllFlags::Multiline) && input.regex_options.has_flag_set(AllFlags::Internal_ConsiderNewline)) {
+ auto input_view = input.view.substring_view(state.string_position - 1, 1)[0];
+ return input_view == '\n';
+ }
+
+ return false;
+ }();
+ if (is_at_line_boundary && (input.regex_options & AllFlags::MatchNotBeginOfLine))
return ExecutionResult::Failed_ExecuteLowPrioForks;
- if ((0 == state.string_position && !(input.regex_options & AllFlags::MatchNotBeginOfLine))
- || (0 != state.string_position && (input.regex_options & AllFlags::MatchNotBeginOfLine))
- || (0 == state.string_position && (input.regex_options & AllFlags::Global)))
+ if ((is_at_line_boundary && !(input.regex_options & AllFlags::MatchNotBeginOfLine))
+ || (!is_at_line_boundary && (input.regex_options & AllFlags::MatchNotBeginOfLine))
+ || (is_at_line_boundary && (input.regex_options & AllFlags::Global)))
return ExecutionResult::Continue;
return ExecutionResult::Failed_ExecuteLowPrioForks;
@@ -315,11 +326,22 @@ ALWAYS_INLINE ExecutionResult OpCode_CheckBoundary::execute(MatchInput const& in
ALWAYS_INLINE ExecutionResult OpCode_CheckEnd::execute(MatchInput const& input, MatchState& state) const
{
- if (state.string_position == input.view.length() && (input.regex_options & AllFlags::MatchNotEndOfLine))
+ auto is_at_line_boundary = [&] {
+ if (state.string_position == input.view.length())
+ return true;
+
+ if (input.regex_options.has_flag_set(AllFlags::Multiline) && input.regex_options.has_flag_set(AllFlags::Internal_ConsiderNewline)) {
+ auto input_view = input.view.substring_view(state.string_position, 1)[0];
+ return input_view == '\n';
+ }
+
+ return false;
+ }();
+ if (is_at_line_boundary && (input.regex_options & AllFlags::MatchNotEndOfLine))
return ExecutionResult::Failed_ExecuteLowPrioForks;
- if ((state.string_position == input.view.length() && !(input.regex_options & AllFlags::MatchNotEndOfLine))
- || (state.string_position != input.view.length() && (input.regex_options & AllFlags::MatchNotEndOfLine || input.regex_options & AllFlags::MatchNotBeginOfLine)))
+ if ((is_at_line_boundary && !(input.regex_options & AllFlags::MatchNotEndOfLine))
+ || (!is_at_line_boundary && (input.regex_options & AllFlags::MatchNotEndOfLine || input.regex_options & AllFlags::MatchNotBeginOfLine)))
return ExecutionResult::Continue;
return ExecutionResult::Failed_ExecuteLowPrioForks;
@@ -461,8 +483,9 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
if (input.view.length() <= state.string_position)
return ExecutionResult::Failed_ExecuteLowPrioForks;
- VERIFY(!current_inversion_state());
- advance_string_position(state, input.view);
+ auto input_view = input.view.substring_view(state.string_position, 1)[0];
+ if (input_view != '\n' || (input.regex_options.has_flag_set(AllFlags::SingleLine) && input.regex_options.has_flag_set(AllFlags::Internal_ConsiderNewline)))
+ advance_string_position(state, input.view, input_view);
} else if (compare_type == CharacterCompareType::String) {
VERIFY(!current_inversion_state());
diff --git a/Userland/Libraries/LibRegex/RegexMatcher.cpp b/Userland/Libraries/LibRegex/RegexMatcher.cpp
index 16fa1b4ec1..ddd8522f69 100644
--- a/Userland/Libraries/LibRegex/RegexMatcher.cpp
+++ b/Userland/Libraries/LibRegex/RegexMatcher.cpp
@@ -41,7 +41,7 @@ Regex<Parser>::Regex(String pattern, typename ParserTraits<Parser>::OptionsType
run_optimization_passes();
if (parser_result.error == regex::Error::NoError)
- matcher = make<Matcher<Parser>>(this, regex_options);
+ matcher = make<Matcher<Parser>>(this, static_cast<decltype(regex_options.value())>(parser_result.options.value()));
}
template<class Parser>
@@ -51,7 +51,7 @@ Regex<Parser>::Regex(regex::Parser::Result parse_result, String pattern, typenam
{
run_optimization_passes();
if (parser_result.error == regex::Error::NoError)
- matcher = make<Matcher<Parser>>(this, regex_options);
+ matcher = make<Matcher<Parser>>(this, regex_options | static_cast<decltype(regex_options.value())>(parse_result.options.value()));
}
template<class Parser>
@@ -104,8 +104,10 @@ RegexResult Matcher<Parser>::match(RegexStringView view, Optional<typename Parse
{
AllOptions options = m_regex_options | regex_options.value_or({}).value();
- if (options.has_flag_set(AllFlags::Multiline))
- return match(view.lines(), regex_options); // FIXME: how do we know, which line ending a line has (1char or 2char)? This is needed to get the correct match offsets from start of string...
+ if constexpr (!IsSame<Parser, ECMA262>) {
+ if (options.has_flag_set(AllFlags::Multiline))
+ return match(view.lines(), regex_options); // FIXME: how do we know, which line ending a line has (1char or 2char)? This is needed to get the correct match offsets from start of string...
+ }
Vector<RegexStringView> views;
views.append(view);
diff --git a/Userland/Libraries/LibRegex/RegexOptions.h b/Userland/Libraries/LibRegex/RegexOptions.h
index 24dba533ac..c9a3533c5b 100644
--- a/Userland/Libraries/LibRegex/RegexOptions.h
+++ b/Userland/Libraries/LibRegex/RegexOptions.h
@@ -16,7 +16,7 @@
namespace regex {
-using FlagsUnderlyingType = u16;
+using FlagsUnderlyingType = u32;
enum class AllFlags {
Global = __Regex_Global, // All matches (don't return after first match)
@@ -35,6 +35,7 @@ enum class AllFlags {
SkipTrimEmptyMatches = __Regex_SkipTrimEmptyMatches, // Do not remove empty capture group results.
Internal_Stateful = __Regex_Internal_Stateful, // Make global matches match one result at a time, and further match() calls on the same instance continue where the previous one left off.
Internal_BrowserExtended = __Regex_Internal_BrowserExtended, // Only for ECMA262, Enable the behaviors defined in section B.1.4. of the ECMA262 spec.
+ Internal_ConsiderNewline = __Regex_Internal_ConsiderNewline, // Only for ECMA262, Allow multiline matches to consider newlines as line boundaries.
Last = Internal_BrowserExtended,
};
diff --git a/Userland/Libraries/LibRegex/RegexParser.cpp b/Userland/Libraries/LibRegex/RegexParser.cpp
index 61b5ff1843..564f57e612 100644
--- a/Userland/Libraries/LibRegex/RegexParser.cpp
+++ b/Userland/Libraries/LibRegex/RegexParser.cpp
@@ -194,7 +194,8 @@ Parser::Result Parser::parse(Optional<AllOptions> regex_options)
move(m_parser_state.match_length_minimum),
move(m_parser_state.error),
move(m_parser_state.error_token),
- m_parser_state.named_capture_groups.keys()
+ m_parser_state.named_capture_groups.keys(),
+ m_parser_state.regex_options,
};
}
diff --git a/Userland/Libraries/LibRegex/RegexParser.h b/Userland/Libraries/LibRegex/RegexParser.h
index b7b3691835..3610d3c72a 100644
--- a/Userland/Libraries/LibRegex/RegexParser.h
+++ b/Userland/Libraries/LibRegex/RegexParser.h
@@ -54,6 +54,7 @@ public:
Error error;
Token error_token;
Vector<FlyString> capture_groups;
+ AllOptions options;
};
explicit Parser(Lexer& lexer)
@@ -71,6 +72,7 @@ public:
Result parse(Optional<AllOptions> regex_options = {});
bool has_error() const { return m_parser_state.error != Error::NoError; }
Error error() const { return m_parser_state.error; }
+ AllOptions options() const { return m_parser_state.regex_options; }
protected:
virtual bool parse_internal(ByteCode&, size_t& match_length_minimum) = 0;
@@ -170,14 +172,16 @@ private:
};
class PosixExtendedParser final : public AbstractPosixParser {
+ constexpr static auto default_options = static_cast<PosixFlags>(AllFlags::SingleLine) | static_cast<PosixFlags>(AllFlags::Internal_ConsiderNewline);
+
public:
explicit PosixExtendedParser(Lexer& lexer)
- : AbstractPosixParser(lexer)
+ : AbstractPosixParser(lexer, default_options)
{
}
PosixExtendedParser(Lexer& lexer, Optional<typename ParserTraits<PosixExtendedParser>::OptionsType> regex_options)
- : AbstractPosixParser(lexer, regex_options.value_or({}))
+ : AbstractPosixParser(lexer, regex_options.value_or({}) | default_options.value())
{
}
@@ -195,15 +199,17 @@ private:
};
class ECMA262Parser final : public Parser {
+ constexpr static ECMAScriptOptions default_options = static_cast<ECMAScriptFlags>(AllFlags::Internal_ConsiderNewline);
+
public:
explicit ECMA262Parser(Lexer& lexer)
- : Parser(lexer)
+ : Parser(lexer, default_options)
{
m_capture_groups_in_scope.empend();
}
ECMA262Parser(Lexer& lexer, Optional<typename ParserTraits<ECMA262Parser>::OptionsType> regex_options)
- : Parser(lexer, regex_options.value_or({}))
+ : Parser(lexer, regex_options.value_or({}) | default_options.value())
{
m_should_use_browser_extended_grammar = regex_options.has_value() && regex_options->has_flag_set(ECMAScriptFlags::BrowserExtended);
m_capture_groups_in_scope.empend();