summary | refs | log | tree | commit | diff
path: root/Userland/Libraries
diff options
context:
space:
mode:
author: Timothy Flynn <trflynn89@pm.me> 2021-07-29 10:34:37 -0400
committer: Linus Groh <mail@linusgroh.de> 2021-07-30 21:26:31 +0100
commit: f1dd770a8a710c086e357e6fb89a28e27795f998 (patch)
tree: e76e0882cb68ec6fe53c77cf5a250488eca52de7 /Userland/Libraries
parent: 1400e3cf58c14594f06d98db25e10f908480ce67 (diff)
download: serenity-f1dd770a8a710c086e357e6fb89a28e27795f998.zip
LibJS: Parse RegExp literals at AST creation time, not execution time
The spec requires that invalid RegExp literals must cause a Syntax Error
before the JavaScript is executed. See:
https://tc39.es/ecma262/#sec-patterns-static-semantics-early-errors

This is explicitly tested in the RegExp/property-escapes test262 tests.
For example, see unsupported-property-Line_Break.js:

    $DONOTEVALUATE();

    /\p{Line_Break}/u;

That RegExp literal is invalid because Line_Break is not a supported
Unicode property. $DONOTEVALUATE() just throws an exception when it is
executed. The test expects that this file will fail to be parsed.

Note that RegExp patterns can still be parsed at execution time by way
of "new RegExp(...)".
Diffstat (limited to 'Userland/Libraries')
-rw-r--r-- Userland/Libraries/LibJS/AST.cpp | 4
-rw-r--r-- Userland/Libraries/LibJS/AST.h | 12
-rw-r--r-- Userland/Libraries/LibJS/Parser.cpp | 30
-rw-r--r-- Userland/Libraries/LibJS/Runtime/RegExpObject.cpp | 134
-rw-r--r-- Userland/Libraries/LibJS/Runtime/RegExpObject.h | 24
-rw-r--r-- Userland/Libraries/LibJS/Tests/builtins/RegExp/RegExp.js | 8
6 files changed, 121 insertions, 91 deletions
diff --git a/Userland/Libraries/LibJS/AST.cpp b/Userland/Libraries/LibJS/AST.cpp
index 5867f5b56d..32f7f21f5f 100644
--- a/Userland/Libraries/LibJS/AST.cpp
+++ b/Userland/Libraries/LibJS/AST.cpp
@@ -2020,7 +2020,9 @@ void RegExpLiteral::dump(int indent) const
Value RegExpLiteral::execute(Interpreter& interpreter, GlobalObject& global_object) const
{
InterpreterNodeScope node_scope { interpreter, *this };
- return regexp_create(global_object, js_string(interpreter.heap(), pattern()), js_string(interpreter.heap(), flags()));
+
+ Regex<ECMA262> regex(parsed_regex(), parsed_pattern(), parsed_flags());
+ return RegExpObject::create(global_object, move(regex), pattern(), flags());
}
void ArrayExpression::dump(int indent) const
diff --git a/Userland/Libraries/LibJS/AST.h b/Userland/Libraries/LibJS/AST.h
index 7235cba96d..1ae404b175 100644
--- a/Userland/Libraries/LibJS/AST.h
+++ b/Userland/Libraries/LibJS/AST.h
@@ -19,6 +19,7 @@
#include <LibJS/Runtime/PropertyName.h>
#include <LibJS/Runtime/Value.h>
#include <LibJS/SourceRange.h>
+#include <LibRegex/Regex.h>
namespace JS {
@@ -758,8 +759,11 @@ public:
class RegExpLiteral final : public Literal {
public:
- explicit RegExpLiteral(SourceRange source_range, String pattern, String flags)
+ RegExpLiteral(SourceRange source_range, regex::Parser::Result parsed_regex, String parsed_pattern, regex::RegexOptions<ECMAScriptFlags> parsed_flags, String pattern, String flags)
: Literal(source_range)
+ , m_parsed_regex(move(parsed_regex))
+ , m_parsed_pattern(move(parsed_pattern))
+ , m_parsed_flags(move(parsed_flags))
, m_pattern(move(pattern))
, m_flags(move(flags))
{
@@ -769,10 +773,16 @@ public:
virtual void dump(int indent) const override;
virtual void generate_bytecode(Bytecode::Generator&) const override;
+ regex::Parser::Result const& parsed_regex() const { return m_parsed_regex; }
+ String const& parsed_pattern() const { return m_parsed_pattern; }
+ regex::RegexOptions<ECMAScriptFlags> const& parsed_flags() const { return m_parsed_flags; }
String const& pattern() const { return m_pattern; }
String const& flags() const { return m_flags; }
private:
+ regex::Parser::Result m_parsed_regex;
+ String m_parsed_pattern;
+ regex::RegexOptions<ECMAScriptFlags> m_parsed_flags;
String m_pattern;
String m_flags;
};
diff --git a/Userland/Libraries/LibJS/Parser.cpp b/Userland/Libraries/LibJS/Parser.cpp
index 7018375c5e..b5d9a67c56 100644
--- a/Userland/Libraries/LibJS/Parser.cpp
+++ b/Userland/Libraries/LibJS/Parser.cpp
@@ -12,6 +12,8 @@
#include <AK/ScopeGuard.h>
#include <AK/StdLibExtras.h>
#include <AK/TemporaryChange.h>
+#include <LibJS/Runtime/RegExpObject.h>
+#include <LibRegex/Regex.h>
namespace JS {
@@ -848,21 +850,29 @@ NonnullRefPtr<RegExpLiteral> Parser::parse_regexp_literal()
auto pattern = consume().value();
// Remove leading and trailing slash.
pattern = pattern.substring_view(1, pattern.length() - 2);
+
auto flags = String::empty();
+ auto parsed_flags = RegExpObject::default_flags;
+
if (match(TokenType::RegexFlags)) {
auto flags_start = position();
flags = consume().value();
- HashTable<char> seen_flags;
- for (size_t i = 0; i < flags.length(); ++i) {
- auto flag = flags.substring_view(i, 1);
- if (!flag.is_one_of("d", "g", "i", "m", "s", "u", "y"))
- syntax_error(String::formatted("Invalid RegExp flag '{}'", flag), Position { flags_start.line, flags_start.column + i });
- if (seen_flags.contains(*flag.characters_without_null_termination()))
- syntax_error(String::formatted("Repeated RegExp flag '{}'", flag), Position { flags_start.line, flags_start.column + i });
- seen_flags.set(*flag.characters_without_null_termination());
- }
+
+ auto parsed_flags_or_error = regex_flags_from_string(flags);
+ if (parsed_flags_or_error.is_error())
+ syntax_error(parsed_flags_or_error.release_error(), flags_start);
+ else
+ parsed_flags = parsed_flags_or_error.release_value();
}
- return create_ast_node<RegExpLiteral>({ m_state.current_token.filename(), rule_start.position(), position() }, pattern, flags);
+
+ auto parsed_pattern = parse_regex_pattern(pattern, parsed_flags.has_flag_set(ECMAScriptFlags::Unicode));
+ auto parsed_regex = Regex<ECMA262>::parse_pattern(parsed_pattern, parsed_flags);
+
+ if (parsed_regex.error != regex::Error::NoError)
+ syntax_error(String::formatted("RegExp compile error: {}", Regex<ECMA262>(parsed_regex, parsed_pattern, parsed_flags).error_string()), rule_start.position());
+
+ SourceRange range { m_state.current_token.filename(), rule_start.position(), position() };
+ return create_ast_node<RegExpLiteral>(move(range), move(parsed_regex), move(parsed_pattern), move(parsed_flags), pattern.to_string(), move(flags));
}
NonnullRefPtr<Expression> Parser::parse_unary_prefixed_expression()
diff --git a/Userland/Libraries/LibJS/Runtime/RegExpObject.cpp b/Userland/Libraries/LibJS/Runtime/RegExpObject.cpp
index 8955c82b3a..9e2832667a 100644
--- a/Userland/Libraries/LibJS/Runtime/RegExpObject.cpp
+++ b/Userland/Libraries/LibJS/Runtime/RegExpObject.cpp
@@ -14,97 +14,108 @@
namespace JS {
-static Flags options_from(GlobalObject& global_object, const String& flags)
+Result<regex::RegexOptions<ECMAScriptFlags>, String> regex_flags_from_string(StringView flags)
{
- auto& vm = global_object.vm();
bool d = false, g = false, i = false, m = false, s = false, u = false, y = false;
- Flags options {
- // JS regexps are all 'global' by default as per our definition, but the "global" flag enables "stateful".
- // FIXME: Enable 'BrowserExtended' only if in a browser context.
- .effective_flags = { (regex::ECMAScriptFlags)regex::AllFlags::Global | (regex::ECMAScriptFlags)regex::AllFlags::SkipTrimEmptyMatches | regex::ECMAScriptFlags::BrowserExtended },
- .declared_flags = {},
- };
+ auto options = RegExpObject::default_flags;
for (auto ch : flags) {
switch (ch) {
case 'd':
if (d)
- vm.throw_exception<SyntaxError>(global_object, ErrorType::RegExpObjectRepeatedFlag, ch);
+ return String::formatted(ErrorType::RegExpObjectRepeatedFlag.message(), ch);
d = true;
break;
case 'g':
if (g)
- vm.throw_exception<SyntaxError>(global_object, ErrorType::RegExpObjectRepeatedFlag, ch);
+ return String::formatted(ErrorType::RegExpObjectRepeatedFlag.message(), ch);
g = true;
- options.effective_flags |= regex::ECMAScriptFlags::Global;
- options.declared_flags |= regex::ECMAScriptFlags::Global;
+ options |= regex::ECMAScriptFlags::Global;
break;
case 'i':
if (i)
- vm.throw_exception<SyntaxError>(global_object, ErrorType::RegExpObjectRepeatedFlag, ch);
+ return String::formatted(ErrorType::RegExpObjectRepeatedFlag.message(), ch);
i = true;
- options.effective_flags |= regex::ECMAScriptFlags::Insensitive;
- options.declared_flags |= regex::ECMAScriptFlags::Insensitive;
+ options |= regex::ECMAScriptFlags::Insensitive;
break;
case 'm':
if (m)
- vm.throw_exception<SyntaxError>(global_object, ErrorType::RegExpObjectRepeatedFlag, ch);
+ return String::formatted(ErrorType::RegExpObjectRepeatedFlag.message(), ch);
m = true;
- options.effective_flags |= regex::ECMAScriptFlags::Multiline;
- options.declared_flags |= regex::ECMAScriptFlags::Multiline;
+ options |= regex::ECMAScriptFlags::Multiline;
break;
case 's':
if (s)
- vm.throw_exception<SyntaxError>(global_object, ErrorType::RegExpObjectRepeatedFlag, ch);
+ return String::formatted(ErrorType::RegExpObjectRepeatedFlag.message(), ch);
s = true;
- options.effective_flags |= regex::ECMAScriptFlags::SingleLine;
- options.declared_flags |= regex::ECMAScriptFlags::SingleLine;
+ options |= regex::ECMAScriptFlags::SingleLine;
break;
case 'u':
if (u)
- vm.throw_exception<SyntaxError>(global_object, ErrorType::RegExpObjectRepeatedFlag, ch);
+ return String::formatted(ErrorType::RegExpObjectRepeatedFlag.message(), ch);
u = true;
- options.effective_flags |= regex::ECMAScriptFlags::Unicode;
- options.declared_flags |= regex::ECMAScriptFlags::Unicode;
+ options |= regex::ECMAScriptFlags::Unicode;
break;
case 'y':
if (y)
- vm.throw_exception<SyntaxError>(global_object, ErrorType::RegExpObjectRepeatedFlag, ch);
+ return String::formatted(ErrorType::RegExpObjectRepeatedFlag.message(), ch);
y = true;
// Now for the more interesting flag, 'sticky' actually unsets 'global', part of which is the default.
- options.effective_flags.reset_flag(regex::ECMAScriptFlags::Global);
+ options.reset_flag(regex::ECMAScriptFlags::Global);
// "What's the difference between sticky and global, then", that's simple.
// all the other flags imply 'global', and the "global" flag implies 'stateful';
// however, the "sticky" flag does *not* imply 'global', only 'stateful'.
- options.effective_flags |= (regex::ECMAScriptFlags)regex::AllFlags::Internal_Stateful;
- options.effective_flags |= regex::ECMAScriptFlags::Sticky;
- options.declared_flags |= regex::ECMAScriptFlags::Sticky;
+ options |= (regex::ECMAScriptFlags)regex::AllFlags::Internal_Stateful;
+ options |= regex::ECMAScriptFlags::Sticky;
break;
default:
- vm.throw_exception<SyntaxError>(global_object, ErrorType::RegExpObjectBadFlag, ch);
- return options;
+ return String::formatted(ErrorType::RegExpObjectBadFlag.message(), ch);
}
}
return options;
}
-RegExpObject* RegExpObject::create(GlobalObject& global_object, String original_pattern, String parsed_pattern, String flags)
+String parse_regex_pattern(StringView pattern, bool unicode)
+{
+ auto utf16_pattern = AK::utf8_to_utf16(pattern);
+ Utf16View utf16_pattern_view { utf16_pattern };
+ StringBuilder builder;
+
+ // If the Unicode flag is set, append each code point to the pattern. Otherwise, append each
+ // code unit. But unlike the spec, multi-byte code units must be escaped for LibRegex to parse.
+ for (size_t i = 0; i < utf16_pattern_view.length_in_code_units();) {
+ if (unicode) {
+ auto code_point = code_point_at(utf16_pattern_view, i);
+ builder.append_code_point(code_point.code_point);
+ i += code_point.code_unit_count;
+ continue;
+ }
+
+ u16 code_unit = utf16_pattern_view.code_unit_at(i);
+ ++i;
+
+ if (code_unit > 0x7f)
+ builder.appendff("\\u{:04x}", code_unit);
+ else
+ builder.append_code_point(code_unit);
+ }
+
+ return builder.build();
+}
+
+RegExpObject* RegExpObject::create(GlobalObject& global_object, Regex<ECMA262> regex, String pattern, String flags)
{
- return global_object.heap().allocate<RegExpObject>(global_object, move(original_pattern), move(parsed_pattern), move(flags), *global_object.regexp_prototype());
+ return global_object.heap().allocate<RegExpObject>(global_object, move(regex), move(pattern), move(flags), *global_object.regexp_prototype());
}
-RegExpObject::RegExpObject(String original_pattern, String parsed_pattern, String flags, Object& prototype)
+RegExpObject::RegExpObject(Regex<ECMA262> regex, String pattern, String flags, Object& prototype)
: Object(prototype)
- , m_original_pattern(move(original_pattern))
- , m_parsed_pattern(move(parsed_pattern))
+ , m_pattern(move(pattern))
, m_flags(move(flags))
- , m_active_flags(options_from(global_object(), m_flags))
- , m_regex(m_parsed_pattern, m_active_flags.effective_flags)
+ , m_regex(move(regex))
{
- if (m_regex.parser_result.error != regex::Error::NoError) {
- vm().throw_exception<SyntaxError>(global_object(), ErrorType::RegExpCompileError, m_regex.error_string());
- }
+ VERIFY(m_regex.parser_result.error == regex::Error::NoError);
}
RegExpObject::~RegExpObject()
@@ -115,7 +126,7 @@ void RegExpObject::initialize(GlobalObject& global_object)
{
auto& vm = this->vm();
Object::initialize(global_object);
- define_direct_property(vm.names.lastIndex, {}, Attribute::Writable);
+ define_direct_property(vm.names.lastIndex, Value(0), Attribute::Writable);
}
// 22.2.3.2.4 RegExpCreate ( P, F ), https://tc39.es/ecma262/#sec-regexpcreate
@@ -139,38 +150,27 @@ RegExpObject* regexp_create(GlobalObject& global_object, Value pattern, Value fl
original_pattern = String::empty();
parsed_pattern = String::empty();
} else {
- auto utf16_pattern = pattern.to_utf16_string(global_object);
+ original_pattern = pattern.to_string(global_object);
if (vm.exception())
return {};
- Utf16View utf16_pattern_view { utf16_pattern };
bool unicode = f.find('u').has_value();
- StringBuilder builder;
-
- // If the Unicode flag is set, append each code point to the pattern. Otherwise, append each
- // code unit. But unlike the spec, multi-byte code units must be escaped for LibRegex to parse.
- for (size_t i = 0; i < utf16_pattern_view.length_in_code_units();) {
- if (unicode) {
- auto code_point = code_point_at(utf16_pattern_view, i);
- builder.append_code_point(code_point.code_point);
- i += code_point.code_unit_count;
- continue;
- }
-
- u16 code_unit = utf16_pattern_view.code_unit_at(i);
- ++i;
-
- if (code_unit > 0x7f)
- builder.appendff("\\u{:04x}", code_unit);
- else
- builder.append_code_point(code_unit);
- }
+ parsed_pattern = parse_regex_pattern(original_pattern, unicode);
+ }
- original_pattern = utf16_pattern_view.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes);
- parsed_pattern = builder.build();
+ auto parsed_flags_or_error = regex_flags_from_string(f);
+ if (parsed_flags_or_error.is_error()) {
+ vm.throw_exception(global_object, SyntaxError::create(global_object, parsed_flags_or_error.release_error()));
+ return {};
+ }
+
+ Regex<ECMA262> regex(move(parsed_pattern), parsed_flags_or_error.release_value());
+ if (regex.parser_result.error != regex::Error::NoError) {
+ vm.throw_exception<SyntaxError>(global_object, ErrorType::RegExpCompileError, regex.error_string());
+ return {};
}
- auto* object = RegExpObject::create(global_object, move(original_pattern), move(parsed_pattern), move(f));
+ auto* object = RegExpObject::create(global_object, move(regex), move(original_pattern), move(f));
object->set(vm.names.lastIndex, Value(0), Object::ShouldThrowExceptions::Yes);
if (vm.exception())
return {};
diff --git a/Userland/Libraries/LibJS/Runtime/RegExpObject.h b/Userland/Libraries/LibJS/Runtime/RegExpObject.h
index 528a9f1129..d181d76303 100644
--- a/Userland/Libraries/LibJS/Runtime/RegExpObject.h
+++ b/Userland/Libraries/LibJS/Runtime/RegExpObject.h
@@ -6,40 +6,40 @@
#pragma once
+#include <AK/Result.h>
#include <LibJS/AST.h>
#include <LibJS/Runtime/Object.h>
#include <LibRegex/Regex.h>
-struct Flags {
- regex::RegexOptions<ECMAScriptFlags> effective_flags;
- regex::RegexOptions<ECMAScriptFlags> declared_flags;
-};
-
namespace JS {
RegExpObject* regexp_create(GlobalObject&, Value pattern, Value flags);
+Result<regex::RegexOptions<ECMAScriptFlags>, String> regex_flags_from_string(StringView flags);
+String parse_regex_pattern(StringView pattern, bool unicode);
+
class RegExpObject : public Object {
JS_OBJECT(RegExpObject, Object);
public:
- static RegExpObject* create(GlobalObject&, String original_pattern, String parsed_pattern, String flags);
+ // JS regexps are all 'global' by default as per our definition, but the "global" flag enables "stateful".
+ // FIXME: Enable 'BrowserExtended' only if in a browser context.
+ static constexpr regex::RegexOptions<ECMAScriptFlags> default_flags { (regex::ECMAScriptFlags)regex::AllFlags::Global | (regex::ECMAScriptFlags)regex::AllFlags::SkipTrimEmptyMatches | regex::ECMAScriptFlags::BrowserExtended };
+
+ static RegExpObject* create(GlobalObject&, Regex<ECMA262> regex, String pattern, String flags);
- RegExpObject(String original_pattern, String parsed_pattern, String flags, Object& prototype);
+ RegExpObject(Regex<ECMA262> regex, String pattern, String flags, Object& prototype);
virtual void initialize(GlobalObject&) override;
virtual ~RegExpObject() override;
- const String& pattern() const { return m_original_pattern; }
+ const String& pattern() const { return m_pattern; }
const String& flags() const { return m_flags; }
- const regex::RegexOptions<ECMAScriptFlags>& declared_options() { return m_active_flags.declared_flags; }
const Regex<ECMA262>& regex() { return m_regex; }
const Regex<ECMA262>& regex() const { return m_regex; }
private:
- String m_original_pattern;
- String m_parsed_pattern;
+ String m_pattern;
String m_flags;
- Flags m_active_flags;
Regex<ECMA262> m_regex;
};
diff --git a/Userland/Libraries/LibJS/Tests/builtins/RegExp/RegExp.js b/Userland/Libraries/LibJS/Tests/builtins/RegExp/RegExp.js
index 183d79284b..ccd2509b6e 100644
--- a/Userland/Libraries/LibJS/Tests/builtins/RegExp/RegExp.js
+++ b/Userland/Libraries/LibJS/Tests/builtins/RegExp/RegExp.js
@@ -52,3 +52,11 @@ test("regexp object as pattern parameter", () => {
expect(RegExp(regex_like_object_with_flags, "").toString()).toBe("/foo/");
expect(RegExp(regex_like_object_with_flags, "y").toString()).toBe("/foo/y");
});
+
+test("regexp literals are re-useable", () => {
+ for (var i = 0; i < 2; ++i) {
+ const re = /test/;
+ expect(re.test("te")).toBeFalse();
+ expect(re.test("test")).toBeTrue();
+ }
+});