summaryrefslogtreecommitdiff
path: root/Userland/Libraries
diff options
context:
space:
mode:
author Timothy Flynn <trflynn89@pm.me> 2021-07-22 10:38:10 -0400
committer Linus Groh <mail@linusgroh.de> 2021-07-23 23:06:57 +0100
commit 5a8f870594e3619317cf4cd584cbdaacd5ebcdef (patch)
tree 230b7edd45e6127dc82859293a209b8d53057cf9 /Userland/Libraries
parent ee7b04f7bb53ebc32d00b8804610769cdc3e8b05 (diff)
download serenity-5a8f870594e3619317cf4cd584cbdaacd5ebcdef.zip
LibJS: Implement RegExp.prototype [ @@replace ] with UTF-16 code units
This also converts the GetSubstitution abstract operation to take its input strings as UTF-16, now that all callers are UTF-16 capable. This means String.prototype.replace (and replaceAll) no longer needs UTF-8 and UTF-16 copies of these strings.
Diffstat (limited to 'Userland/Libraries')
-rw-r--r-- Userland/Libraries/LibJS/Runtime/AbstractOperations.cpp 45
-rw-r--r-- Userland/Libraries/LibJS/Runtime/AbstractOperations.h 2
-rw-r--r-- Userland/Libraries/LibJS/Runtime/RegExpPrototype.cpp 39
-rw-r--r-- Userland/Libraries/LibJS/Runtime/StringPrototype.cpp 25
-rw-r--r-- Userland/Libraries/LibJS/Tests/builtins/String/String.prototype.replace.js 10
-rw-r--r-- Userland/Libraries/LibJS/Tests/builtins/String/String.prototype.replaceAll.js 17
6 files changed, 67 insertions, 71 deletions
diff --git a/Userland/Libraries/LibJS/Runtime/AbstractOperations.cpp b/Userland/Libraries/LibJS/Runtime/AbstractOperations.cpp
index 80ec6fb56f..b792abd799 100644
--- a/Userland/Libraries/LibJS/Runtime/AbstractOperations.cpp
+++ b/Userland/Libraries/LibJS/Runtime/AbstractOperations.cpp
@@ -576,59 +576,48 @@ Value canonical_numeric_index_string(GlobalObject& global_object, PropertyName c
}
// 22.1.3.17.1 GetSubstitution ( matched, str, position, captures, namedCaptures, replacement ), https://tc39.es/ecma262/#sec-getsubstitution
-String get_substitution(GlobalObject& global_object, String const& matched, String const& str, size_t position, Vector<Value> const& captures, Value named_captures, Value replacement)
+String get_substitution(GlobalObject& global_object, Utf16View const& matched, Utf16View const& str, size_t position, Vector<Value> const& captures, Value named_captures, Value replacement)
{
auto& vm = global_object.vm();
- auto replace_string = replacement.to_string(global_object);
+ auto replace_string = replacement.to_utf16_string(global_object);
if (vm.exception())
return {};
-
- // FIXME: Once RegExp.prototype supports UTF-16, this AO can take UTF-16 strings as parameters instead of having to transcode here.
- auto utf16_matched = AK::utf8_to_utf16(matched);
- auto match_length = utf16_matched.size();
-
- auto utf16_string = AK::utf8_to_utf16(str);
- Utf16View utf16_string_view { utf16_string };
- auto string_length = utf16_string_view.length_in_code_units();
-
- auto utf16_replace = AK::utf8_to_utf16(replace_string);
- Utf16View utf16_replace_view { utf16_replace };
- auto replace_length = utf16_replace_view.length_in_code_units();
+ Utf16View replace_view { replace_string };
StringBuilder result;
- for (size_t i = 0; i < replace_length; ++i) {
- u16 curr = utf16_replace_view.code_unit_at(i);
+ for (size_t i = 0; i < replace_view.length_in_code_units(); ++i) {
+ u16 curr = replace_view.code_unit_at(i);
- if ((curr != '$') || (i + 1 >= replace_length)) {
+ if ((curr != '$') || (i + 1 >= replace_view.length_in_code_units())) {
result.append(curr);
continue;
}
- u16 next = utf16_replace_view.code_unit_at(i + 1);
+ u16 next = replace_view.code_unit_at(i + 1);
if (next == '$') {
result.append('$');
++i;
} else if (next == '&') {
- result.append(matched);
+ result.append(matched.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes));
++i;
} else if (next == '`') {
- auto substring = utf16_string_view.substring_view(0, position);
+ auto substring = str.substring_view(0, position);
result.append(substring.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes));
++i;
} else if (next == '\'') {
- auto tail_pos = position + match_length;
- if (tail_pos < string_length) {
- auto substring = utf16_string_view.substring_view(tail_pos);
+ auto tail_pos = position + matched.length_in_code_units();
+ if (tail_pos < str.length_in_code_units()) {
+ auto substring = str.substring_view(tail_pos);
result.append(substring.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes));
}
++i;
} else if (is_ascii_digit(next)) {
- bool is_two_digits = (i + 2 < replace_length) && is_ascii_digit(utf16_replace_view.code_unit_at(i + 2));
+ bool is_two_digits = (i + 2 < replace_view.length_in_code_units()) && is_ascii_digit(replace_view.code_unit_at(i + 2));
- auto capture_postition_string = utf16_replace_view.substring_view(i + 1, is_two_digits ? 2 : 1).to_utf8();
+ auto capture_postition_string = replace_view.substring_view(i + 1, is_two_digits ? 2 : 1).to_utf8();
auto capture_position = capture_postition_string.to_uint();
if (capture_position.has_value() && (*capture_position > 0) && (*capture_position <= captures.size())) {
@@ -650,8 +639,8 @@ String get_substitution(GlobalObject& global_object, String const& matched, Stri
auto start_position = i + 2;
Optional<size_t> end_position;
- for (size_t j = start_position; j < replace_length; ++j) {
- if (utf16_replace_view.code_unit_at(j) == '>') {
+ for (size_t j = start_position; j < replace_view.length_in_code_units(); ++j) {
+ if (replace_view.code_unit_at(j) == '>') {
end_position = j;
break;
}
@@ -660,7 +649,7 @@ String get_substitution(GlobalObject& global_object, String const& matched, Stri
if (named_captures.is_undefined() || !end_position.has_value()) {
result.append(curr);
} else {
- auto group_name_view = utf16_replace_view.substring_view(start_position, *end_position - start_position);
+ auto group_name_view = replace_view.substring_view(start_position, *end_position - start_position);
auto group_name = group_name_view.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes);
auto capture = named_captures.as_object().get(group_name);
diff --git a/Userland/Libraries/LibJS/Runtime/AbstractOperations.h b/Userland/Libraries/LibJS/Runtime/AbstractOperations.h
index 1be5683d3e..d1f52feb14 100644
--- a/Userland/Libraries/LibJS/Runtime/AbstractOperations.h
+++ b/Userland/Libraries/LibJS/Runtime/AbstractOperations.h
@@ -30,7 +30,7 @@ Object* get_prototype_from_constructor(GlobalObject&, FunctionObject const& cons
Object* create_unmapped_arguments_object(GlobalObject&, Vector<Value> const& arguments);
Object* create_mapped_arguments_object(GlobalObject&, FunctionObject&, Vector<FunctionNode::Parameter> const&, Vector<Value> const& arguments, Environment&);
Value canonical_numeric_index_string(GlobalObject&, PropertyName const&);
-String get_substitution(GlobalObject&, String const& matched, String const& str, size_t position, Vector<Value> const& captures, Value named_captures, Value replacement);
+String get_substitution(GlobalObject&, Utf16View const& matched, Utf16View const& str, size_t position, Vector<Value> const& captures, Value named_captures, Value replacement);
enum class CallerMode {
Strict,
diff --git a/Userland/Libraries/LibJS/Runtime/RegExpPrototype.cpp b/Userland/Libraries/LibJS/Runtime/RegExpPrototype.cpp
index 245fcfe017..59a9b8d53c 100644
--- a/Userland/Libraries/LibJS/Runtime/RegExpPrototype.cpp
+++ b/Userland/Libraries/LibJS/Runtime/RegExpPrototype.cpp
@@ -127,14 +127,6 @@ static void increment_last_index(GlobalObject& global_object, Object& regexp_obj
regexp_object.set(vm.names.lastIndex, Value(last_index), Object::ShouldThrowExceptions::Yes);
}
-static void increment_last_index(GlobalObject& global_object, Object& regexp_object, String const& string, bool unicode)
-{
- auto utf16_string = AK::utf8_to_utf16(string);
- Utf16View utf16_string_view { utf16_string };
-
- return increment_last_index(global_object, regexp_object, utf16_string_view, unicode);
-}
-
// 1.1.2.1 Match Records, https://tc39.es/proposal-regexp-match-indices/#sec-match-records
struct Match {
static Match create(regex::Match const& match)
@@ -619,9 +611,10 @@ JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::symbol_replace)
auto* regexp_object = this_object_from(vm, global_object);
if (!regexp_object)
return {};
- auto string = string_value.to_string(global_object);
+ auto string = string_value.to_utf16_string(global_object);
if (vm.exception())
return {};
+ Utf16View string_view { string };
if (!replace_value.is_function()) {
auto replace_string = replace_value.to_string(global_object);
@@ -654,7 +647,7 @@ JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::symbol_replace)
MarkedValueList results(vm.heap());
while (true) {
- auto result = regexp_exec(global_object, *regexp_object, string);
+ auto result = regexp_exec(global_object, *regexp_object, string_view);
if (vm.exception())
return {};
if (result.is_null())
@@ -676,7 +669,7 @@ JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::symbol_replace)
return {};
if (match_str.is_empty()) {
- increment_last_index(global_object, *regexp_object, string, unicode);
+ increment_last_index(global_object, *regexp_object, string_view, unicode);
if (vm.exception())
return {};
}
@@ -693,10 +686,10 @@ JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::symbol_replace)
auto matched_value = result.get(0);
if (vm.exception())
return {};
-
- auto matched = matched_value.to_string(global_object);
+ auto matched = matched_value.to_utf16_string(global_object);
if (vm.exception())
return {};
+ Utf16View matched_view { matched };
auto position_value = result.get(vm.names.index);
if (vm.exception())
@@ -706,7 +699,7 @@ JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::symbol_replace)
if (vm.exception())
return {};
- position = clamp(position, static_cast<double>(0), static_cast<double>(string.length()));
+ position = clamp(position, static_cast<double>(0), static_cast<double>(string_view.length_in_code_units()));
MarkedValueList captures(vm.heap());
for (size_t n = 1; n <= n_captures; ++n) {
@@ -735,10 +728,10 @@ JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::symbol_replace)
if (replace_value.is_function()) {
MarkedValueList replacer_args(vm.heap());
- replacer_args.append(js_string(vm, matched));
+ replacer_args.append(js_string(vm, matched_view));
replacer_args.extend(move(captures));
replacer_args.append(Value(position));
- replacer_args.append(js_string(vm, string));
+ replacer_args.append(js_string(vm, string_view));
if (!named_captures.is_undefined()) {
replacer_args.append(move(named_captures));
}
@@ -758,28 +751,32 @@ JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::symbol_replace)
return {};
}
- replacement = get_substitution(global_object, matched, string, position, captures, named_captures_object, replace_value);
+ replacement = get_substitution(global_object, matched_view, string_view, position, captures, named_captures_object, replace_value);
if (vm.exception())
return {};
}
if (position >= next_source_position) {
+ auto substring = string_view.substring_view(next_source_position, position - next_source_position);
+
StringBuilder builder;
builder.append(accumulated_result);
- builder.append(string.substring(next_source_position, position - next_source_position));
+ builder.append(substring.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes));
builder.append(replacement);
accumulated_result = builder.build();
- next_source_position = position + matched.length();
+ next_source_position = position + matched_view.length_in_code_units();
}
}
- if (next_source_position >= string.length())
+ if (next_source_position >= string_view.length_in_code_units())
return js_string(vm, accumulated_result);
+ auto substring = string_view.substring_view(next_source_position);
+
StringBuilder builder;
builder.append(accumulated_result);
- builder.append(string.substring(next_source_position));
+ builder.append(substring.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes));
return js_string(vm, builder.build());
}
diff --git a/Userland/Libraries/LibJS/Runtime/StringPrototype.cpp b/Userland/Libraries/LibJS/Runtime/StringPrototype.cpp
index dda7cd91ed..eb9f28da53 100644
--- a/Userland/Libraries/LibJS/Runtime/StringPrototype.cpp
+++ b/Userland/Libraries/LibJS/Runtime/StringPrototype.cpp
@@ -910,10 +910,10 @@ JS_DEFINE_NATIVE_FUNCTION(StringPrototype::replace)
return {};
}
- auto string = this_object.to_string(global_object);
+ auto string = this_object.to_utf16_string(global_object);
if (vm.exception())
return {};
- auto search_string = search_value.to_string(global_object);
+ auto search_string = search_value.to_utf16_string(global_object);
if (vm.exception())
return {};
@@ -926,11 +926,8 @@ JS_DEFINE_NATIVE_FUNCTION(StringPrototype::replace)
return {};
}
- auto utf16_string = AK::utf8_to_utf16(string);
- Utf16View utf16_string_view { utf16_string };
-
- auto utf16_search_string = AK::utf8_to_utf16(search_string);
- Utf16View utf16_search_view { utf16_search_string };
+ Utf16View utf16_string_view { string };
+ Utf16View utf16_search_view { search_string };
Optional<size_t> position = string_index_of(utf16_string_view, utf16_search_view, 0);
if (!position.has_value())
@@ -948,7 +945,7 @@ JS_DEFINE_NATIVE_FUNCTION(StringPrototype::replace)
if (vm.exception())
return {};
} else {
- replacement = get_substitution(global_object, search_string, string, *position, {}, js_undefined(), replace_value);
+ replacement = get_substitution(global_object, utf16_search_view, utf16_string_view, *position, {}, js_undefined(), replace_value);
if (vm.exception())
return {};
}
@@ -1004,10 +1001,10 @@ JS_DEFINE_NATIVE_FUNCTION(StringPrototype::replace_all)
}
}
- auto string = this_object.to_string(global_object);
+ auto string = this_object.to_utf16_string(global_object);
if (vm.exception())
return {};
- auto search_string = search_value.to_string(global_object);
+ auto search_string = search_value.to_utf16_string(global_object);
if (vm.exception())
return {};
@@ -1020,12 +1017,10 @@ JS_DEFINE_NATIVE_FUNCTION(StringPrototype::replace_all)
return {};
}
- auto utf16_string = AK::utf8_to_utf16(string);
- Utf16View utf16_string_view { utf16_string };
+ Utf16View utf16_string_view { string };
auto string_length = utf16_string_view.length_in_code_units();
- auto utf16_search_string = AK::utf8_to_utf16(search_string);
- Utf16View utf16_search_view { utf16_search_string };
+ Utf16View utf16_search_view { search_string };
auto search_length = utf16_search_view.length_in_code_units();
Vector<size_t> match_positions;
@@ -1053,7 +1048,7 @@ JS_DEFINE_NATIVE_FUNCTION(StringPrototype::replace_all)
if (vm.exception())
return {};
} else {
- replacement = get_substitution(global_object, search_string, string, position, {}, js_undefined(), replace_value);
+ replacement = get_substitution(global_object, utf16_search_view, utf16_string_view, position, {}, js_undefined(), replace_value);
if (vm.exception())
return {};
}
diff --git a/Userland/Libraries/LibJS/Tests/builtins/String/String.prototype.replace.js b/Userland/Libraries/LibJS/Tests/builtins/String/String.prototype.replace.js
index b61c0e77e2..b3984673d1 100644
--- a/Userland/Libraries/LibJS/Tests/builtins/String/String.prototype.replace.js
+++ b/Userland/Libraries/LibJS/Tests/builtins/String/String.prototype.replace.js
@@ -238,7 +238,11 @@ test("UTF-16", () => {
expect("😀".replace("\ud83d", "")).toBe("\ude00");
expect("😀".replace("\ude00", "")).toBe("\ud83d");
- // FIXME: RegExp.prototype [ @@replace ] also needs to support UTF-16.
- // expect("😀".replace(/\ud83d/, "")).toBe("\ude00");
- // expect("😀".replace(/\ude00/, "")).toBe("\ud83d");
+ expect("😀".replace(/\ud83d/, "")).toBe("\ude00");
+ expect("😀".replace(/\ude00/, "")).toBe("\ud83d");
+ expect("😀".replace(/\ud83d\ude00/, "")).toBe("");
+
+ expect("😀".replace(/\ud83d/u, "")).toBe("😀");
+ expect("😀".replace(/\ude00/u, "")).toBe("😀");
+ expect("😀".replace(/\ud83d\ude00/u, "")).toBe("");
});
diff --git a/Userland/Libraries/LibJS/Tests/builtins/String/String.prototype.replaceAll.js b/Userland/Libraries/LibJS/Tests/builtins/String/String.prototype.replaceAll.js
index 271bcc2bc9..67e8b94fca 100644
--- a/Userland/Libraries/LibJS/Tests/builtins/String/String.prototype.replaceAll.js
+++ b/Userland/Libraries/LibJS/Tests/builtins/String/String.prototype.replaceAll.js
@@ -151,7 +151,18 @@ test("UTF-16", () => {
expect("😀😀😀".replaceAll("\ud83d", "")).toBe("\ude00\ude00\ude00");
expect("😀😀😀".replaceAll("\ude00", "")).toBe("\ud83d\ud83d\ud83d");
- // FIXME: RegExp.prototype [ @@replace ] also needs to support UTF-16.
- // expect("😀".replaceAll(/\ud83d/g, "")).toBe("\ude00");
- // expect("😀".replaceAll(/\ude00/g, "")).toBe("\ud83d");
+ expect("😀".replaceAll(/\ud83d/g, "")).toBe("\ude00");
+ expect("😀".replaceAll(/\ude00/g, "")).toBe("\ud83d");
+ expect("😀".replaceAll(/\ud83d\ude00/g, "")).toBe("");
+ expect("😀😀😀".replaceAll(/\ud83d/g, "")).toBe("\ude00\ude00\ude00");
+ expect("😀😀😀".replaceAll(/\ude00/g, "")).toBe("\ud83d\ud83d\ud83d");
+ expect("😀😀😀".replaceAll(/\ud83d\ude00/g, "")).toBe("");
+
+ expect("😀".replaceAll(/\ud83d/gu, "")).toBe("😀");
+ expect("😀".replaceAll(/\ude00/gu, "")).toBe("😀");
+ expect("😀".replaceAll(/\ud83d\ude00/gu, "")).toBe("");
+ expect("😀😀😀".replaceAll(/\ud83d/gu, "")).toBe("😀😀😀");
+ expect("😀😀😀".replaceAll(/\ude00/gu, "")).toBe("😀😀😀");
+ expect("😀😀😀".replaceAll(/\ude00/gu, "")).toBe("😀😀😀");
+ expect("😀😀😀".replaceAll(/\ud83d\ude00/gu, "")).toBe("");
});