diff options
author | Timothy Flynn <trflynn89@pm.me> | 2021-07-22 10:38:10 -0400 |
---|---|---|
committer | Linus Groh <mail@linusgroh.de> | 2021-07-23 23:06:57 +0100 |
commit | 5a8f870594e3619317cf4cd584cbdaacd5ebcdef (patch) | |
tree | 230b7edd45e6127dc82859293a209b8d53057cf9 /Userland/Libraries | |
parent | ee7b04f7bb53ebc32d00b8804610769cdc3e8b05 (diff) | |
download | serenity-5a8f870594e3619317cf4cd584cbdaacd5ebcdef.zip |
LibJS: Implement RegExp.prototype [ @@replace ] with UTF-16 code units
This also converts the GetSubstitution abstract operation to take its input
strings as UTF-16 now that all callers are UTF-16 capable. This means
String.prototype.replace (and replaceAll) no longer needs UTF-8 and
UTF-16 copies of these strings.
Diffstat (limited to 'Userland/Libraries')
6 files changed, 67 insertions, 71 deletions
diff --git a/Userland/Libraries/LibJS/Runtime/AbstractOperations.cpp b/Userland/Libraries/LibJS/Runtime/AbstractOperations.cpp index 80ec6fb56f..b792abd799 100644 --- a/Userland/Libraries/LibJS/Runtime/AbstractOperations.cpp +++ b/Userland/Libraries/LibJS/Runtime/AbstractOperations.cpp @@ -576,59 +576,48 @@ Value canonical_numeric_index_string(GlobalObject& global_object, PropertyName c } // 22.1.3.17.1 GetSubstitution ( matched, str, position, captures, namedCaptures, replacement ), https://tc39.es/ecma262/#sec-getsubstitution -String get_substitution(GlobalObject& global_object, String const& matched, String const& str, size_t position, Vector<Value> const& captures, Value named_captures, Value replacement) +String get_substitution(GlobalObject& global_object, Utf16View const& matched, Utf16View const& str, size_t position, Vector<Value> const& captures, Value named_captures, Value replacement) { auto& vm = global_object.vm(); - auto replace_string = replacement.to_string(global_object); + auto replace_string = replacement.to_utf16_string(global_object); if (vm.exception()) return {}; - - // FIXME: Once RegExp.prototype supports UTF-16, this AO can take UTF-16 strings as parameters instead of having to transcode here. 
- auto utf16_matched = AK::utf8_to_utf16(matched); - auto match_length = utf16_matched.size(); - - auto utf16_string = AK::utf8_to_utf16(str); - Utf16View utf16_string_view { utf16_string }; - auto string_length = utf16_string_view.length_in_code_units(); - - auto utf16_replace = AK::utf8_to_utf16(replace_string); - Utf16View utf16_replace_view { utf16_replace }; - auto replace_length = utf16_replace_view.length_in_code_units(); + Utf16View replace_view { replace_string }; StringBuilder result; - for (size_t i = 0; i < replace_length; ++i) { - u16 curr = utf16_replace_view.code_unit_at(i); + for (size_t i = 0; i < replace_view.length_in_code_units(); ++i) { + u16 curr = replace_view.code_unit_at(i); - if ((curr != '$') || (i + 1 >= replace_length)) { + if ((curr != '$') || (i + 1 >= replace_view.length_in_code_units())) { result.append(curr); continue; } - u16 next = utf16_replace_view.code_unit_at(i + 1); + u16 next = replace_view.code_unit_at(i + 1); if (next == '$') { result.append('$'); ++i; } else if (next == '&') { - result.append(matched); + result.append(matched.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes)); ++i; } else if (next == '`') { - auto substring = utf16_string_view.substring_view(0, position); + auto substring = str.substring_view(0, position); result.append(substring.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes)); ++i; } else if (next == '\'') { - auto tail_pos = position + match_length; - if (tail_pos < string_length) { - auto substring = utf16_string_view.substring_view(tail_pos); + auto tail_pos = position + matched.length_in_code_units(); + if (tail_pos < str.length_in_code_units()) { + auto substring = str.substring_view(tail_pos); result.append(substring.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes)); } ++i; } else if (is_ascii_digit(next)) { - bool is_two_digits = (i + 2 < replace_length) && is_ascii_digit(utf16_replace_view.code_unit_at(i + 2)); + bool is_two_digits = (i + 2 < replace_view.length_in_code_units()) && 
is_ascii_digit(replace_view.code_unit_at(i + 2)); - auto capture_postition_string = utf16_replace_view.substring_view(i + 1, is_two_digits ? 2 : 1).to_utf8(); + auto capture_postition_string = replace_view.substring_view(i + 1, is_two_digits ? 2 : 1).to_utf8(); auto capture_position = capture_postition_string.to_uint(); if (capture_position.has_value() && (*capture_position > 0) && (*capture_position <= captures.size())) { @@ -650,8 +639,8 @@ String get_substitution(GlobalObject& global_object, String const& matched, Stri auto start_position = i + 2; Optional<size_t> end_position; - for (size_t j = start_position; j < replace_length; ++j) { - if (utf16_replace_view.code_unit_at(j) == '>') { + for (size_t j = start_position; j < replace_view.length_in_code_units(); ++j) { + if (replace_view.code_unit_at(j) == '>') { end_position = j; break; } @@ -660,7 +649,7 @@ String get_substitution(GlobalObject& global_object, String const& matched, Stri if (named_captures.is_undefined() || !end_position.has_value()) { result.append(curr); } else { - auto group_name_view = utf16_replace_view.substring_view(start_position, *end_position - start_position); + auto group_name_view = replace_view.substring_view(start_position, *end_position - start_position); auto group_name = group_name_view.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes); auto capture = named_captures.as_object().get(group_name); diff --git a/Userland/Libraries/LibJS/Runtime/AbstractOperations.h b/Userland/Libraries/LibJS/Runtime/AbstractOperations.h index 1be5683d3e..d1f52feb14 100644 --- a/Userland/Libraries/LibJS/Runtime/AbstractOperations.h +++ b/Userland/Libraries/LibJS/Runtime/AbstractOperations.h @@ -30,7 +30,7 @@ Object* get_prototype_from_constructor(GlobalObject&, FunctionObject const& cons Object* create_unmapped_arguments_object(GlobalObject&, Vector<Value> const& arguments); Object* create_mapped_arguments_object(GlobalObject&, FunctionObject&, Vector<FunctionNode::Parameter> const&, Vector<Value> 
const& arguments, Environment&); Value canonical_numeric_index_string(GlobalObject&, PropertyName const&); -String get_substitution(GlobalObject&, String const& matched, String const& str, size_t position, Vector<Value> const& captures, Value named_captures, Value replacement); +String get_substitution(GlobalObject&, Utf16View const& matched, Utf16View const& str, size_t position, Vector<Value> const& captures, Value named_captures, Value replacement); enum class CallerMode { Strict, diff --git a/Userland/Libraries/LibJS/Runtime/RegExpPrototype.cpp b/Userland/Libraries/LibJS/Runtime/RegExpPrototype.cpp index 245fcfe017..59a9b8d53c 100644 --- a/Userland/Libraries/LibJS/Runtime/RegExpPrototype.cpp +++ b/Userland/Libraries/LibJS/Runtime/RegExpPrototype.cpp @@ -127,14 +127,6 @@ static void increment_last_index(GlobalObject& global_object, Object& regexp_obj regexp_object.set(vm.names.lastIndex, Value(last_index), Object::ShouldThrowExceptions::Yes); } -static void increment_last_index(GlobalObject& global_object, Object& regexp_object, String const& string, bool unicode) -{ - auto utf16_string = AK::utf8_to_utf16(string); - Utf16View utf16_string_view { utf16_string }; - - return increment_last_index(global_object, regexp_object, utf16_string_view, unicode); -} - // 1.1.2.1 Match Records, https://tc39.es/proposal-regexp-match-indices/#sec-match-records struct Match { static Match create(regex::Match const& match) @@ -619,9 +611,10 @@ JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::symbol_replace) auto* regexp_object = this_object_from(vm, global_object); if (!regexp_object) return {}; - auto string = string_value.to_string(global_object); + auto string = string_value.to_utf16_string(global_object); if (vm.exception()) return {}; + Utf16View string_view { string }; if (!replace_value.is_function()) { auto replace_string = replace_value.to_string(global_object); @@ -654,7 +647,7 @@ JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::symbol_replace) MarkedValueList results(vm.heap()); 
while (true) { - auto result = regexp_exec(global_object, *regexp_object, string); + auto result = regexp_exec(global_object, *regexp_object, string_view); if (vm.exception()) return {}; if (result.is_null()) @@ -676,7 +669,7 @@ JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::symbol_replace) return {}; if (match_str.is_empty()) { - increment_last_index(global_object, *regexp_object, string, unicode); + increment_last_index(global_object, *regexp_object, string_view, unicode); if (vm.exception()) return {}; } @@ -693,10 +686,10 @@ JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::symbol_replace) auto matched_value = result.get(0); if (vm.exception()) return {}; - - auto matched = matched_value.to_string(global_object); + auto matched = matched_value.to_utf16_string(global_object); if (vm.exception()) return {}; + Utf16View matched_view { matched }; auto position_value = result.get(vm.names.index); if (vm.exception()) @@ -706,7 +699,7 @@ JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::symbol_replace) if (vm.exception()) return {}; - position = clamp(position, static_cast<double>(0), static_cast<double>(string.length())); + position = clamp(position, static_cast<double>(0), static_cast<double>(string_view.length_in_code_units())); MarkedValueList captures(vm.heap()); for (size_t n = 1; n <= n_captures; ++n) { @@ -735,10 +728,10 @@ JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::symbol_replace) if (replace_value.is_function()) { MarkedValueList replacer_args(vm.heap()); - replacer_args.append(js_string(vm, matched)); + replacer_args.append(js_string(vm, matched_view)); replacer_args.extend(move(captures)); replacer_args.append(Value(position)); - replacer_args.append(js_string(vm, string)); + replacer_args.append(js_string(vm, string_view)); if (!named_captures.is_undefined()) { replacer_args.append(move(named_captures)); } @@ -758,28 +751,32 @@ JS_DEFINE_NATIVE_FUNCTION(RegExpPrototype::symbol_replace) return {}; } - replacement = get_substitution(global_object, matched, string, position, 
captures, named_captures_object, replace_value); + replacement = get_substitution(global_object, matched_view, string_view, position, captures, named_captures_object, replace_value); if (vm.exception()) return {}; } if (position >= next_source_position) { + auto substring = string_view.substring_view(next_source_position, position - next_source_position); + StringBuilder builder; builder.append(accumulated_result); - builder.append(string.substring(next_source_position, position - next_source_position)); + builder.append(substring.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes)); builder.append(replacement); accumulated_result = builder.build(); - next_source_position = position + matched.length(); + next_source_position = position + matched_view.length_in_code_units(); } } - if (next_source_position >= string.length()) + if (next_source_position >= string_view.length_in_code_units()) return js_string(vm, accumulated_result); + auto substring = string_view.substring_view(next_source_position); + StringBuilder builder; builder.append(accumulated_result); - builder.append(string.substring(next_source_position)); + builder.append(substring.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes)); return js_string(vm, builder.build()); } diff --git a/Userland/Libraries/LibJS/Runtime/StringPrototype.cpp b/Userland/Libraries/LibJS/Runtime/StringPrototype.cpp index dda7cd91ed..eb9f28da53 100644 --- a/Userland/Libraries/LibJS/Runtime/StringPrototype.cpp +++ b/Userland/Libraries/LibJS/Runtime/StringPrototype.cpp @@ -910,10 +910,10 @@ JS_DEFINE_NATIVE_FUNCTION(StringPrototype::replace) return {}; } - auto string = this_object.to_string(global_object); + auto string = this_object.to_utf16_string(global_object); if (vm.exception()) return {}; - auto search_string = search_value.to_string(global_object); + auto search_string = search_value.to_utf16_string(global_object); if (vm.exception()) return {}; @@ -926,11 +926,8 @@ JS_DEFINE_NATIVE_FUNCTION(StringPrototype::replace) return {}; } - 
auto utf16_string = AK::utf8_to_utf16(string); - Utf16View utf16_string_view { utf16_string }; - - auto utf16_search_string = AK::utf8_to_utf16(search_string); - Utf16View utf16_search_view { utf16_search_string }; + Utf16View utf16_string_view { string }; + Utf16View utf16_search_view { search_string }; Optional<size_t> position = string_index_of(utf16_string_view, utf16_search_view, 0); if (!position.has_value()) @@ -948,7 +945,7 @@ JS_DEFINE_NATIVE_FUNCTION(StringPrototype::replace) if (vm.exception()) return {}; } else { - replacement = get_substitution(global_object, search_string, string, *position, {}, js_undefined(), replace_value); + replacement = get_substitution(global_object, utf16_search_view, utf16_string_view, *position, {}, js_undefined(), replace_value); if (vm.exception()) return {}; } @@ -1004,10 +1001,10 @@ JS_DEFINE_NATIVE_FUNCTION(StringPrototype::replace_all) } } - auto string = this_object.to_string(global_object); + auto string = this_object.to_utf16_string(global_object); if (vm.exception()) return {}; - auto search_string = search_value.to_string(global_object); + auto search_string = search_value.to_utf16_string(global_object); if (vm.exception()) return {}; @@ -1020,12 +1017,10 @@ JS_DEFINE_NATIVE_FUNCTION(StringPrototype::replace_all) return {}; } - auto utf16_string = AK::utf8_to_utf16(string); - Utf16View utf16_string_view { utf16_string }; + Utf16View utf16_string_view { string }; auto string_length = utf16_string_view.length_in_code_units(); - auto utf16_search_string = AK::utf8_to_utf16(search_string); - Utf16View utf16_search_view { utf16_search_string }; + Utf16View utf16_search_view { search_string }; auto search_length = utf16_search_view.length_in_code_units(); Vector<size_t> match_positions; @@ -1053,7 +1048,7 @@ JS_DEFINE_NATIVE_FUNCTION(StringPrototype::replace_all) if (vm.exception()) return {}; } else { - replacement = get_substitution(global_object, search_string, string, position, {}, js_undefined(), replace_value); + 
replacement = get_substitution(global_object, utf16_search_view, utf16_string_view, position, {}, js_undefined(), replace_value); if (vm.exception()) return {}; } diff --git a/Userland/Libraries/LibJS/Tests/builtins/String/String.prototype.replace.js b/Userland/Libraries/LibJS/Tests/builtins/String/String.prototype.replace.js index b61c0e77e2..b3984673d1 100644 --- a/Userland/Libraries/LibJS/Tests/builtins/String/String.prototype.replace.js +++ b/Userland/Libraries/LibJS/Tests/builtins/String/String.prototype.replace.js @@ -238,7 +238,11 @@ test("UTF-16", () => { expect("😀".replace("\ud83d", "")).toBe("\ude00"); expect("😀".replace("\ude00", "")).toBe("\ud83d"); - // FIXME: RegExp.prototype [ @@replace ] also needs to support UTF-16. - // expect("😀".replace(/\ud83d/, "")).toBe("\ude00"); - // expect("😀".replace(/\ude00/, "")).toBe("\ud83d"); + expect("😀".replace(/\ud83d/, "")).toBe("\ude00"); + expect("😀".replace(/\ude00/, "")).toBe("\ud83d"); + expect("😀".replace(/\ud83d\ude00/, "")).toBe(""); + + expect("😀".replace(/\ud83d/u, "")).toBe("😀"); + expect("😀".replace(/\ude00/u, "")).toBe("😀"); + expect("😀".replace(/\ud83d\ude00/u, "")).toBe(""); }); diff --git a/Userland/Libraries/LibJS/Tests/builtins/String/String.prototype.replaceAll.js b/Userland/Libraries/LibJS/Tests/builtins/String/String.prototype.replaceAll.js index 271bcc2bc9..67e8b94fca 100644 --- a/Userland/Libraries/LibJS/Tests/builtins/String/String.prototype.replaceAll.js +++ b/Userland/Libraries/LibJS/Tests/builtins/String/String.prototype.replaceAll.js @@ -151,7 +151,18 @@ test("UTF-16", () => { expect("😀😀😀".replaceAll("\ud83d", "")).toBe("\ude00\ude00\ude00"); expect("😀😀😀".replaceAll("\ude00", "")).toBe("\ud83d\ud83d\ud83d"); - // FIXME: RegExp.prototype [ @@replace ] also needs to support UTF-16. 
- // expect("😀".replaceAll(/\ud83d/g, "")).toBe("\ude00"); - // expect("😀".replaceAll(/\ude00/g, "")).toBe("\ud83d"); + expect("😀".replaceAll(/\ud83d/g, "")).toBe("\ude00"); + expect("😀".replaceAll(/\ude00/g, "")).toBe("\ud83d"); + expect("😀".replaceAll(/\ud83d\ude00/g, "")).toBe(""); + expect("😀😀😀".replaceAll(/\ud83d/g, "")).toBe("\ude00\ude00\ude00"); + expect("😀😀😀".replaceAll(/\ude00/g, "")).toBe("\ud83d\ud83d\ud83d"); + expect("😀😀😀".replaceAll(/\ud83d\ude00/g, "")).toBe(""); + + expect("😀".replaceAll(/\ud83d/gu, "")).toBe("😀"); + expect("😀".replaceAll(/\ude00/gu, "")).toBe("😀"); + expect("😀".replaceAll(/\ud83d\ude00/gu, "")).toBe(""); + expect("😀😀😀".replaceAll(/\ud83d/gu, "")).toBe("😀😀😀"); + expect("😀😀😀".replaceAll(/\ude00/gu, "")).toBe("😀😀😀"); + expect("😀😀😀".replaceAll(/\ude00/gu, "")).toBe("😀😀😀"); + expect("😀😀😀".replaceAll(/\ud83d\ude00/gu, "")).toBe(""); }); |