summaryrefslogtreecommitdiff
path: root/Userland/Libraries
diff options
context:
space:
mode:
author Timothy Flynn <trflynn89@pm.me> 2021-07-20 10:46:53 -0400
committer Andreas Kling <kling@serenityos.org> 2021-07-22 09:10:44 +0200
commit 0c42aece362edfbd71f3b149601c065b5c675e80 (patch)
tree 7813607db2d9eed60c3f0dc0d4fa277221a5d87d /Userland/Libraries
parent 0e25d2393f2a7f49ded730d4a11643005ae9b468 (diff)
download serenity-0c42aece362edfbd71f3b149601c065b5c675e80.zip
LibJS: Transcode UTF-8 strings to UTF-16 and add UTF-16 accessors
LibJS parses JavaScript as UTF-8, so when creating a string, we must transcode it to UTF-16 to handle encoded surrogate pairs. For example, consider the following string: "\ud83d\ude00". The UTF-8 encoding of this surrogate pair is: 0xf0 0x9f 0x98 0x80. However, LibJS will currently store the two surrogates individually as UTF-8 encoded bytes, rather than combining the pair: 0xed 0xa0 0xbd, 0xed 0xb8 0x80. These are not equivalent. So, as String.prototype becomes UTF-16 aware, this encoding will no longer work for abstractions like strict equality.
Diffstat (limited to 'Userland/Libraries')
-rw-r--r-- Userland/Libraries/LibJS/Runtime/PrimitiveString.cpp 52
-rw-r--r-- Userland/Libraries/LibJS/Runtime/PrimitiveString.h 10
-rw-r--r-- Userland/Libraries/LibJS/Runtime/Value.cpp 13
-rw-r--r-- Userland/Libraries/LibJS/Runtime/Value.h 1
4 files changed, 73 insertions, 3 deletions
diff --git a/Userland/Libraries/LibJS/Runtime/PrimitiveString.cpp b/Userland/Libraries/LibJS/Runtime/PrimitiveString.cpp
index 7a1f1e7580..b35982fafa 100644
--- a/Userland/Libraries/LibJS/Runtime/PrimitiveString.cpp
+++ b/Userland/Libraries/LibJS/Runtime/PrimitiveString.cpp
@@ -4,6 +4,8 @@
* SPDX-License-Identifier: BSD-2-Clause
*/
+#include <AK/CharacterTypes.h>
+#include <AK/Utf16View.h>
#include <LibJS/Runtime/PrimitiveString.h>
#include <LibJS/Runtime/VM.h>
@@ -18,13 +20,59 @@ PrimitiveString::~PrimitiveString()
{
}
+Vector<u16> const& PrimitiveString::utf16_string() const
+{
+ if (m_utf16_string.is_empty() && !m_string.is_empty())
+ m_utf16_string = AK::utf8_to_utf16(m_string);
+ return m_utf16_string;
+}
+
+Utf16View PrimitiveString::utf16_string_view() const
+{
+ return Utf16View { utf16_string() };
+}
+
+PrimitiveString* js_string(Heap& heap, Utf16View const& string)
+{
+ if (string.is_empty())
+ return &heap.vm().empty_string();
+
+ if (string.length_in_code_units() == 1) {
+ u16 code_unit = string.code_unit_at(0);
+ if (is_ascii(code_unit))
+ return &heap.vm().single_ascii_character_string(static_cast<u8>(code_unit));
+ }
+
+ auto utf8_string = string.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes);
+ return heap.allocate_without_global_object<PrimitiveString>(move(utf8_string));
+}
+
+PrimitiveString* js_string(VM& vm, Utf16View const& string)
+{
+ return js_string(vm.heap(), string);
+}
+
PrimitiveString* js_string(Heap& heap, String string)
{
if (string.is_empty())
return &heap.vm().empty_string();
- if (string.length() == 1 && (u8)string.characters()[0] < 0x80)
- return &heap.vm().single_ascii_character_string(string.characters()[0]);
+ if (string.length() == 1) {
+ auto ch = static_cast<u8>(string.characters()[0]);
+ if (is_ascii(ch))
+ return &heap.vm().single_ascii_character_string(ch);
+ }
+
+ // UTF-8 strings must first be transcoded to UTF-16, even though they are stored as String objects
+ // internally, to parse encoded surrogate pairs. As an optimization to reduce string copying, only
+ // perform that transcoding if there are non-ASCII codepoints in the string.
+ for (auto it : string) {
+ auto ch = static_cast<u8>(it);
+ if (!is_ascii(ch)) {
+ auto utf16_string = AK::utf8_to_utf16(string);
+ return js_string(heap, Utf16View { utf16_string });
+ }
+ }
return heap.allocate_without_global_object<PrimitiveString>(move(string));
}
diff --git a/Userland/Libraries/LibJS/Runtime/PrimitiveString.h b/Userland/Libraries/LibJS/Runtime/PrimitiveString.h
index b9e24d7c5f..743ab79649 100644
--- a/Userland/Libraries/LibJS/Runtime/PrimitiveString.h
+++ b/Userland/Libraries/LibJS/Runtime/PrimitiveString.h
@@ -7,6 +7,7 @@
#pragma once
#include <AK/String.h>
+#include <AK/Vector.h>
#include <LibJS/Heap/Cell.h>
namespace JS {
@@ -16,14 +17,21 @@ public:
explicit PrimitiveString(String);
virtual ~PrimitiveString();
- const String& string() const { return m_string; }
+ String const& string() const { return m_string; }
+
+ Vector<u16> const& utf16_string() const;
+ Utf16View utf16_string_view() const;
private:
virtual const char* class_name() const override { return "PrimitiveString"; }
String m_string;
+ mutable Vector<u16> m_utf16_string;
};
+PrimitiveString* js_string(Heap&, Utf16View const&);
+PrimitiveString* js_string(VM&, Utf16View const&);
+
PrimitiveString* js_string(Heap&, String);
PrimitiveString* js_string(VM&, String);
diff --git a/Userland/Libraries/LibJS/Runtime/Value.cpp b/Userland/Libraries/LibJS/Runtime/Value.cpp
index 5666b28e71..cf54fa09fc 100644
--- a/Userland/Libraries/LibJS/Runtime/Value.cpp
+++ b/Userland/Libraries/LibJS/Runtime/Value.cpp
@@ -9,6 +9,7 @@
#include <AK/FlyString.h>
#include <AK/String.h>
#include <AK/StringBuilder.h>
+#include <AK/Utf16View.h>
#include <AK/Utf8View.h>
#include <LibCrypto/BigInt/SignedBigInteger.h>
#include <LibCrypto/NumberTheory/ModularFunctions.h>
@@ -365,6 +366,18 @@ String Value::to_string(GlobalObject& global_object, bool legacy_null_to_empty_s
}
}
+Vector<u16> Value::to_utf16_string(GlobalObject& global_object) const
+{
+ if (m_type == Type::String)
+ return m_value.as_string->utf16_string();
+
+ auto utf8_string = to_string(global_object);
+ if (global_object.vm().exception())
+ return {};
+
+ return AK::utf8_to_utf16(utf8_string);
+}
+
// 7.1.2 ToBoolean ( argument ), https://tc39.es/ecma262/#sec-toboolean
bool Value::to_boolean() const
{
diff --git a/Userland/Libraries/LibJS/Runtime/Value.h b/Userland/Libraries/LibJS/Runtime/Value.h
index 8126660982..ee3a6fd74d 100644
--- a/Userland/Libraries/LibJS/Runtime/Value.h
+++ b/Userland/Libraries/LibJS/Runtime/Value.h
@@ -246,6 +246,7 @@ public:
u64 encoded() const { return m_value.encoded; }
String to_string(GlobalObject&, bool legacy_null_to_empty_string = false) const;
+ Vector<u16> to_utf16_string(GlobalObject&) const;
PrimitiveString* to_primitive_string(GlobalObject&);
Value to_primitive(GlobalObject&, PreferredType preferred_type = PreferredType::Default) const;
Object* to_object(GlobalObject&) const;