summary | refs | log | tree | commit | diff
path: root/AK
diff options
context:
space:
mode:
author: Timothy Flynn <trflynn89@pm.me> 2021-08-01 18:56:52 -0400
committer: Andreas Kling <kling@serenityos.org> 2021-08-04 11:18:24 +0200
commit: 510bbcd8e0b3cb4f7261018255715639a24c8f81 (patch)
tree: f6f45a776d8001b9a52ff250330f8cd028e98635 /AK
parent: bed51d856a516af06be3bbbfd3b9592eb53fd848 (diff)
download: serenity-510bbcd8e0b3cb4f7261018255715639a24c8f81.zip
AK+LibRegex: Add Utf16View::code_point_at and use it in RegexStringView
The current method of iterating through the string to access a code point hurts performance quite badly for very large strings. The test262 test "RegExp/property-escapes/generated/Any.js" previously took 3 hours to complete; this one change brings it down to under 10 seconds.
Diffstat (limited to 'AK')
-rw-r--r-- AK/Utf16View.cpp 17
-rw-r--r-- AK/Utf16View.h 1
2 files changed, 18 insertions, 0 deletions
diff --git a/AK/Utf16View.cpp b/AK/Utf16View.cpp
index 44a2a89f73..5b7e2eb6ad 100644
--- a/AK/Utf16View.cpp
+++ b/AK/Utf16View.cpp
@@ -111,6 +111,23 @@ u16 Utf16View::code_unit_at(size_t index) const
return m_code_units[index];
}
+u32 Utf16View::code_point_at(size_t index) const // Code point that begins at code unit `index`; an unpaired surrogate is returned unchanged.
+{
+ VERIFY(index < length_in_code_units()); // `index` counts code units and must be in range.
+
+ u32 code_point = code_unit_at(index);
+ if (!is_high_surrogate(code_point) && !is_low_surrogate(code_point)) // Not a surrogate: the code unit is already the code point.
+ return code_point;
+ if (is_low_surrogate(code_point) || (index + 1 == length_in_code_units())) // A low surrogate, or a high surrogate in the last slot, cannot start a pair.
+ return code_point;
+
+ auto second = code_unit_at(index + 1); // In range: the check above ruled out `index` being the last code unit.
+ if (!is_low_surrogate(second)) // High surrogate not followed by a low surrogate: treat it as unpaired.
+ return code_point;
+
+ return decode_surrogate_pair(code_point, second); // High surrogate followed by a low surrogate: decode the pair.
+}
+
size_t Utf16View::code_point_offset_of(size_t code_unit_offset) const
{
size_t code_point_offset = 0;
diff --git a/AK/Utf16View.h b/AK/Utf16View.h
index 5f58c12036..054c9f4043 100644
--- a/AK/Utf16View.h
+++ b/AK/Utf16View.h
@@ -87,6 +87,7 @@ public:
u16 const* data() const { return m_code_units.data(); }
u16 code_unit_at(size_t index) const;
+ u32 code_point_at(size_t index) const; // Code point that begins at code unit `index`; an unpaired surrogate is returned unchanged.
size_t code_point_offset_of(size_t code_unit_offset) const;
size_t code_unit_offset_of(size_t code_point_offset) const;