diff options
author | Timothy Flynn <trflynn89@pm.me> | 2021-08-01 18:56:52 -0400 |
---|---|---|
committer | Andreas Kling <kling@serenityos.org> | 2021-08-04 11:18:24 +0200 |
commit | 510bbcd8e0b3cb4f7261018255715639a24c8f81 (patch) | |
tree | f6f45a776d8001b9a52ff250330f8cd028e98635 /AK | |
parent | bed51d856a516af06be3bbbfd3b9592eb53fd848 (diff) | |
download | serenity-510bbcd8e0b3cb4f7261018255715639a24c8f81.zip |
AK+LibRegex: Add Utf16View::code_point_at and use it in RegexStringView
The current method of iterating through the string to access a code
point hurts performance quite badly for very large strings. The test262
test "RegExp/property-escapes/generated/Any.js" previously took 3 hours
to complete; this one change brings it down to under 10 seconds.
Diffstat (limited to 'AK')
-rw-r--r-- | AK/Utf16View.cpp | 17 | ||||
-rw-r--r-- | AK/Utf16View.h | 1 |
2 files changed, 18 insertions, 0 deletions
```diff
diff --git a/AK/Utf16View.cpp b/AK/Utf16View.cpp
index 44a2a89f73..5b7e2eb6ad 100644
--- a/AK/Utf16View.cpp
+++ b/AK/Utf16View.cpp
@@ -111,6 +111,23 @@ u16 Utf16View::code_unit_at(size_t index) const
     return m_code_units[index];
 }
 
+u32 Utf16View::code_point_at(size_t index) const
+{
+    VERIFY(index < length_in_code_units());
+
+    u32 code_point = code_unit_at(index);
+    if (!is_high_surrogate(code_point) && !is_low_surrogate(code_point))
+        return code_point;
+    if (is_low_surrogate(code_point) || (index + 1 == length_in_code_units()))
+        return code_point;
+
+    auto second = code_unit_at(index + 1);
+    if (!is_low_surrogate(second))
+        return code_point;
+
+    return decode_surrogate_pair(code_point, second);
+}
+
 size_t Utf16View::code_point_offset_of(size_t code_unit_offset) const
 {
     size_t code_point_offset = 0;
diff --git a/AK/Utf16View.h b/AK/Utf16View.h
index 5f58c12036..054c9f4043 100644
--- a/AK/Utf16View.h
+++ b/AK/Utf16View.h
@@ -87,6 +87,7 @@ public:
     u16 const* data() const { return m_code_units.data(); }
 
     u16 code_unit_at(size_t index) const;
+    u32 code_point_at(size_t index) const;
 
     size_t code_point_offset_of(size_t code_unit_offset) const;
     size_t code_unit_offset_of(size_t code_point_offset) const;
```