1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
|
/*
* Copyright (c) 2021, Tim Flynn <trflynn89@pm.me>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <AK/StringBuilder.h>
#include <AK/StringView.h>
#include <AK/Utf16View.h>
#include <AK/Utf8View.h>
namespace AK {
// Inclusive bounds of the UTF-16 high (leading) surrogate range.
static constexpr u16 high_surrogate_min { 0xd800 };
static constexpr u16 high_surrogate_max { 0xdbff };

// Inclusive bounds of the UTF-16 low (trailing) surrogate range.
static constexpr u16 low_surrogate_min { 0xdc00 };
static constexpr u16 low_surrogate_max { 0xdfff };

// Code point yielded by the code point iterator in place of an unpaired surrogate.
static constexpr u32 replacement_code_point { 0xfffd };

// Smallest code point that has to be encoded as a surrogate pair.
static constexpr u32 first_supplementary_plane_code_point { 0x10000 };
// Convenience overload: wraps the string in a Utf8View and forwards to the
// Utf8View overload, which performs the actual conversion.
Vector<u16> utf8_to_utf16(StringView const& utf8_view)
{
    Utf8View const code_points { utf8_view };
    return utf8_to_utf16(code_points);
}
// Re-encodes every code point produced by iterating `utf8_view` as UTF-16 code units.
// Code points below 0x10000 become one code unit; all others become a surrogate pair.
Vector<u16> utf8_to_utf16(Utf8View const& utf8_view)
{
    Vector<u16> code_units;

    for (auto code_point : utf8_view) {
        if (code_point >= first_supplementary_plane_code_point) {
            // Split the offset into the supplementary range into a high part
            // (everything above the low 10 bits) and a low part (the low 10 bits).
            auto const offset = code_point - first_supplementary_plane_code_point;
            auto const high_half = static_cast<u16>(high_surrogate_min | (offset >> 10));
            auto const low_half = static_cast<u16>(low_surrogate_min | (offset & 0x3ff));

            code_units.append(high_half);
            code_units.append(low_half);
            continue;
        }

        code_units.append(static_cast<u16>(code_point));
    }

    return code_units;
}
// Returns true when `code_unit` lies in [high_surrogate_min, high_surrogate_max].
bool Utf16View::is_high_surrogate(u16 code_unit)
{
    if (code_unit < high_surrogate_min)
        return false;
    return code_unit <= high_surrogate_max;
}
// Returns true when `code_unit` lies in [low_surrogate_min, low_surrogate_max].
bool Utf16View::is_low_surrogate(u16 code_unit)
{
    if (code_unit < low_surrogate_min)
        return false;
    return code_unit <= low_surrogate_max;
}
// Combines a high/low surrogate pair into the code point it encodes.
// Both arguments are asserted to be in their respective surrogate ranges.
u32 Utf16View::decode_surrogate_pair(u16 high_surrogate, u16 low_surrogate)
{
    VERIFY(is_high_surrogate(high_surrogate));
    VERIFY(is_low_surrogate(low_surrogate));

    // The high surrogate supplies the upper 10 bits, the low surrogate the lower 10 bits.
    auto const upper_bits = (high_surrogate - high_surrogate_min) << 10;
    auto const lower_bits = low_surrogate - low_surrogate_min;

    return upper_bits + lower_bits + first_supplementary_plane_code_point;
}
// Builds a String by appending every code point of this view to a StringBuilder.
//
// With AllowInvalidCodeUnits::Yes the code units are walked directly: well-formed
// surrogate pairs are decoded, and any other code unit (including an unpaired
// surrogate) is appended as-is. Otherwise the view's code point iterator is used.
String Utf16View::to_utf8(AllowInvalidCodeUnits allow_invalid_code_units) const
{
    StringBuilder builder;

    if (allow_invalid_code_units != AllowInvalidCodeUnits::Yes) {
        for (auto code_point : *this)
            builder.append_code_point(code_point);
        return builder.build();
    }

    auto const* const end = end_ptr();
    auto const* cursor = begin_ptr();

    while (cursor < end) {
        auto const* const following = cursor + 1;
        bool const starts_pair = is_high_surrogate(*cursor) && (following < end) && is_low_surrogate(*following);

        if (starts_pair) {
            builder.append_code_point(decode_surrogate_pair(*cursor, *following));
            cursor += 2;
        } else {
            builder.append_code_point(static_cast<u32>(*cursor));
            cursor += 1;
        }
    }

    return builder.build();
}
// Returns the number of code points in this view. The count is computed on first
// use and stored in m_length_in_code_points so later calls can reuse it.
size_t Utf16View::length_in_code_points() const
{
    if (m_length_in_code_points.has_value())
        return m_length_in_code_points.value();

    m_length_in_code_points = calculate_length_in_code_points();
    return m_length_in_code_points.value();
}
// Returns the code unit stored at `index`; asserts that the index is in range.
u16 Utf16View::code_unit_at(size_t index) const
{
    VERIFY(index < length_in_code_units());

    u16 const code_unit = m_code_units[index];
    return code_unit;
}
// Returns a view over `code_unit_length` code units starting at `code_unit_offset`.
// Both values are measured in code units. Asserts that offset + length does not
// overflow and does not run past the end of this view.
Utf16View Utf16View::substring_view(size_t code_unit_offset, size_t code_unit_length) const
{
    VERIFY(!Checked<size_t>::addition_would_overflow(code_unit_offset, code_unit_length));
    VERIFY(code_unit_offset + code_unit_length <= length_in_code_units());

    auto const window = m_code_units.slice(code_unit_offset, code_unit_length);
    return Utf16View { window };
}
// Checks that every surrogate in this view is part of a well-formed high/low pair.
//
// `valid_code_units` is reset to 0 and then counts the code units that were accepted
// before the first problem was found (or all of them when the view is valid).
// Returns false at the first unpaired surrogate, true otherwise.
bool Utf16View::validate(size_t& valid_code_units) const
{
    valid_code_units = 0;

    auto const* const end = end_ptr();
    auto const* cursor = begin_ptr();

    while (cursor < end) {
        // A low surrogate that was not consumed as the second half of a pair is invalid.
        if (is_low_surrogate(*cursor))
            return false;

        if (!is_high_surrogate(*cursor)) {
            cursor += 1;
            valid_code_units += 1;
            continue;
        }

        // A high surrogate must be immediately followed by a low surrogate.
        auto const* const partner = cursor + 1;
        if ((partner >= end) || !is_low_surrogate(*partner))
            return false;

        cursor += 2;
        valid_code_units += 2;
    }

    return true;
}
// Counts the code points in this view by advancing the code point iterator from
// begin() to end().
size_t Utf16View::calculate_length_in_code_points() const
{
    size_t count = 0;

    auto const last = end();
    for (auto it = begin(); it != last; ++it)
        ++count;

    return count;
}
// Two views are equal when they hold the same number of code units and every
// code unit matches position by position. Two empty views compare equal.
bool Utf16View::operator==(Utf16View const& other) const
{
    auto const code_unit_count = length_in_code_units();
    if (code_unit_count != other.length_in_code_units())
        return false;

    size_t index = 0;
    while (index < code_unit_count) {
        if (m_code_units[index] != other.m_code_units[index])
            return false;
        ++index;
    }

    return true;
}
// Advances past the current code point (one or two code units).
Utf16CodePointIterator& Utf16CodePointIterator::operator++()
{
    size_t const width = length_in_code_units();

    // Clamp the step so that the iterator stops at the end if fewer code units
    // remain than the current code point claims to need.
    size_t const step = (width <= m_remaining_code_units) ? width : m_remaining_code_units;

    m_ptr += step;
    m_remaining_code_units -= step;

    return *this;
}
// Decodes the code point at the current position. Asserts that the iterator is not
// at the end. An unpaired surrogate (a lone low surrogate, or a high surrogate not
// followed by a low surrogate) yields replacement_code_point.
u32 Utf16CodePointIterator::operator*() const
{
    VERIFY(m_remaining_code_units > 0);

    auto const lead = *m_ptr;

    if (Utf16View::is_low_surrogate(lead))
        return replacement_code_point;

    if (!Utf16View::is_high_surrogate(lead))
        return static_cast<u32>(lead);

    // `lead` is a high surrogate: it only decodes if a low surrogate follows it.
    if (m_remaining_code_units <= 1)
        return replacement_code_point;

    auto const trail = *(m_ptr + 1);
    if (!Utf16View::is_low_surrogate(trail))
        return replacement_code_point;

    return Utf16View::decode_surrogate_pair(lead, trail);
}
// Returns how many code units the code point at the current position occupies.
// Asserts that the iterator is not at the end.
size_t Utf16CodePointIterator::length_in_code_units() const
{
    VERIFY(m_remaining_code_units > 0);

    bool const is_surrogate_pair = Utf16View::is_high_surrogate(*m_ptr)
        && (m_remaining_code_units > 1)
        && Utf16View::is_low_surrogate(*(m_ptr + 1));

    // Anything that is not a well-formed pair occupies exactly one code unit: either
    // a valid single-unit code point, or an invalid one (a lone low surrogate, or a
    // high surrogate without a following low surrogate). The invalid case is
    // represented by a single replacement code unit.
    return is_surrogate_pair ? 2 : 1;
}
}
|