1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
|
/*
* Copyright (c) 2021, Tim Flynn <trflynn89@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include <AK/Format.h>
#include <AK/Forward.h>
#include <AK/Optional.h>
#include <AK/Span.h>
#include <AK/String.h>
#include <AK/Types.h>
#include <AK/Vector.h>
namespace AK {
Vector<u16, 1> utf8_to_utf16(StringView);
Vector<u16, 1> utf8_to_utf16(Utf8View const&);
Vector<u16, 1> utf32_to_utf16(Utf32View const&);
void code_point_to_utf16(Vector<u16, 1>&, u32);
class Utf16View;
// Iterator state for stepping through UTF-16 code units: a pointer to the current
// code unit plus the number of code units still available from that position.
// Apart from the default constructor, instances can only be created by Utf16View,
// which is a friend and uses the private constructor.
class Utf16CodePointIterator {
    friend class Utf16View;

public:
    Utf16CodePointIterator() = default;
    ~Utf16CodePointIterator() = default;

    // Iterators are equal only if both the position and the remaining length match.
    bool operator==(Utf16CodePointIterator const& other) const
    {
        if (m_ptr != other.m_ptr)
            return false;
        return m_remaining_code_units == other.m_remaining_code_units;
    }

    // Exact negation of operator==.
    bool operator!=(Utf16CodePointIterator const& other) const
    {
        return (m_ptr != other.m_ptr) || (m_remaining_code_units != other.m_remaining_code_units);
    }

    // The following members are defined out of line.
    // NOTE(review): presumably operator++ advances by one code point, operator* decodes the
    // current code point, and length_in_code_units() reports how many code units it occupies —
    // confirm against the implementation.
    Utf16CodePointIterator& operator++();
    u32 operator*() const;
    size_t length_in_code_units() const;

private:
    // `ptr` is the first code unit to read; `length` is how many code units remain from there.
    Utf16CodePointIterator(u16 const* ptr, size_t length)
        : m_ptr(ptr)
        , m_remaining_code_units(length)
    {
    }

    u16 const* m_ptr { nullptr };
    size_t m_remaining_code_units { 0 };
};
// View over a sequence of UTF-16 code units held in a Span<u16 const>.
// The constructor only stores the span it is given; it does not copy the code units.
class Utf16View {
public:
    // Surrogate helpers, defined out of line.
    static bool is_high_surrogate(u16);
    static bool is_low_surrogate(u16);
    static u32 decode_surrogate_pair(u16 high_surrogate, u16 low_surrogate);

    Utf16View() = default;
    ~Utf16View() = default;

    explicit Utf16View(Span<u16 const> code_units)
        : m_code_units(code_units)
    {
    }

    // Compares the underlying spans; the cached code point count takes no part in the comparison.
    bool operator==(Utf16View const& other) const { return m_code_units == other.m_code_units; }

    enum class AllowInvalidCodeUnits {
        Yes,
        No,
    };

    // Defined out of line. Invalid code units are not allowed unless the caller opts in.
    String to_utf8(AllowInvalidCodeUnits = AllowInvalidCodeUnits::No) const;

    bool is_null() const { return m_code_units.is_null(); }
    bool is_empty() const { return m_code_units.is_empty(); }

    // Number of 16-bit code units in the view (the span's size).
    size_t length_in_code_units() const { return m_code_units.size(); }
    // Defined out of line.
    // NOTE(review): presumably backed by the m_length_in_code_points cache — confirm.
    size_t length_in_code_points() const;

    // begin() covers the whole span; end() points one past the last code unit with nothing remaining.
    Utf16CodePointIterator begin() const { return { begin_ptr(), m_code_units.size() }; }
    Utf16CodePointIterator end() const { return { end_ptr(), 0 }; }

    u16 const* data() const { return m_code_units.data(); }

    // Index and offset helpers, defined out of line.
    u16 code_unit_at(size_t index) const;
    u32 code_point_at(size_t index) const;

    size_t code_point_offset_of(size_t code_unit_offset) const;
    size_t code_unit_offset_of(size_t code_point_offset) const;
    size_t code_unit_offset_of(Utf16CodePointIterator const&) const;

    Utf16View substring_view(size_t code_unit_offset, size_t code_unit_length) const;
    // Tail of the view starting at code_unit_offset.
    // The offset must not exceed length_in_code_units(), otherwise the unsigned subtraction wraps around.
    Utf16View substring_view(size_t code_unit_offset) const { return substring_view(code_unit_offset, length_in_code_units() - code_unit_offset); }

    Utf16View unicode_substring_view(size_t code_point_offset, size_t code_point_length) const;
    // Tail of the view starting at code_point_offset.
    // The offset must not exceed length_in_code_points(), otherwise the unsigned subtraction wraps around.
    Utf16View unicode_substring_view(size_t code_point_offset) const { return unicode_substring_view(code_point_offset, length_in_code_points() - code_point_offset); }

    // Defined out of line; `valid_code_units` is an output parameter.
    bool validate(size_t& valid_code_units) const;
    // Convenience overload for callers that only need the yes/no answer.
    bool validate() const
    {
        // Initialised so the out-parameter never carries an indeterminate value,
        // even if the callee reads it or returns before assigning it.
        size_t valid_code_units { 0 };
        return validate(valid_code_units);
    }

    bool equals_ignoring_case(Utf16View const&) const;

private:
    u16 const* begin_ptr() const { return m_code_units.data(); }
    u16 const* end_ptr() const { return begin_ptr() + m_code_units.size(); }

    size_t calculate_length_in_code_points() const;

    Span<u16 const> m_code_units;
    // Mutable so that const member functions can fill it in.
    mutable Optional<size_t> m_length_in_code_points;
};
}
// Formatter specialization for Utf16View: hands the view to the string builder
// behind the FormatBuilder and propagates whatever try_append() returns.
template<>
struct AK::Formatter<AK::Utf16View> : Formatter<FormatString> {
    ErrorOr<void> format(FormatBuilder& builder, AK::Utf16View const& value)
    {
        auto& string_builder = builder.builder();
        return string_builder.try_append(value);
    }
};

// Make Utf16View usable without the AK:: qualifier.
using AK::Utf16View;
|