1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
|
-- Cache the library functions used below in locals so that each use is a
-- local/upvalue access rather than a global table lookup.
-- The `utf8` library (and the bitwise operators used below) need Lua 5.3+.
local error = error
local strchar = string.char
local strbyte = string.byte
local strmatch = string.match
local utf8char = utf8.char
local tconcat = table.concat
--- Serialise one 16-bit code unit as two bytes, most significant byte first.
-- @param code  integer code unit; only the low 16 bits are used
-- @return      two-byte string (big-endian)
local function be_tochar(code)
    local high = (code >> 8) & 0xFF
    local low = code & 0xFF
    return strchar(high, low)
end
--- Read the big-endian 16-bit code unit stored at bytes i and i+1 of s.
-- @param s  byte string
-- @param i  1-based index of the first (most significant) byte
-- @return   integer code unit
local function be_tobyte(s, i)
    local high, low = strbyte(s, i, i + 1)
    return (high << 8) | low
end
--- Serialise one 16-bit code unit as two bytes, least significant byte first.
-- @param code  integer code unit; only the low 16 bits are used
-- @return      two-byte string (little-endian)
local function le_tochar(code)
    local low = code & 0xFF
    local high = (code >> 8) & 0xFF
    return strchar(low, high)
end
--- Read the little-endian 16-bit code unit stored at bytes i and i+1 of s.
-- @param s  byte string
-- @param i  1-based index of the first (least significant) byte
-- @return   integer code unit
local function le_tobyte(s, i)
    local low, high = strbyte(s, i, i + 1)
    return (high << 8) | low
end
--- Encode one code point as UTF-16 bytes.
-- Code points below 0x10000 become a single code unit; anything else is
-- split into a high/low surrogate pair.
-- @param tochar  function turning one 16-bit code unit into its two bytes
-- @param code    integer code point
-- @return        string of 2 or 4 bytes
local function utf16char(tochar, code)
    if code < 0x10000 then
        return tochar(code)
    end
    local offset = code - 0x10000
    local high = 0xD800 + (offset >> 10)     -- top 10 bits
    local low = 0xDC00 + (offset & 0x3FF)    -- bottom 10 bits
    return tochar(high) .. tochar(low)
end
--- Iterator step: decode the UTF-16 code point that starts at byte n of s.
--
-- Return convention (consumed by a generic `for`):
--   * nothing           -> end of input
--   * next, code        -> a valid code point; `next` is the following position
--   * next (code = nil) -> an invalid unit was skipped; resume at `next`
--
-- Every read is guarded so that `tobyte` is only called when two bytes are
-- available. A truncated trailing byte is therefore reported as an invalid
-- unit instead of making `tobyte` fail on a missing byte.
--
-- @param s       UTF-16 encoded byte string
-- @param n       1-based byte position to decode at
-- @param tobyte  function(s, i) returning the 16-bit code unit at bytes i, i+1
local function utf16next(s, n, tobyte)
    local len = #s
    if n > len then
        return
    end
    if n + 1 > len then
        -- a single dangling byte cannot form a code unit
        return len + 1 -- invalid
    end
    local code1 = tobyte(s, n)
    if code1 < 0xD800 or code1 >= 0xE000 then
        -- ordinary BMP code unit
        return n + 2, code1
    elseif code1 < 0xDC00 then
        -- high surrogate: a low surrogate must follow
        n = n + 2
        if n + 1 > len then
            -- No complete second unit. Resume at n so that a dangling byte,
            -- if any, is reported on the next step.
            return n -- invalid
        end
        local code2 = tobyte(s, n)
        if code2 < 0xDC00 or code2 >= 0xE000 then
            -- not a low surrogate: reject the high one, re-examine this unit
            return n -- invalid
        end
        local code = 0x10000 + ((code1 - 0xD800) << 10) + ((code2 - 0xDC00) & 0x3FF)
        return n + 2, code
    else
        -- low surrogate without a preceding high surrogate
        return n + 2 -- invalid
    end
end
--- Generic-for iterator over the code points of a UTF-16 string.
-- Yields `next_position, code`; `code` is nil for an invalid unit.
-- @param s       UTF-16 encoded byte string
-- @param tobyte  function(s, i) returning the 16-bit code unit at bytes i, i+1
-- @return        iterator function, state (s), initial position (1)
local function utf16codes(s, tobyte)
    -- The closure carries `tobyte`; the state argument passed by the
    -- generic `for` is ignored in favour of the captured string.
    local function step(_, pos)
        return utf16next(s, pos, tobyte)
    end
    return step, s, 1
end
local utf8codepoint = utf8.codepoint
--- Decode the code point of the UTF-8 character that starts at byte n of s.
-- The caller is expected to have already checked that a well-formed
-- sequence starts at n.
--
-- `utf8.codepoint` is the documented way to decode at a given byte position.
-- It is used instead of calling the `utf8.codes` iterator by hand because the
-- iterator's handling of its control value is an implementation detail, and
-- it may reject a valid character that is merely followed by a stray
-- continuation byte.
-- @param s  UTF-8 encoded byte string
-- @param n  1-based byte position of the character's lead byte
-- @return   integer code point
local function utf8byte(s, n)
    -- extra parentheses truncate the result to exactly one value
    return (utf8codepoint(s, n))
end
--[[
U+0000.. U+007F 00..7F
U+0080.. U+07FF C2..DF 80..BF
U+0800.. U+0FFF E0 A0..BF 80..BF
U+1000.. U+CFFF E1..EC 80..BF 80..BF
U+D000.. U+D7FF ED 80..9F 80..BF
U+E000.. U+FFFF EE..EF 80..BF 80..BF
U+10000.. U+3FFFF F0 90..BF 80..BF 80..BF
U+40000.. U+FFFFF F1..F3 80..BF 80..BF 80..BF
U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
]]
-- Well-formed UTF-8 sequence shapes, tried in order.
-- Each entry is { anchored pattern, sequence length in bytes }.
local utf8forms = {
    { "^[\0-\x7F]", 1 },
    { "^[\xC2-\xDF][\x80-\xBF]", 2 },
    { "^[\xE0][\xA0-\xBF][\x80-\xBF]", 3 },
    { "^[\xE1-\xEC][\x80-\xBF][\x80-\xBF]", 3 },
    { "^[\xED][\x80-\x9F][\x80-\xBF]", 3 },
    { "^[\xEE-\xEF][\x80-\xBF][\x80-\xBF]", 3 },
    { "^[\xF0][\x90-\xBF][\x80-\xBF][\x80-\xBF]", 4 },
    { "^[\xF1-\xF3][\x80-\xBF][\x80-\xBF][\x80-\xBF]", 4 },
    { "^[\xF4][\x80-\x8F][\x80-\xBF][\x80-\xBF]", 4 },
}
--- Iterator step: decode the UTF-8 character that starts at byte n of s.
--
-- Return convention (consumed by a generic `for`):
--   * nothing           -> end of input
--   * next, code        -> a valid code point; `next` is the following position
--   * next (code = nil) -> no well-formed sequence starts at n; one byte skipped
--
-- @param s  UTF-8 encoded byte string
-- @param n  1-based byte position to decode at
local function utf8next(s, n)
    if n > #s then
        return
    end
    for i = 1, #utf8forms do
        local form = utf8forms[i]
        if strmatch(s, form[1], n) then
            return n + form[2], utf8byte(s, n)
        end
    end
    return n + 1 -- invalid
end
--- Generic-for iterator over the code points of a UTF-8 string.
-- Yields `next_position, code`; `code` is nil for an invalid byte.
-- @param s  UTF-8 encoded byte string
-- @return   iterator function, state (s), initial position (1)
local function utf8codes(s)
    local first = 1
    return utf8next, s, first
end
--- Build a UTF-16 <-> UTF-8 converter pair.
-- @param what     "be" selects big-endian UTF-16; any other value selects
--                 little-endian
-- @param replace  optional code point substituted for every invalid unit;
--                 when it is nil/false, invalid input raises an error
-- @return         table { toutf8 = function(s), fromutf8 = function(s) }
return function (what, replace)
    -- little-endian unless big-endian is requested explicitly
    local tobyte, tochar = le_tobyte, le_tochar
    if what == "be" then
        tobyte, tochar = be_tobyte, be_tochar
    end

    -- encode the replacement once, in both target encodings
    local utf8replace = replace and utf8char(replace)
    local utf16replace = replace and utf16char(tochar, replace)

    --- Convert a UTF-16 byte string to UTF-8.
    local function toutf8(s)
        local out, count = {}, 0
        for _, code in utf16codes(s, tobyte) do
            local piece
            if code ~= nil then
                piece = utf8char(code)
            elseif replace then
                piece = utf8replace
            else
                error "invalid UTF-16 code"
            end
            count = count + 1
            out[count] = piece
        end
        return tconcat(out)
    end

    --- Convert a UTF-8 byte string to UTF-16.
    local function fromutf8(s)
        local out, count = {}, 0
        for _, code in utf8codes(s) do
            local piece
            if code ~= nil then
                piece = utf16char(tochar, code)
            elseif replace then
                piece = utf16replace
            else
                error "invalid UTF-8 code"
            end
            count = count + 1
            out[count] = piece
        end
        return tconcat(out)
    end

    return {
        toutf8 = toutf8,
        fromutf8 = fromutf8,
    }
end
|