-- path: root/script/encoder/utf16.lua
-- blob: 7b08e082c59e199a5814a549dd3fce7a6dad9d03
local error = error
local strchar = string.char
local strbyte = string.byte
local strmatch = string.match
local utf8char = utf8.char
local tconcat = table.concat

-- Serialize one 16-bit code unit as a two-byte string, high byte first
-- (big-endian). Bits above the low 16 of `code` are discarded.
local function be_tochar(code)
    local hi = (code >> 8) & 0xFF
    local lo = code & 0xFF
    return strchar(hi, lo)
end

-- Read the big-endian 16-bit code unit stored at bytes `i` and `i+1` of `s`.
-- Both bytes must exist; the caller is responsible for the bounds check.
local function be_tobyte(s, i)
    local hi, lo = strbyte(s, i, i + 1)
    local shifted = hi << 8
    return shifted | lo
end

-- Serialize one 16-bit code unit as a two-byte string, low byte first
-- (little-endian). Bits above the low 16 of `code` are discarded.
local function le_tochar(code)
    local lo = code & 0xFF
    local hi = (code >> 8) & 0xFF
    return strchar(lo, hi)
end

-- Read the little-endian 16-bit code unit stored at bytes `i` and `i+1` of `s`.
-- Both bytes must exist; the caller is responsible for the bounds check.
local function le_tobyte(s, i)
    local lo, hi = strbyte(s, i, i + 1)
    local shifted = hi << 8
    return shifted | lo
end

-- Encode code point `code` as UTF-16, using `tochar` to serialize each
-- 16-bit code unit. Values below 0x10000 become a single unit; anything
-- else is split into a high/low surrogate pair.
local function utf16char(tochar, code)
    if code >= 0x10000 then
        local offset = code - 0x10000
        local high = 0xD800 + (offset >> 10)   -- upper bits of the offset
        local low = 0xDC00 + (offset & 0x3FF)  -- lower 10 bits of the offset
        return tochar(high)..tochar(low)
    end
    return tochar(code)
end

-- Stateless iterator step over UTF-16 text.
--   s      : byte string holding the UTF-16 data
--   n      : 1-based byte position at which to decode
--   tobyte : function(s, i) returning the 16-bit code unit at bytes i, i+1
-- Returns:
--   nothing                 when `n` is past the end of `s` (stops a `for`)
--   next_position, code     for a well-formed code point
--   next_position           (code is nil) for an ill-formed sequence
-- A dangling byte that cannot form a full code unit is reported as
-- ill-formed rather than being handed to `tobyte`, which needs two bytes.
local function utf16next(s, n, tobyte)
    local len = #s
    if n > len then
        return
    end
    if n == len then
        return n+1 --invalid: lone trailing byte, not a whole code unit
    end
    local code1 = tobyte(s, n)
    if code1 < 0xD800 or code1 >= 0xE000 then
        return n+2, code1
    end
    if code1 >= 0xDC00 then
        return n+2 --invalid: low surrogate without a preceding high one
    end
    -- code1 is a high surrogate; a low surrogate must follow.
    n = n + 2
    if n >= len then
        -- Fewer than two bytes remain. When exactly one byte is left it is
        -- not consumed here, so the next step reports it separately.
        return n --invalid: truncated surrogate pair
    end
    local code2 = tobyte(s, n)
    if code2 < 0xDC00 or code2 >= 0xE000 then
        -- Do not consume code2: it may start a valid sequence of its own.
        return n --invalid: high surrogate not followed by a low one
    end
    local code = 0x10000 + ((code1 - 0xD800) << 10) + (code2 - 0xDC00)
    return n+2, code
end

-- Build the iterator triple for a generic `for` over the UTF-16 string `s`.
-- Each step yields the next byte position and the decoded code point as
-- produced by `utf16next`; `tobyte` is the code-unit reader it is given.
local function utf16codes(s, tobyte)
    local function step(_, pos)
        return utf16next(s, pos, tobyte)
    end
    return step, s, 1
end

local utf8codepoint = utf8.codepoint
-- Decode the single UTF-8 character that starts at byte position `n` of `s`
-- and return its code point, or nil when `n` lies outside the string.
-- Uses the documented `utf8.codepoint` rather than driving the `utf8.codes`
-- iterator by hand: the iterator's control value is an implementation detail,
-- and in recent Lua 5.4 releases it also rejects a character that is merely
-- *followed* by a stray continuation byte, which would abort decoding before
-- the caller gets a chance to flag that byte as invalid.
-- `n` must point at the first byte of a well-formed sequence; otherwise
-- `utf8.codepoint` raises "invalid UTF-8 code".
local function utf8byte(s, n)
    if n < 1 or n > #s then
        return nil
    end
    return utf8codepoint(s, n)
end

--[[
  U+0000..  U+007F 00..7F
  U+0080..  U+07FF C2..DF 80..BF
  U+0800..  U+0FFF E0     A0..BF 80..BF
  U+1000..  U+CFFF E1..EC 80..BF 80..BF
  U+D000..  U+D7FF ED     80..9F 80..BF
  U+E000..  U+FFFF EE..EF 80..BF 80..BF
 U+10000.. U+3FFFF F0     90..BF 80..BF 80..BF
 U+40000.. U+FFFFF F1..F3 80..BF 80..BF 80..BF
U+100000..U+10FFFF F4     80..8F 80..BF 80..BF
]]
-- Well-formed UTF-8 byte sequences, one entry per row of the table above:
-- { anchored pattern, length of the sequence in bytes }.
local utf8forms = {
    { "^[\0-\x7F]", 1 },
    { "^[\xC2-\xDF][\x80-\xBF]", 2 },
    { "^[\xE0][\xA0-\xBF][\x80-\xBF]", 3 },
    { "^[\xE1-\xEC][\x80-\xBF][\x80-\xBF]", 3 },
    { "^[\xED][\x80-\x9F][\x80-\xBF]", 3 },
    { "^[\xEE-\xEF][\x80-\xBF][\x80-\xBF]", 3 },
    { "^[\xF0][\x90-\xBF][\x80-\xBF][\x80-\xBF]", 4 },
    { "^[\xF1-\xF3][\x80-\xBF][\x80-\xBF][\x80-\xBF]", 4 },
    { "^[\xF4][\x80-\x8F][\x80-\xBF][\x80-\xBF]", 4 },
}

-- Stateless iterator step over UTF-8 text.
-- Returns nothing when `n` is past the end of `s`; otherwise the position of
-- the next sequence plus the decoded code point. For a byte that starts no
-- well-formed sequence only the position (n+1) is returned, so the code
-- point is nil and exactly one byte is skipped.
local function utf8next(s, n)
    if n > #s then
        return
    end
    for i = 1, #utf8forms do
        local form = utf8forms[i]
        if strmatch(s, form[1], n) then
            return n + form[2], utf8byte(s, n)
        end
    end
    return n+1 --invalid
end

-- Build the iterator triple for a generic `for` over the UTF-8 string `s`:
-- the step function, the string as invariant state, and byte position 1 as
-- the initial control value.
local function utf8codes(s)
    local first = 1
    return utf8next, s, first
end

-- Module entry point: build a UTF-16 <-> UTF-8 converter.
--   what    : "be" selects the big-endian code-unit helpers; any other value
--             selects the little-endian ones
--   replace : optional code point substituted for every ill-formed sequence;
--             when it is absent, ill-formed input raises an error instead
-- Returns a table with two functions:
--   toutf8(s)   converts UTF-16 bytes to a UTF-8 string
--   fromutf8(s) converts a UTF-8 string to UTF-16 bytes
return function (what, replace)
    local bigendian = what == "be"
    local tobyte = bigendian and be_tobyte or le_tobyte
    local tochar = bigendian and be_tochar or le_tochar

    -- Encode the replacement once up front in both target encodings.
    local utf8replace, utf16replace
    if replace then
        utf8replace  = utf8char(replace)
        utf16replace = utf16char(tochar, replace)
    end

    local function toutf8(s)
        local out, count = {}, 0
        for _, code in utf16codes(s, tobyte) do
            local piece
            if code ~= nil then
                piece = utf8char(code)
            elseif replace then
                piece = utf8replace
            else
                error "invalid UTF-16 code"
            end
            count = count + 1
            out[count] = piece
        end
        return tconcat(out)
    end

    local function fromutf8(s)
        local out, count = {}, 0
        for _, code in utf8codes(s) do
            local piece
            if code ~= nil then
                piece = utf16char(tochar, code)
            elseif replace then
                piece = utf16replace
            else
                error "invalid UTF-8 code"
            end
            count = count + 1
            out[count] = piece
        end
        return tconcat(out)
    end

    return {
        toutf8 = toutf8,
        fromutf8 = fromutf8,
    }
end