diff options
Diffstat (limited to 'script/encoder')
-rw-r--r-- | script/encoder/ansi.lua | 4 | ||||
-rw-r--r-- | script/encoder/init.lua | 29 | ||||
-rw-r--r-- | script/encoder/utf16.lua | 147 | ||||
-rw-r--r-- | script/encoder/utf16be.lua | 46 | ||||
-rw-r--r-- | script/encoder/utf16le.lua | 46 |
5 files changed, 164 insertions, 108 deletions
diff --git a/script/encoder/ansi.lua b/script/encoder/ansi.lua index 1016e668..f5273c51 100644 --- a/script/encoder/ansi.lua +++ b/script/encoder/ansi.lua @@ -7,14 +7,14 @@ end local m = {} -function m.decode(text) +function m.toutf8(text) if not unicode then return text end return unicode.a2u(text) end -function m.encode(text) +function m.fromutf8(text) if not unicode then return text end diff --git a/script/encoder/init.lua b/script/encoder/init.lua index d7753c1f..0011265a 100644 --- a/script/encoder/init.lua +++ b/script/encoder/init.lua @@ -1,6 +1,7 @@ local ansi = require 'encoder.ansi' -local utf16le = require 'encoder.utf16le' -local utf16be = require 'encoder.utf16be' +local utf16 = require 'encoder.utf16' +local utf16le = utf16('le', utf8.codepoint '�') +local utf16be = utf16('be', utf8.codepoint '�') ---@alias encoder.encoding '"utf8"'|'"utf16"'|'"utf16le"'|'"utf16be"' @@ -17,11 +18,11 @@ function m.len(encoding, s, i, j) j = j or #s if encoding == 'utf16' or encoding == 'utf16' then - local us = utf16le.encode(s:sub(i, j)) + local us = utf16le.fromutf8(s:sub(i, j)) return #us // 2 end if encoding == 'utf16be' then - local us = utf16be.encode(s:sub(i, j)) + local us = utf16be.fromutf8(s:sub(i, j)) return #us // 2 end if encoding == 'utf8' then @@ -43,8 +44,8 @@ function m.offset(encoding, s, n, i) if not line:find '[\x80-\xff]' then return n + i - 1 end - local us = utf16le.encode(line) - local os = utf16le.decode(us:sub(1, n * 2 - 2)) + local us = utf16le.fromutf8(line) + local os = utf16le.toutf8(us:sub(1, n * 2 - 2)) return #os + i end if encoding == 'utf16be' then @@ -52,8 +53,8 @@ function m.offset(encoding, s, n, i) if not line:find '[\x80-\xff]' then return n + i - 1 end - local us = utf16be.encode(line) - local os = utf16be.decode(us:sub(1, n * 2 - 2)) + local us = utf16be.fromutf8(line) + local os = utf16be.toutf8(us:sub(1, n * 2 - 2)) return #os + i end if encoding == 'utf8' then @@ -75,11 +76,11 @@ function m.encode(encoding, text, bom) return text end if encoding == 'ansi' then - return ansi.encode(text) + return ansi.fromutf8(text) end if encoding == 'utf16' or encoding == 'utf16le' then - text = utf16le.encode(text) + text = utf16le.fromutf8(text) if bom == 'yes' or bom == 'auto' then text = '\xFF\xFE' .. text @@ -87,7 +88,7 @@ function m.encode(encoding, text, bom) return text end if encoding == 'utf16be' then - text = utf16be.encode(text) + text = utf16be.fromutf8(text) if bom == 'yes' or bom == 'auto' then text = '\xFE\xFF' .. text @@ -106,14 +107,14 @@ function m.decode(encoding, text) return text end if encoding == 'ansi' then - return ansi.decode(text) + return ansi.toutf8(text) end if encoding == 'utf16' or encoding == 'utf16le' then - return utf16le.decode(text) + return utf16le.toutf8(text) end if encoding == 'utf16be' then - return utf16be.decode(text) + return utf16be.toutf8(text) end log.error('Unsupport encode encoding:', encoding) return text diff --git a/script/encoder/utf16.lua b/script/encoder/utf16.lua new file mode 100644 index 00000000..9e71de68 --- /dev/null +++ b/script/encoder/utf16.lua @@ -0,0 +1,147 @@ +local error = error +local strchar = string.char +local strbyte = string.byte +local strmatch = string.match +local utf8char = utf8.char +local tconcat = table.concat + +local function be_tochar(code) + return strchar((code >> 8) & 0xFF, code & 0xFF) +end + +local function be_tobyte(s, i) + local h, l = strbyte(s, i, i+1) + return (h << 8) | l +end + +local function le_tochar(code) + return strchar(code & 0xFF, (code >> 8) & 0xFF) +end + +local function le_tobyte(s, i) + local l, h = strbyte(s, i, i+1) + return (h << 8) | l +end + +local function utf16char(tochar, code) + if code < 0x10000 then + return tochar(code) + else + code = code - 0x10000 + return tochar(0xD800 + (code >> 10))..tochar(0xDC00 + (code & 0x3FF)) + end +end + +local function utf16next(s, n, tobyte) + if n > #s then + return + end + local code1 = tobyte(s, n) + if code1 < 0xD800 or code1 >= 0xE000 then + return n+2, code1 + elseif code1 >= 0xD800 and code1 < 0xDC00 then + n = n + 2 + if n > #s then + return n --invaild + end + local code2 = tobyte(s, n) + if code2 < 0xDC00 or code2 >= 0xE000 then + return n --invaild + end + local code = 0x10000 + ((code1 - 0xD800) << 10) + ((code2 - 0xDC00) & 0x3FF) + return n+2, code + else + return n+2 --invaild + end +end + +local function utf16codes(s, tobyte) + return function (_, n) + return utf16next(s, n, tobyte) + end, s, 1 +end + +local _utf8byte = utf8.codes "" +local function utf8byte(s, n) + local _, code = _utf8byte(s, n-1) + return code +end + +--[[ + U+0000.. U+007F 00..7F + U+0080.. U+07FF C2..DF 80..BF + U+0800.. U+0FFF E0 A0..BF 80..BF + U+1000.. U+CFFF E1..EC 80..BF 80..BF + U+D000.. U+D7FF ED 80..9F 80..BF + U+E000.. U+FFFF EE..EF 80..BF 80..BF + U+10000.. U+3FFFF F0 90..BF 80..BF 80..BF + U+40000.. U+FFFFF F1..F3 80..BF 80..BF 80..BF +U+100000..U+10FFFF F4 80..8F 80..BF 80..BF +]] +local function utf8next(s, n) + if n > #s then + return + end + if strmatch(s, "^[\0-\x7F]", n) then + return n+1, utf8byte(s, n) + elseif strmatch(s, "^[\xC2-\xDF][\x80-\xBF]", n) then + return n+2, utf8byte(s, n) + elseif strmatch(s, "^[\xE0-\xEF][\x80-\xBF][\x80-\xBF]", n) then + return n+3, utf8byte(s, n) + elseif strmatch(s, "^[\xF0-\xF4][\x80-\xBF][\x80-\xBF][\x80-\xBF]", n) then + return n+4, utf8byte(s, n) + else + return n+1 --invaild + end +end + +local function utf8codes(s) + return utf8next, s, 1 +end + +return function (what, replace) + local tobyte, tochar + if what == "be" then + tobyte = be_tobyte + tochar = be_tochar + else + tobyte = le_tobyte + tochar = le_tochar + end + local utf8replace = replace and utf8char(replace) + local utf16replace = replace and utf16char(tochar, replace) + local function toutf8(s) + local r = {} + for _, code in utf16codes(s, tobyte) do + if code == nil then + if replace then + r[#r+1] = utf8replace + else + error "invalid UTF-16 code" + end + else + r[#r+1] = utf8char(code) + end + end + return tconcat(r) + end + local function fromutf8(s) + local r = {} + for _, code in utf8codes(s) do + if code == nil then + if replace then + r[#r+1] = utf16replace + else + error "invalid UTF-8 code" + end + else + r[#r+1] = utf16char(tochar, code) + end + end + return tconcat(r) + end + return { + toutf8 = toutf8, + fromutf8 = fromutf8, + } +end diff --git a/script/encoder/utf16be.lua b/script/encoder/utf16be.lua deleted file mode 100644 index 5fc19b2c..00000000 --- a/script/encoder/utf16be.lua +++ /dev/null @@ -1,46 +0,0 @@ -local function tochar(code) - return string.char((code >> 8) & 0xFF, code & 0xFF) -end - -local function tobyte(s, i) - local h, l = string.byte(s, i, i+1) - return (h << 8) | l -end - -local function char(code) - if code <= 0xFFFF then - return tochar(code) - end - code = code - 0x10000 - return tochar(0xD800 + (code >> 10))..tochar(0xDC00 + (code & 0x3FF)) -end - -local m = {} - -function m.encode(s) - local r = {} - for _, c in utf8.codes(s, true) do - r[#r+1] = char(c) - end - return table.concat(r) -end - -function m.decode(s) - local r = {} - local i = 1 - while i < #s do - local code1 = tobyte(s, i) - if code1 >= 0xD800 and code1 < 0xE000 then - i = i + 2 - local code2 = tobyte(s, i) - local code = 0x10000 + ((code1 - 0xD800) << 10) + ((code2 - 0xDC00) & 0x3FF) - r[#r+1] = utf8.char(code) - else - r[#r+1] = utf8.char(code1) - end - i = i + 2 - end - return table.concat(r) -end - -return m diff --git a/script/encoder/utf16le.lua b/script/encoder/utf16le.lua deleted file mode 100644 index d51b4cfb..00000000 --- a/script/encoder/utf16le.lua +++ /dev/null @@ -1,46 +0,0 @@ -local function tochar(code) - return string.char(code & 0xFF, (code >> 8) & 0xFF) -end - -local function tobyte(s, i) - local l, h = string.byte(s, i, i+1) - return (h << 8) | l -end - -local function char(code) - if code <= 0xFFFF then - return tochar(code) - end - code = code - 0x10000 - return tochar(0xD800 + (code >> 10))..tochar(0xDC00 + (code & 0x3FF)) -end - -local m = {} - -function m.encode(s) - local r = {} - for _, c in utf8.codes(s, true) do - r[#r+1] = char(c) - end - return table.concat(r) -end - -function m.decode(s) - local r = {} - local i = 1 - while i < #s do - local code1 = tobyte(s, i) - if code1 >= 0xD800 and code1 < 0xE000 then - i = i + 2 - local code2 = tobyte(s, i) - local code = 0x10000 + ((code1 - 0xD800) << 10) + ((code2 - 0xDC00) & 0x3FF) - r[#r+1] = utf8.char(code) - else - r[#r+1] = utf8.char(code1) - end - i = i + 2 - end - return table.concat(r) -end - -return m |