5 files changed, 164 insertions, 108 deletions
diff --git a/script/encoder/ansi.lua b/script/encoder/ansi.lua
index 1016e668..f5273c51 100644
--- a/script/encoder/ansi.lua
+++ b/script/encoder/ansi.lua
@@ -7,14 +7,14 @@ end
 
 local m = {}
 
-function m.decode(text)
+function m.toutf8(text)
     if not unicode then
         return text
     end
     return unicode.a2u(text)
 end
 
-function m.encode(text)
+function m.fromutf8(text)
     if not unicode then
         return text
     end
diff --git a/script/encoder/init.lua b/script/encoder/init.lua
index d7753c1f..0011265a 100644
--- a/script/encoder/init.lua
+++ b/script/encoder/init.lua
@@ -1,6 +1,7 @@
 local ansi    = require 'encoder.ansi'
-local utf16le = require 'encoder.utf16le'
-local utf16be = require 'encoder.utf16be'
+local utf16   = require 'encoder.utf16'
+local utf16le = utf16('le', utf8.codepoint '�')
+local utf16be = utf16('be', utf8.codepoint '�')
 
 ---@alias encoder.encoding '"utf8"'|'"utf16"'|'"utf16le"'|'"utf16be"'
 
@@ -17,11 +18,11 @@ function m.len(encoding, s, i, j)
     j = j or #s
     if encoding == 'utf16'
     or encoding == 'utf16' then
-        local us = utf16le.encode(s:sub(i, j))
+        local us = utf16le.fromutf8(s:sub(i, j))
         return #us // 2
     end
     if encoding == 'utf16be' then
-        local us = utf16be.encode(s:sub(i, j))
+        local us = utf16be.fromutf8(s:sub(i, j))
         return #us // 2
     end
     if encoding == 'utf8' then
@@ -43,8 +44,8 @@ function m.offset(encoding, s, n, i)
         if not line:find '[\x80-\xff]' then
             return n + i - 1
         end
-        local us = utf16le.encode(line)
-        local os = utf16le.decode(us:sub(1, n * 2 - 2))
+        local us = utf16le.fromutf8(line)
+        local os = utf16le.toutf8(us:sub(1, n * 2 - 2))
         return #os + i
     end
     if encoding == 'utf16be' then
@@ -52,8 +53,8 @@ function m.offset(encoding, s, n, i)
         if not line:find '[\x80-\xff]' then
             return n + i - 1
         end
-        local us = utf16be.encode(line)
-        local os = utf16be.decode(us:sub(1, n * 2 - 2))
+        local us = utf16be.fromutf8(line)
+        local os = utf16be.toutf8(us:sub(1, n * 2 - 2))
         return #os + i
     end
     if encoding == 'utf8' then
@@ -75,11 +76,11 @@ function m.encode(encoding, text, bom)
         return text
     end
     if encoding == 'ansi' then
-        return ansi.encode(text)
+        return ansi.fromutf8(text)
     end
     if encoding == 'utf16'
     or encoding == 'utf16le' then
-        text = utf16le.encode(text)
+        text = utf16le.fromutf8(text)
         if bom == 'yes'
         or bom == 'auto' then
             text = '\xFF\xFE' .. text
@@ -87,7 +88,7 @@ function m.encode(encoding, text, bom)
         return text
     end
     if encoding == 'utf16be' then
-        text = utf16be.encode(text)
+        text = utf16be.fromutf8(text)
         if bom == 'yes'
         or bom == 'auto' then
             text = '\xFE\xFF' .. text
@@ -106,14 +107,14 @@ function m.decode(encoding, text)
         return text
     end
     if encoding == 'ansi' then
-        return ansi.decode(text)
+        return ansi.toutf8(text)
     end
     if encoding == 'utf16'
     or encoding == 'utf16le' then
-        return utf16le.decode(text)
+        return utf16le.toutf8(text)
     end
     if encoding == 'utf16be' then
-        return utf16be.decode(text)
+        return utf16be.toutf8(text)
     end
     log.error('Unsupport encode encoding:', encoding)
     return text
diff --git a/script/encoder/utf16.lua b/script/encoder/utf16.lua
new file mode 100644
index 00000000..9e71de68
--- /dev/null
+++ b/script/encoder/utf16.lua
@@ -0,0 +1,195 @@
+local error = error
+local strchar = string.char
+local strbyte = string.byte
+local utf8char = utf8.char
+local tconcat = table.concat
+
+--- Serialize a 16-bit code unit as two bytes, high byte first (big-endian).
+local function be_tochar(code)
+    return strchar((code >> 8) & 0xFF, code & 0xFF)
+end
+
+--- Read the big-endian 16-bit code unit at bytes `i` and `i+1` of `s`.
+--- Both bytes must exist, otherwise the arithmetic below fails on nil.
+local function be_tobyte(s, i)
+    local h, l = strbyte(s, i, i+1)
+    return (h << 8) | l
+end
+
+--- Serialize a 16-bit code unit as two bytes, low byte first (little-endian).
+local function le_tochar(code)
+    return strchar(code & 0xFF, (code >> 8) & 0xFF)
+end
+
+--- Read the little-endian 16-bit code unit at bytes `i` and `i+1` of `s`.
+--- Both bytes must exist, otherwise the arithmetic below fails on nil.
+local function le_tobyte(s, i)
+    local l, h = strbyte(s, i, i+1)
+    return (h << 8) | l
+end
+
+--- Encode one code point as UTF-16, using `tochar` for the byte order.
+--- Code points from 0x10000 upward become a surrogate pair.
+local function utf16char(tochar, code)
+    if code < 0x10000 then
+        return tochar(code)
+    else
+        code = code - 0x10000
+        return tochar(0xD800 + (code >> 10))..tochar(0xDC00 + (code & 0x3FF))
+    end
+end
+
+--- Decode the UTF-16 sequence starting at byte `n` of `s`.
+--- Returns nothing at the end of input, `next, code` for a valid sequence,
+--- and `next` alone (no code) for an invalid one.
+local function utf16next(s, n, tobyte)
+    local len = #s
+    if n > len then
+        return
+    end
+    if n == len then
+        -- A lone trailing byte cannot form a code unit; report it as invalid
+        -- instead of letting `tobyte` read past the end of the string.
+        return n+1 --invalid
+    end
+    local code1 = tobyte(s, n)
+    if code1 < 0xD800 or code1 >= 0xE000 then
+        return n+2, code1
+    elseif code1 < 0xDC00 then
+        -- High surrogate: a complete low surrogate must follow.
+        n = n + 2
+        if n >= len then
+            return n --invalid
+        end
+        local code2 = tobyte(s, n)
+        if code2 < 0xDC00 or code2 >= 0xE000 then
+            -- Leave code2 unconsumed so the next step examines it by itself.
+            return n --invalid
+        end
+        local code = 0x10000 + ((code1 - 0xD800) << 10) + ((code2 - 0xDC00) & 0x3FF)
+        return n+2, code
+    else
+        -- Low surrogate with no preceding high surrogate.
+        return n+2 --invalid
+    end
+end
+
+--- Iterator factory for generic `for`: each step yields `next, code`,
+--- where `code` is nil for an invalid sequence.
+local function utf16codes(s, tobyte)
+    return function (_, n)
+        return utf16next(s, n, tobyte)
+    end, s, 1
+end
+
+--[[
+Well-formed UTF-8 byte sequences:
+  U+0000..  U+007F 00..7F
+  U+0080..  U+07FF C2..DF 80..BF
+  U+0800..  U+0FFF E0     A0..BF 80..BF
+  U+1000..  U+CFFF E1..EC 80..BF 80..BF
+  U+D000..  U+D7FF ED     80..9F 80..BF
+  U+E000..  U+FFFF EE..EF 80..BF 80..BF
+ U+10000.. U+3FFFF F0     90..BF 80..BF 80..BF
+ U+40000.. U+FFFFF F1..F3 80..BF 80..BF 80..BF
+U+100000..U+10FFFF F4     80..8F 80..BF 80..BF
+]]
+
+--- True when `b` is a UTF-8 continuation byte (80..BF); false for nil.
+local function iscont(b)
+    return b ~= nil and b >= 0x80 and b <= 0xBF
+end
+
+--- Decode the UTF-8 sequence starting at byte `n` of `s`.
+--- Returns nothing at the end of input, `next, code` for a sequence listed
+--- in the table above, and `n+1` alone (no code) for anything else.
+local function utf8next(s, n)
+    if n > #s then
+        return
+    end
+    -- Bytes past the end of the string come back as nil.
+    local b1, b2, b3, b4 = strbyte(s, n, n+3)
+    if b1 < 0x80 then
+        return n+1, b1
+    elseif b1 >= 0xC2 and b1 <= 0xDF then
+        if iscont(b2) then
+            return n+2, ((b1 & 0x1F) << 6) | (b2 & 0x3F)
+        end
+    elseif b1 >= 0xE0 and b1 <= 0xEF then
+        -- E0 must not encode below U+0800; ED must not encode surrogates.
+        local lo = b1 == 0xE0 and 0xA0 or 0x80
+        local hi = b1 == 0xED and 0x9F or 0xBF
+        if b2 and b2 >= lo and b2 <= hi and iscont(b3) then
+            return n+3, ((b1 & 0x0F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F)
+        end
+    elseif b1 >= 0xF0 and b1 <= 0xF4 then
+        -- F0 must not encode below U+10000; F4 must not exceed U+10FFFF.
+        local lo = b1 == 0xF0 and 0x90 or 0x80
+        local hi = b1 == 0xF4 and 0x8F or 0xBF
+        if b2 and b2 >= lo and b2 <= hi and iscont(b3) and iscont(b4) then
+            return n+4, ((b1 & 0x07) << 18) | ((b2 & 0x3F) << 12)
+                      | ((b3 & 0x3F) << 6) | (b4 & 0x3F)
+        end
+    end
+    return n+1 --invalid
+end
+
+--- Iterator factory for generic `for`: each step yields `next, code`,
+--- where `code` is nil for an invalid byte.
+local function utf8codes(s)
+    return utf8next, s, 1
+end
+
+--- Build a converter between UTF-8 and UTF-16 in the given byte order.
+---@param what string      # "be" for big-endian; any other value selects little-endian
+---@param replace? integer # code point emitted for invalid input; without it invalid input raises an error
+---@return table           # table with `toutf8(s)` and `fromutf8(s)`
+return function (what, replace)
+    local tobyte, tochar
+    if what == "be" then
+        tobyte = be_tobyte
+        tochar = be_tochar
+    else
+        tobyte = le_tobyte
+        tochar = le_tochar
+    end
+    -- Encode the replacement once, in both target encodings.
+    local utf8replace  = replace and utf8char(replace)
+    local utf16replace = replace and utf16char(tochar, replace)
+    --- Convert a UTF-16 byte string to UTF-8.
+    local function toutf8(s)
+        local r = {}
+        for _, code in utf16codes(s, tobyte) do
+            if code == nil then
+                if replace then
+                    r[#r+1] = utf8replace
+                else
+                    error "invalid UTF-16 code"
+                end
+            else
+                r[#r+1] = utf8char(code)
+            end
+        end
+        return tconcat(r)
+    end
+    --- Convert a UTF-8 string to a UTF-16 byte string (no BOM is added).
+    local function fromutf8(s)
+        local r = {}
+        for _, code in utf8codes(s) do
+            if code == nil then
+                if replace then
+                    r[#r+1] = utf16replace
+                else
+                    error "invalid UTF-8 code"
+                end
+            else
+                r[#r+1] = utf16char(tochar, code)
+            end
+        end
+        return tconcat(r)
+    end
+    return {
+        toutf8 = toutf8,
+        fromutf8 = fromutf8,
+    }
+end
diff --git a/script/encoder/utf16be.lua b/script/encoder/utf16be.lua
deleted file mode 100644
index 5fc19b2c..00000000
--- a/script/encoder/utf16be.lua
+++ /dev/null
@@ -1,46 +0,0 @@
-local function tochar(code)
-    return string.char((code >> 8) & 0xFF, code & 0xFF)
-end
-
-local function tobyte(s, i)
-    local h, l = string.byte(s, i, i+1)
-    return (h << 8) | l
-end
-
-local function char(code)
-    if code <= 0xFFFF then
-        return tochar(code)
-    end
-    code = code - 0x10000
-    return tochar(0xD800 + (code >> 10))..tochar(0xDC00 + (code & 0x3FF))
-end
-
-local m = {}
-
-function m.encode(s)
-    local r = {}
-    for _, c in utf8.codes(s, true) do
-        r[#r+1] = char(c)
-    end
-    return table.concat(r)
-end
-
-function m.decode(s)
-    local r = {}
-    local i = 1
-    while i < #s do
-        local code1 = tobyte(s, i)
-        if code1 >= 0xD800 and code1 < 0xE000 then
-            i = i + 2
-            local code2 = tobyte(s, i)
-            local code = 0x10000 + ((code1 - 0xD800) << 10) + ((code2 - 0xDC00) & 0x3FF)
-            r[#r+1] = utf8.char(code)
-        else
-            r[#r+1] = utf8.char(code1)
-        end
-        i = i + 2
-    end
-    return table.concat(r)
-end
-
-return m
diff --git a/script/encoder/utf16le.lua b/script/encoder/utf16le.lua
deleted file mode 100644
index d51b4cfb..00000000
--- a/script/encoder/utf16le.lua
+++ /dev/null
@@ -1,46 +0,0 @@
-local function tochar(code)
-    return string.char(code & 0xFF, (code >> 8) & 0xFF)
-end
-
-local function tobyte(s, i)
-    local l, h = string.byte(s, i, i+1)
-    return (h << 8) | l
-end
-
-local function char(code)
-    if code <= 0xFFFF then
-        return tochar(code)
-    end
-    code = code - 0x10000
-    return tochar(0xD800 + (code >> 10))..tochar(0xDC00 + (code & 0x3FF))
-end
-
-local m = {}
-
-function m.encode(s)
-    local r = {}
-    for _, c in utf8.codes(s, true) do
-        r[#r+1] = char(c)
-    end
-    return table.concat(r)
-end
-
-function m.decode(s)
-    local r = {}
-    local i = 1
-    while i < #s do
-        local code1 = tobyte(s, i)
-        if code1 >= 0xD800 and code1 < 0xE000 then
-            i = i + 2
-            local code2 = tobyte(s, i)
-            local code = 0x10000 + ((code1 - 0xD800) << 10) + ((code2 - 0xDC00) & 0x3FF)
-            r[#r+1] = utf8.char(code)
-        else
-            r[#r+1] = utf8.char(code1)
-        end
-        i = i + 2
-    end
-    return table.concat(r)
-end
-
-return m