summary | refs | log | tree | commit | diff
path: root/script
diff options
context:
space:
mode:
author: 最萌小汐 <sumneko@hotmail.com> 2021-11-03 10:59:33 +0800
committer: 最萌小汐 <sumneko@hotmail.com> 2021-11-15 16:09:53 +0800
commit: 2833ab487689e2f2dc1b327717a68f0c2f3e6d46 (patch)
tree: 989100efea3b1ac161176fab422c637e125c4678 /script
parent: 9d2be68b46194a18132c57c26034823b1752a300 (diff)
download: lua-language-server-2833ab487689e2f2dc1b327717a68f0c2f3e6d46.zip
fix #777
Diffstat (limited to 'script')
-rw-r--r-- script/encoder/ansi.lua | 4
-rw-r--r-- script/encoder/init.lua | 29
-rw-r--r-- script/encoder/utf16.lua | 147
-rw-r--r-- script/encoder/utf16be.lua | 46
-rw-r--r-- script/encoder/utf16le.lua | 46
5 files changed, 164 insertions, 108 deletions
diff --git a/script/encoder/ansi.lua b/script/encoder/ansi.lua
index 1016e668..f5273c51 100644
--- a/script/encoder/ansi.lua
+++ b/script/encoder/ansi.lua
@@ -7,14 +7,14 @@ end
local m = {}
-function m.decode(text)
+function m.toutf8(text)
if not unicode then
return text
end
return unicode.a2u(text)
end
-function m.encode(text)
+function m.fromutf8(text)
if not unicode then
return text
end
diff --git a/script/encoder/init.lua b/script/encoder/init.lua
index d7753c1f..0011265a 100644
--- a/script/encoder/init.lua
+++ b/script/encoder/init.lua
@@ -1,6 +1,7 @@
local ansi = require 'encoder.ansi'
-local utf16le = require 'encoder.utf16le'
-local utf16be = require 'encoder.utf16be'
+local utf16 = require 'encoder.utf16'
+local utf16le = utf16('le', utf8.codepoint '�')
+local utf16be = utf16('be', utf8.codepoint '�')
---@alias encoder.encoding '"utf8"'|'"utf16"'|'"utf16le"'|'"utf16be"'
@@ -17,11 +18,11 @@ function m.len(encoding, s, i, j)
j = j or #s
if encoding == 'utf16'
or encoding == 'utf16' then
- local us = utf16le.encode(s:sub(i, j))
+ local us = utf16le.fromutf8(s:sub(i, j))
return #us // 2
end
if encoding == 'utf16be' then
- local us = utf16be.encode(s:sub(i, j))
+ local us = utf16be.fromutf8(s:sub(i, j))
return #us // 2
end
if encoding == 'utf8' then
@@ -43,8 +44,8 @@ function m.offset(encoding, s, n, i)
if not line:find '[\x80-\xff]' then
return n + i - 1
end
- local us = utf16le.encode(line)
- local os = utf16le.decode(us:sub(1, n * 2 - 2))
+ local us = utf16le.fromutf8(line)
+ local os = utf16le.toutf8(us:sub(1, n * 2 - 2))
return #os + i
end
if encoding == 'utf16be' then
@@ -52,8 +53,8 @@ function m.offset(encoding, s, n, i)
if not line:find '[\x80-\xff]' then
return n + i - 1
end
- local us = utf16be.encode(line)
- local os = utf16be.decode(us:sub(1, n * 2 - 2))
+ local us = utf16be.fromutf8(line)
+ local os = utf16be.toutf8(us:sub(1, n * 2 - 2))
return #os + i
end
if encoding == 'utf8' then
@@ -75,11 +76,11 @@ function m.encode(encoding, text, bom)
return text
end
if encoding == 'ansi' then
- return ansi.encode(text)
+ return ansi.fromutf8(text)
end
if encoding == 'utf16'
or encoding == 'utf16le' then
- text = utf16le.encode(text)
+ text = utf16le.fromutf8(text)
if bom == 'yes'
or bom == 'auto' then
text = '\xFF\xFE' .. text
@@ -87,7 +88,7 @@ function m.encode(encoding, text, bom)
return text
end
if encoding == 'utf16be' then
- text = utf16be.encode(text)
+ text = utf16be.fromutf8(text)
if bom == 'yes'
or bom == 'auto' then
text = '\xFE\xFF' .. text
@@ -106,14 +107,14 @@ function m.decode(encoding, text)
return text
end
if encoding == 'ansi' then
- return ansi.decode(text)
+ return ansi.toutf8(text)
end
if encoding == 'utf16'
or encoding == 'utf16le' then
- return utf16le.decode(text)
+ return utf16le.toutf8(text)
end
if encoding == 'utf16be' then
- return utf16be.decode(text)
+ return utf16be.toutf8(text)
end
log.error('Unsupport encode encoding:', encoding)
return text
diff --git a/script/encoder/utf16.lua b/script/encoder/utf16.lua
new file mode 100644
index 00000000..9e71de68
--- /dev/null
+++ b/script/encoder/utf16.lua
@@ -0,0 +1,147 @@
+local error = error
+local strchar = string.char
+local strbyte = string.byte
+local strmatch = string.match
+local utf8char = utf8.char
+local tconcat = table.concat
+
+--- Pack the low 16 bits of `code` into a two-byte string, high byte first.
+---@param code integer
+---@return string
+local function be_tochar(code)
+    local high = (code >> 8) & 0xFF
+    local low = code & 0xFF
+    return strchar(high, low)
+end
+
+--- Read the two bytes at `i` and `i + 1` of `s` as one 16-bit value,
+--- treating the first byte as the high byte.
+---@param s string
+---@param i integer # byte offset of the first byte
+---@return integer
+local function be_tobyte(s, i)
+    local high, low = strbyte(s, i, i + 1)
+    local shifted = high << 8
+    return shifted | low
+end
+
+--- Pack the low 16 bits of `code` into a two-byte string, low byte first.
+---@param code integer
+---@return string
+local function le_tochar(code)
+    local low = code & 0xFF
+    local high = (code >> 8) & 0xFF
+    return strchar(low, high)
+end
+
+--- Read the two bytes at `i` and `i + 1` of `s` as one 16-bit value,
+--- treating the first byte as the low byte.
+---@param s string
+---@param i integer # byte offset of the first byte
+---@return integer
+local function le_tobyte(s, i)
+    local low, high = strbyte(s, i, i + 1)
+    local shifted = high << 8
+    return shifted | low
+end
+
+--- Encode one code point as UTF-16, delegating the byte layout to `tochar`.
+--- Code points of 0x10000 and above are split into a surrogate pair;
+--- smaller values are emitted as a single code unit.
+---@param tochar fun(code: integer): string # serializes one 16-bit code unit
+---@param code integer
+---@return string
+local function utf16char(tochar, code)
+    if code >= 0x10000 then
+        local offset = code - 0x10000
+        local high = 0xD800 + (offset >> 10)
+        local low = 0xDC00 + (offset & 0x3FF)
+        return tochar(high) .. tochar(low)
+    end
+    return tochar(code)
+end
+
+--- Decode the UTF-16 code point that starts at byte offset `n` of `s`.
+---
+--- Returns nothing once `n` is past the end of `s`. Otherwise returns the
+--- offset at which decoding should resume, followed by the decoded code
+--- point; for malformed input only the offset is returned (the code point
+--- is nil), so the caller can substitute a replacement or raise.
+---
+--- A truncated code unit (one dangling byte) is reported as invalid instead
+--- of being handed to `tobyte`, which needs two bytes.
+---@param s string
+---@param n integer # byte offset to decode at
+---@param tobyte fun(s: string, i: integer): integer # reads one 16-bit code unit
+---@return integer? next
+---@return integer? code
+local function utf16next(s, n, tobyte)
+    local len = #s
+    if n > len then
+        return
+    end
+    if n == len then
+        return n + 1 -- invalid: a single trailing byte is not a code unit
+    end
+    local code1 = tobyte(s, n)
+    if code1 < 0xD800 or code1 >= 0xE000 then
+        return n + 2, code1
+    end
+    if code1 >= 0xDC00 then
+        return n + 2 -- invalid: low surrogate with no high surrogate before it
+    end
+    -- code1 is a high surrogate; a complete low surrogate must follow.
+    n = n + 2
+    if n >= len then
+        return n -- invalid: input ends before a full second code unit
+    end
+    local code2 = tobyte(s, n)
+    if code2 < 0xDC00 or code2 >= 0xE000 then
+        -- invalid: do not consume code2, so it is decoded on its own next time
+        return n
+    end
+    local code = 0x10000 + ((code1 - 0xD800) << 10) + ((code2 - 0xDC00) & 0x3FF)
+    return n + 2, code
+end
+
+--- Build the iterator triple for a generic `for` over the UTF-16 string `s`.
+--- Each step yields the resume offset and the decoded code point (nil when
+--- the input at that position is malformed).
+---@param s string
+---@param tobyte fun(s: string, i: integer): integer # reads one 16-bit code unit
+local function utf16codes(s, tobyte)
+    local function step(_, pos)
+        return utf16next(s, pos, tobyte)
+    end
+    return step, s, 1
+end
+
+local utf8codepoint = utf8.codepoint
+
+--- Decode the code point whose UTF-8 encoding starts at byte `n` of `s`.
+--- Uses the documented `utf8.codepoint` rather than invoking the iterator
+--- function returned by `utf8.codes` with a hand-made control value.
+--- `n` must lie within `s`; an invalid sequence at `n` raises an error.
+---@param s string
+---@param n integer # byte offset of the first byte of the character
+---@return integer
+local function utf8byte(s, n)
+    -- Parentheses keep the result to exactly one value.
+    return (utf8codepoint(s, n))
+end
+
+--[[
+ Well-formed UTF-8 byte sequences:
+ U+0000.. U+007F 00..7F
+ U+0080.. U+07FF C2..DF 80..BF
+ U+0800.. U+0FFF E0 A0..BF 80..BF
+ U+1000.. U+CFFF E1..EC 80..BF 80..BF
+ U+D000.. U+D7FF ED 80..9F 80..BF
+ U+E000.. U+FFFF EE..EF 80..BF 80..BF
+ U+10000.. U+3FFFF F0 90..BF 80..BF 80..BF
+ U+40000.. U+FFFFF F1..F3 80..BF 80..BF 80..BF
+U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
+]]
+--- Decode the UTF-8 character that starts at byte offset `n` of `s`.
+---
+--- Returns nothing once `n` is past the end of `s`. Otherwise returns the
+--- offset of the next character followed by the decoded code point; for a
+--- malformed sequence only `n + 1` is returned (the code point is nil).
+---
+--- The patterns follow the table above row by row, so overlong forms,
+--- encoded surrogates and values above U+10FFFF are reported as invalid
+--- here instead of reaching `utf8byte`, which raises on them.
+---@param s string
+---@param n integer # byte offset to decode at
+---@return integer? next
+---@return integer? code
+local function utf8next(s, n)
+    if n > #s then
+        return
+    end
+    local size
+    if strmatch(s, "^[\0-\x7F]", n) then
+        size = 1
+    elseif strmatch(s, "^[\xC2-\xDF][\x80-\xBF]", n) then
+        size = 2
+    elseif strmatch(s, "^[\xE1-\xEC][\x80-\xBF][\x80-\xBF]", n)
+    or     strmatch(s, "^\xE0[\xA0-\xBF][\x80-\xBF]", n)
+    or     strmatch(s, "^\xED[\x80-\x9F][\x80-\xBF]", n)
+    or     strmatch(s, "^[\xEE-\xEF][\x80-\xBF][\x80-\xBF]", n) then
+        size = 3
+    elseif strmatch(s, "^[\xF1-\xF3][\x80-\xBF][\x80-\xBF][\x80-\xBF]", n)
+    or     strmatch(s, "^\xF0[\x90-\xBF][\x80-\xBF][\x80-\xBF]", n)
+    or     strmatch(s, "^\xF4[\x80-\x8F][\x80-\xBF][\x80-\xBF]", n) then
+        size = 4
+    else
+        return n + 1 -- invalid
+    end
+    return n + size, utf8byte(s, n)
+end
+
+--- Build the iterator triple for a generic `for` over the UTF-8 string `s`:
+--- `utf8next` as the step function, `s` as the state, and 1 as the first
+--- byte offset.
+---@param s string
+local function utf8codes(s)
+    local first = 1
+    return utf8next, s, first
+end
+
+--- Create a UTF-16 <-> UTF-8 codec.
+---
+--- `what` selects the byte order: "be" uses the big-endian helpers, any
+--- other value uses the little-endian ones. `replace` is an optional code
+--- point substituted for every malformed sequence; without it, malformed
+--- input raises an error.
+---
+--- The returned table has two functions:
+---   toutf8(s)   converts a UTF-16 string to UTF-8
+---   fromutf8(s) converts a UTF-8 string to UTF-16
+---@param what string
+---@param replace? integer
+return function (what, replace)
+    local bigendian = what == "be"
+    local tobyte = bigendian and be_tobyte or le_tobyte
+    local tochar = bigendian and be_tochar or le_tochar
+
+    -- Encode the replacement once, in both target encodings.
+    local utf8replace, utf16replace
+    if replace then
+        utf8replace = utf8char(replace)
+        utf16replace = utf16char(tochar, replace)
+    end
+
+    local codec = {}
+
+    function codec.toutf8(s)
+        local pieces, count = {}, 0
+        for _, code in utf16codes(s, tobyte) do
+            local piece
+            if code ~= nil then
+                piece = utf8char(code)
+            elseif replace then
+                piece = utf8replace
+            else
+                error "invalid UTF-16 code"
+            end
+            count = count + 1
+            pieces[count] = piece
+        end
+        return tconcat(pieces)
+    end
+
+    function codec.fromutf8(s)
+        local pieces, count = {}, 0
+        for _, code in utf8codes(s) do
+            local piece
+            if code ~= nil then
+                piece = utf16char(tochar, code)
+            elseif replace then
+                piece = utf16replace
+            else
+                error "invalid UTF-8 code"
+            end
+            count = count + 1
+            pieces[count] = piece
+        end
+        return tconcat(pieces)
+    end
+
+    return codec
+end
diff --git a/script/encoder/utf16be.lua b/script/encoder/utf16be.lua
deleted file mode 100644
index 5fc19b2c..00000000
--- a/script/encoder/utf16be.lua
+++ /dev/null
@@ -1,46 +0,0 @@
-local function tochar(code)
- return string.char((code >> 8) & 0xFF, code & 0xFF)
-end
-
-local function tobyte(s, i)
- local h, l = string.byte(s, i, i+1)
- return (h << 8) | l
-end
-
-local function char(code)
- if code <= 0xFFFF then
- return tochar(code)
- end
- code = code - 0x10000
- return tochar(0xD800 + (code >> 10))..tochar(0xDC00 + (code & 0x3FF))
-end
-
-local m = {}
-
-function m.encode(s)
- local r = {}
- for _, c in utf8.codes(s, true) do
- r[#r+1] = char(c)
- end
- return table.concat(r)
-end
-
-function m.decode(s)
- local r = {}
- local i = 1
- while i < #s do
- local code1 = tobyte(s, i)
- if code1 >= 0xD800 and code1 < 0xE000 then
- i = i + 2
- local code2 = tobyte(s, i)
- local code = 0x10000 + ((code1 - 0xD800) << 10) + ((code2 - 0xDC00) & 0x3FF)
- r[#r+1] = utf8.char(code)
- else
- r[#r+1] = utf8.char(code1)
- end
- i = i + 2
- end
- return table.concat(r)
-end
-
-return m
diff --git a/script/encoder/utf16le.lua b/script/encoder/utf16le.lua
deleted file mode 100644
index d51b4cfb..00000000
--- a/script/encoder/utf16le.lua
+++ /dev/null
@@ -1,46 +0,0 @@
-local function tochar(code)
- return string.char(code & 0xFF, (code >> 8) & 0xFF)
-end
-
-local function tobyte(s, i)
- local l, h = string.byte(s, i, i+1)
- return (h << 8) | l
-end
-
-local function char(code)
- if code <= 0xFFFF then
- return tochar(code)
- end
- code = code - 0x10000
- return tochar(0xD800 + (code >> 10))..tochar(0xDC00 + (code & 0x3FF))
-end
-
-local m = {}
-
-function m.encode(s)
- local r = {}
- for _, c in utf8.codes(s, true) do
- r[#r+1] = char(c)
- end
- return table.concat(r)
-end
-
-function m.decode(s)
- local r = {}
- local i = 1
- while i < #s do
- local code1 = tobyte(s, i)
- if code1 >= 0xD800 and code1 < 0xE000 then
- i = i + 2
- local code2 = tobyte(s, i)
- local code = 0x10000 + ((code1 - 0xD800) << 10) + ((code2 - 0xDC00) & 0x3FF)
- r[#r+1] = utf8.char(code)
- else
- r[#r+1] = utf8.char(code1)
- end
- i = i + 2
- end
- return table.concat(r)
-end
-
-return m