summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Jelle Raaijmakers <jelle@gmta.nl> 2022-03-08 14:27:11 +0100
committer Linus Groh <mail@linusgroh.de> 2022-03-08 14:51:06 +0100
commit 9c2a7c0e03fcfcf32c89b7e51f921c88afbe41cf (patch)
tree a448a09f328bc7df3dba4b338c68bb0263c2eeef
parent efd9c70d94980add804f1001331b05c5bda4b23b (diff)
download serenity-9c2a7c0e03fcfcf32c89b7e51f921c88afbe41cf.zip
LibTextCodec: Add support for the UTF16-LE encoding
-rw-r--r-- Userland/Libraries/LibTextCodec/Decoder.cpp 29
-rw-r--r-- Userland/Libraries/LibTextCodec/Decoder.h 7
2 files changed, 33 insertions, 3 deletions
diff --git a/Userland/Libraries/LibTextCodec/Decoder.cpp b/Userland/Libraries/LibTextCodec/Decoder.cpp
index 35be9c52ef..2855bff677 100644
--- a/Userland/Libraries/LibTextCodec/Decoder.cpp
+++ b/Userland/Libraries/LibTextCodec/Decoder.cpp
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2020, Andreas Kling <kling@serenityos.org>
+ * Copyright (c) 2022, Jelle Raaijmakers <jelle@gmta.nl>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
@@ -14,6 +15,7 @@ namespace {
Latin1Decoder s_latin1_decoder;
UTF8Decoder s_utf8_decoder;
UTF16BEDecoder s_utf16be_decoder;
+UTF16LEDecoder s_utf16le_decoder;
Latin2Decoder s_latin2_decoder;
HebrewDecoder s_hebrew_decoder;
CyrillicDecoder s_cyrillic_decoder;
@@ -33,6 +35,8 @@ Decoder* decoder_for(const String& a_encoding)
return &s_utf8_decoder;
if (encoding.value().equals_ignoring_case("utf-16be"))
return &s_utf16be_decoder;
+ if (encoding.value().equals_ignoring_case("utf-16le"))
+ return &s_utf16le_decoder;
if (encoding.value().equals_ignoring_case("iso-8859-2"))
return &s_latin2_decoder;
if (encoding.value().equals_ignoring_case("windows-1255"))
@@ -172,8 +176,7 @@ Decoder* bom_sniff_to_decoder(StringView input)
case 0xFE: // UTF-16BE
return bytes[1] == 0xFF ? &s_utf16be_decoder : nullptr;
case 0xFF: // UTF-16LE
- // FIXME: There is currently no UTF-16LE decoder.
- TODO();
+ return bytes[1] == 0xFE ? &s_utf16le_decoder : nullptr;
}
return nullptr;
@@ -241,9 +244,29 @@ String UTF16BEDecoder::to_utf8(StringView input)
{
// Discard the BOM
auto bomless_input = input;
- if (auto bytes = input.bytes(); bytes.size() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF) {
+ if (auto bytes = input.bytes(); bytes.size() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF)
bomless_input = input.substring_view(2);
+
+ StringBuilder builder(bomless_input.length() / 2);
+ process(bomless_input, [&builder](u32 c) { builder.append_code_point(c); });
+ return builder.to_string();
+}
+
+// Decodes little-endian UTF-16 input and calls on_code_point once per decoded
+// code point. A well-formed surrogate pair is combined into one supplementary
+// code point; an unpaired surrogate is reported as U+FFFD. The input is
+// expected to have any byte order mark already removed. A trailing odd byte
+// cannot form a code unit and is ignored.
+void UTF16LEDecoder::process(StringView input, Function<void(u32)> on_code_point)
+{
+    constexpr u32 replacement_code_point = 0xFFFD;
+
+    // Reads the 16-bit code unit starting at the given byte offset. Each byte
+    // goes through u8 first: a plain char may be signed, in which case a byte
+    // >= 0x80 would sign-extend during promotion and set all of the upper bits
+    // of the combined value.
+    auto code_unit_at = [&input](size_t offset) -> u16 {
+        u16 low_byte = static_cast<u8>(input[offset]);
+        u16 high_byte = static_cast<u8>(input[offset + 1]);
+        return static_cast<u16>(low_byte | (high_byte << 8));
+    };
+
+    size_t utf16_length = input.length() - (input.length() % 2);
+    for (size_t i = 0; i < utf16_length; i += 2) {
+        u16 w1 = code_unit_at(i);
+
+        // Anything outside the surrogate range is a code point by itself.
+        if (w1 < 0xD800 || w1 > 0xDFFF) {
+            on_code_point(w1);
+            continue;
+        }
+
+        // A high surrogate must be followed by a low surrogate; together they
+        // encode one code point in the range U+10000..U+10FFFF.
+        bool w1_is_high_surrogate = w1 <= 0xDBFF;
+        if (w1_is_high_surrogate && i + 2 < utf16_length) {
+            u16 w2 = code_unit_at(i + 2);
+            if (w2 >= 0xDC00 && w2 <= 0xDFFF) {
+                u32 high_bits = static_cast<u32>(w1 - 0xD800) << 10;
+                u32 low_bits = static_cast<u32>(w2 - 0xDC00);
+                on_code_point(0x10000 + high_bits + low_bits);
+                i += 2; // The low surrogate has been consumed as well.
+                continue;
+            }
+        }
+
+        // Lone low surrogate, or a high surrogate without a valid partner.
+        // Only w1 is consumed, so the following unit is decoded on its own.
+        on_code_point(replacement_code_point);
}
+}
+
+String UTF16LEDecoder::to_utf8(StringView input)
+{
+ // Discard the BOM
+ auto bomless_input = input;
+ if (auto bytes = input.bytes(); bytes.size() >= 2 && bytes[0] == 0xFF && bytes[1] == 0xFE)
+ bomless_input = input.substring_view(2);
StringBuilder builder(bomless_input.length() / 2);
process(bomless_input, [&builder](u32 c) { builder.append_code_point(c); });
diff --git a/Userland/Libraries/LibTextCodec/Decoder.h b/Userland/Libraries/LibTextCodec/Decoder.h
index 7c7c47c7fb..110648b2d6 100644
--- a/Userland/Libraries/LibTextCodec/Decoder.h
+++ b/Userland/Libraries/LibTextCodec/Decoder.h
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2020-2021, Andreas Kling <kling@serenityos.org>
+ * Copyright (c) 2022, Jelle Raaijmakers <jelle@gmta.nl>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
@@ -32,6 +33,12 @@ public:
virtual String to_utf8(StringView) override;
};
+// Decoder for UTF-16 text stored in little-endian byte order. decoder_for()
+// returns the shared instance for the "utf-16le" label, and
+// bom_sniff_to_decoder() returns it when the input starts with FF FE.
+class UTF16LEDecoder final : public Decoder {
+public:
+ // Walks the input two bytes at a time (low byte first) and reports the
+ // result through on_code_point; a trailing odd byte is ignored.
+ virtual void process(StringView, Function<void(u32)> on_code_point) override;
+ // Skips a leading FF FE byte order mark, then feeds the rest through
+ // process() into a StringBuilder.
+ virtual String to_utf8(StringView) override;
+};
+
class Latin1Decoder final : public Decoder {
public:
virtual void process(StringView, Function<void(u32)> on_code_point) override;