LibTextCodec: Implement a Windows-1251 decoder

This encoding (a superset of ASCII that adds the Cyrillic alphabet) is currently the third most used encoding on the web, and because Cyrillic glyphs were recently added by Dmitrii Trifonov, we can now support it as well :^)
author: Idan Horowitz <idan.horowitz@gmail.com> 2021-05-01 18:18:26 +0300
committer: Linus Groh <mail@linusgroh.de> 2021-05-01 17:59:08 +0200
commit: 87cabda80d751dd1d36f9d23e06296b598898c41 (patch)
tree: 2d5bf05cfe98defac454fe810820e041d17cd3cc /Userland/Libraries/LibTextCodec
parent: 4b0098e52f860ac346c42ba8a3bfdebf1de37f56 (diff)
download: serenity-87cabda80d751dd1d36f9d23e06296b598898c41.zip
2 files changed, 38 insertions, 0 deletions
diff --git a/Userland/Libraries/LibTextCodec/Decoder.cpp b/Userland/Libraries/LibTextCodec/Decoder.cpp
index f82bb029eb..37a28d5999 100644
--- a/Userland/Libraries/LibTextCodec/Decoder.cpp
+++ b/Userland/Libraries/LibTextCodec/Decoder.cpp
@@ -51,6 +51,14 @@ HebrewDecoder& hebrew_decoder()
     return *decoder;
 }
 
+CyrillicDecoder& cyrillic_decoder() // Returns a reference to the single CyrillicDecoder shared by all callers.
+{
+    static CyrillicDecoder* decoder = nullptr; // Persists across calls; starts out null.
+    if (!decoder) // First call only: create the instance (this function never deletes it).
+        decoder = new CyrillicDecoder;
+    return *decoder;
+}
+
 }
 
 Decoder* decoder_for(const String& a_encoding)
@@ -66,6 +74,8 @@ Decoder* decoder_for(const String& a_encoding)
         return &latin2_decoder();
     if (encoding.equals_ignoring_case("windows-1255"))
         return &hebrew_decoder();
+    if (encoding.equals_ignoring_case("windows-1251"))
+        return &cyrillic_decoder();
     dbgln("TextCodec: No decoder implemented for encoding '{}'", a_encoding);
     return nullptr;
 }
@@ -304,4 +314,27 @@ String HebrewDecoder::to_utf8(const StringView& input)
     return builder.to_string();
 }
 
+String CyrillicDecoder::to_utf8(const StringView& input) // Decodes Windows-1251 bytes from `input` one at a time and returns the built String.
+{
+    static constexpr Array<u32, 128> translation_table = { // Unicode code points for bytes 0x80..0xFF, indexed by (byte - 0x80).
+        0x402, 0x403, 0x201A, 0x453, 0x201E, 0x2026, 0x2020, 0x2021, 0x20AC, 0x2030, 0x409, 0x2039, 0x40A, 0x40C, 0x40B, 0x40F, // bytes 0x80-0x8F
+        0x452, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, 0xFFFD, 0x2122, 0x459, 0x203A, 0x45A, 0x45C, 0x45B, 0x45F, // bytes 0x90-0x9F; 0x98 maps to U+FFFD (replacement character)
+        0xA0, 0x40E, 0x45E, 0x408, 0xA4, 0x490, 0xA6, 0xA7, 0x401, 0xA9, 0x404, 0xAB, 0xAC, 0xAD, 0xAE, 0x407, // bytes 0xA0-0xAF
+        0xB0, 0xB1, 0x406, 0x456, 0x491, 0xB5, 0xB6, 0xB7, 0x451, 0x2116, 0x454, 0xBB, 0x458, 0x405, 0x455, 0x457, // bytes 0xB0-0xBF
+        0x410, 0x411, 0x412, 0x413, 0x414, 0x415, 0x416, 0x417, 0x418, 0x419, 0x41A, 0x41B, 0x41C, 0x41D, 0x41E, 0x41F, // bytes 0xC0-0xCF: U+0410..U+041F
+        0x420, 0x421, 0x422, 0x423, 0x424, 0x425, 0x426, 0x427, 0x428, 0x429, 0x42A, 0x42B, 0x42C, 0x42D, 0x42E, 0x42F, // bytes 0xD0-0xDF: U+0420..U+042F
+        0x430, 0x431, 0x432, 0x433, 0x434, 0x435, 0x436, 0x437, 0x438, 0x439, 0x43A, 0x43B, 0x43C, 0x43D, 0x43E, 0x43F, // bytes 0xE0-0xEF: U+0430..U+043F
+        0x440, 0x441, 0x442, 0x443, 0x444, 0x445, 0x446, 0x447, 0x448, 0x449, 0x44A, 0x44B, 0x44C, 0x44D, 0x44E, 0x44F // bytes 0xF0-0xFF: U+0440..U+044F
+    };
+    StringBuilder builder(input.length()); // NOTE(review): the argument is presumably an initial capacity hint - confirm against StringBuilder.
+    for (unsigned char ch : input) { // unsigned char keeps bytes >= 0x80 in the range 128..255 for the comparison and table index below.
+        if (ch < 0x80) { // Bytes below 0x80 are ASCII and are appended unchanged.
+            builder.append(ch);
+        } else { // Bytes 0x80..0xFF are looked up in the table.
+            builder.append_code_point(translation_table[ch - 0x80]); // ch - 0x80 is always within 0..127 here.
+        }
+    }
+    return builder.to_string();
+}
+
 }
diff --git a/Userland/Libraries/LibTextCodec/Decoder.h b/Userland/Libraries/LibTextCodec/Decoder.h
index b7b5c0ab0e..6ea4147b7e 100644
--- a/Userland/Libraries/LibTextCodec/Decoder.h
+++ b/Userland/Libraries/LibTextCodec/Decoder.h
@@ -43,6 +43,11 @@ public:
     virtual String to_utf8(const StringView&) override;
 };
 
+class CyrillicDecoder final : public Decoder { // Decoder that decoder_for() selects for the "windows-1251" encoding.
+public:
+    virtual String to_utf8(const StringView&) override; // Converts the given bytes via the Windows-1251 table; defined in Decoder.cpp.
+};
+
 Decoder* decoder_for(const String& encoding);
 String get_standardized_encoding(const String& encoding);
 bool is_standardized_encoding(const String& encoding);
author	Idan Horowitz <idan.horowitz@gmail.com>	2021-05-01 18:18:26 +0300
committer	Linus Groh <mail@linusgroh.de>	2021-05-01 17:59:08 +0200
commit	87cabda80d751dd1d36f9d23e06296b598898c41 (patch)
tree	2d5bf05cfe98defac454fe810820e041d17cd3cc /Userland/Libraries/LibTextCodec
parent	4b0098e52f860ac346c42ba8a3bfdebf1de37f56 (diff)
download	serenity-87cabda80d751dd1d36f9d23e06296b598898c41.zip