summaryrefslogtreecommitdiff
path: root/Userland
diff options
context:
space:
mode:
authorLuke Wilde <lukew@serenityos.org>2022-02-11 20:38:44 +0000
committerAndreas Kling <kling@serenityos.org>2022-02-12 12:53:28 +0100
commit0e0f98a45e32f39d7f703269a8cabff1010843ca (patch)
tree8932e8e0ad7cf494c4885b6e2d9cd7ff3271e590 /Userland
parent835a344337ae8de897e6c4be9e82b834b340060b (diff)
downloadserenity-0e0f98a45e32f39d7f703269a8cabff1010843ca.zip
LibTextCodec: Add x-user-defined decoder
It's a pretty simple charset: the lower 128 byte values (0x00-0x7F) are standard ASCII, while the upper 128 byte values (0x80-0xFF) are mapped to a portion of the Unicode Private Use Area, specifically U+F780-U+F7FF. Google Maps uses this encoding for certain blobs.
Diffstat (limited to 'Userland')
-rw-r--r-- Userland/Libraries/LibTextCodec/Decoder.cpp | 25
-rw-r--r-- Userland/Libraries/LibTextCodec/Decoder.h | 5
2 files changed, 30 insertions, 0 deletions
diff --git a/Userland/Libraries/LibTextCodec/Decoder.cpp b/Userland/Libraries/LibTextCodec/Decoder.cpp
index a212136fa3..35be9c52ef 100644
--- a/Userland/Libraries/LibTextCodec/Decoder.cpp
+++ b/Userland/Libraries/LibTextCodec/Decoder.cpp
@@ -20,6 +20,7 @@ CyrillicDecoder s_cyrillic_decoder;
Koi8RDecoder s_koi8r_decoder;
Latin9Decoder s_latin9_decoder;
TurkishDecoder s_turkish_decoder;
+XUserDefinedDecoder s_x_user_defined_decoder;
}
Decoder* decoder_for(const String& a_encoding)
@@ -44,6 +45,8 @@ Decoder* decoder_for(const String& a_encoding)
return &s_latin9_decoder;
if (encoding.value().equals_ignoring_case("windows-1254"))
return &s_turkish_decoder;
+ if (encoding.value().equals_ignoring_case("x-user-defined"))
+ return &s_x_user_defined_decoder;
}
dbgln("TextCodec: No decoder implemented for encoding '{}'", a_encoding);
return nullptr;
@@ -466,4 +469,26 @@ void TurkishDecoder::process(StringView input, Function<void(u32)> on_code_point
}
}
+// Decodes x-user-defined input one byte at a time, reporting each resulting code point via on_code_point. Spec: https://encoding.spec.whatwg.org/#x-user-defined-decoder
+void XUserDefinedDecoder::process(StringView input, Function<void(u32)> on_code_point)
+{
+ auto convert_x_user_defined_to_utf8 = [](u8 ch) -> u32 { // NOTE: Despite the "utf8" in its name, this yields a code point value (u32), not UTF-8 bytes.
+ // 2. If byte is an ASCII byte, return a code point whose value is byte.
+ // https://infra.spec.whatwg.org/#ascii-byte
+ // An ASCII byte is a byte in the range 0x00 (NUL) to 0x7F (DEL), inclusive.
+ // NOTE: There is no check for ch >= 0x00 because ch is unsigned (u8), so that comparison would always be true.
+ if (ch <= 0x7f)
+ return ch;
+
+ // 3. Return a code point whose value is 0xF780 + byte − 0x80 (so bytes 0x80..0xFF map to 0xF780..0xF7FF).
+ return 0xF780 + ch - 0x80;
+ };
+
+ for (auto ch : input) {
+ on_code_point(convert_x_user_defined_to_utf8(ch));
+ }
+
+ // 1. If byte is end-of-queue, return finished. (Implicit here: the loop above simply ends once input is exhausted.)
+}
+
}
diff --git a/Userland/Libraries/LibTextCodec/Decoder.h b/Userland/Libraries/LibTextCodec/Decoder.h
index a4b1e68dd2..7c7c47c7fb 100644
--- a/Userland/Libraries/LibTextCodec/Decoder.h
+++ b/Userland/Libraries/LibTextCodec/Decoder.h
@@ -67,6 +67,11 @@ public:
virtual void process(StringView, Function<void(u32)> on_code_point) override;
};
+class XUserDefinedDecoder final : public Decoder {
+public:
+ virtual void process(StringView, Function<void(u32)> on_code_point) override; // Calls on_code_point once for every byte of the input, in order.
+};
+
Decoder* decoder_for(String const& encoding);
Optional<String> get_standardized_encoding(const String& encoding);