LibTextCodec: Add decoder function that overrides given decoder on BOM

This function takes a user-provided decoder and will only use it if no BOM is in the input. If there is a BOM, it will ignore the given decoder and instead decode the input with the appropriate Unicode decoder for the detected BOM. This is only to be used where it's specifically needed, for example XHR uses this for compatibility with deployed content. As such, it has an obnoxious name to discourage usage.
author: Luke Wilde <lukew@serenityos.org> 2022-02-11 21:02:29 +0000
committer: Andreas Kling <kling@serenityos.org> 2022-02-12 12:53:28 +0100
commit: 835a344337ae8de897e6c4be9e82b834b340060b (patch)
tree: 8693413104689e442937ee452f26b252a47c0151 /Userland/Libraries/LibTextCodec
parent: 94965ba28d9592d7c68f6e438bbc1e244533fc3c (diff)
download: serenity-835a344337ae8de897e6c4be9e82b834b340060b.zip
2 files changed, 28 insertions, 0 deletions
diff --git a/Userland/Libraries/LibTextCodec/Decoder.cpp b/Userland/Libraries/LibTextCodec/Decoder.cpp
index 0aba9c2599..a212136fa3 100644
--- a/Userland/Libraries/LibTextCodec/Decoder.cpp
+++ b/Userland/Libraries/LibTextCodec/Decoder.cpp
@@ -176,6 +176,30 @@ Decoder* bom_sniff_to_decoder(StringView input)
     return nullptr;
 }
 
+// https://encoding.spec.whatwg.org/#decode
+String convert_input_to_utf8_using_given_decoder_unless_there_is_a_byte_order_mark(Decoder& fallback_decoder, StringView input)
+{
+    Decoder* actual_decoder = &fallback_decoder;
+
+    // 1. Let BOMEncoding be the result of BOM sniffing ioQueue.
+    // 2. If BOMEncoding is non-null:
+    if (auto* unicode_decoder = bom_sniff_to_decoder(input); unicode_decoder) {
+        // 1. Set encoding to BOMEncoding.
+        actual_decoder = unicode_decoder;
+
+        // 2. Read three bytes from ioQueue, if BOMEncoding is UTF-8; otherwise read two bytes. (Do nothing with those bytes.)
+        // FIXME: I imagine this will be pretty slow for large inputs, as it's regenerating the input without the first 2/3 bytes. (NOTE(review): substring_view presumably only re-slices the view without copying -- confirm before acting on this.)
+        input = input.substring_view(unicode_decoder == &s_utf8_decoder ? 3 : 2);
+    }
+
+    VERIFY(actual_decoder); // Holds either &fallback_decoder or the non-null sniffed decoder, so this guards against future edits.
+
+    // FIXME: 3. Process a queue with an instance of encoding’s decoder, ioQueue, output, and "replacement".
+    //        This isn't the exact same as the spec, especially the error mode of "replacement", which we don't have the concept of yet.
+    // 4. Return output.
+    return actual_decoder->to_utf8(input);
+}
+
 String Decoder::to_utf8(StringView input)
 {
     StringBuilder builder(input.length());
diff --git a/Userland/Libraries/LibTextCodec/Decoder.h b/Userland/Libraries/LibTextCodec/Decoder.h
index 7edf2633a1..a4b1e68dd2 100644
--- a/Userland/Libraries/LibTextCodec/Decoder.h
+++ b/Userland/Libraries/LibTextCodec/Decoder.h
@@ -73,4 +73,8 @@ Optional<String> get_standardized_encoding(const String& encoding);
 // This returns the appropriate Unicode decoder for the sniffed BOM or nullptr if there is no appropriate decoder.
 Decoder* bom_sniff_to_decoder(StringView);
 
+// NOTE: This has an obnoxious name to discourage usage. Only use this if you absolutely must! For example, XHR in LibWeb uses this.
+// This will use the given decoder unless there is a byte order mark in the input, in which case we will instead use the appropriate Unicode decoder.
+String convert_input_to_utf8_using_given_decoder_unless_there_is_a_byte_order_mark(Decoder&, StringView);
+
 }
author	Luke Wilde <lukew@serenityos.org>	2022-02-11 21:02:29 +0000
committer	Andreas Kling <kling@serenityos.org>	2022-02-12 12:53:28 +0100
commit	835a344337ae8de897e6c4be9e82b834b340060b (patch)
tree	8693413104689e442937ee452f26b252a47c0151 /Userland/Libraries/LibTextCodec
parent	94965ba28d9592d7c68f6e438bbc1e244533fc3c (diff)
download	serenity-835a344337ae8de897e6c4be9e82b834b340060b.zip