LibTextCodec: Ignore BYTE ORDER MARK at the start of UTF-8/UTF-16 strings

Previously, the byte order mark (BOM) was included as part of the output text, which confused the HTML parser. Nobody needs the BOM once the codec has been identified, so it is now removed when converting to UTF-8.
author: Sam Atkins <atkinssj@serenityos.org> 2021-09-15 15:33:30 +0100
committer: Andreas Kling <kling@serenityos.org> 2021-09-15 17:00:18 +0200
commit: d7ffa5142498095644055f11aba99bad351221d5 (patch)
tree: c91197780be26e9b2d583b84bba7b95e720af0fd /Userland/Libraries/LibTextCodec
parent: 9ec02e7137dc7b8b075cbf8189182a591ff50f6c (diff)
download: serenity-d7ffa5142498095644055f11aba99bad351221d5.zip
1 files changed, 15 insertions, 3 deletions
diff --git a/Userland/Libraries/LibTextCodec/Decoder.cpp b/Userland/Libraries/LibTextCodec/Decoder.cpp
index 262489ee7a..7a1bc03c01 100644
--- a/Userland/Libraries/LibTextCodec/Decoder.cpp
+++ b/Userland/Libraries/LibTextCodec/Decoder.cpp
@@ -208,7 +208,13 @@ void UTF8Decoder::process(const StringView& input, Function<void(u32)> on_code_p
 
 String UTF8Decoder::to_utf8(const StringView& input)
 {
-    return input;
+    // Discard the BOM
+    auto bomless_input = input;
+    if (auto bytes = input.bytes(); bytes.size() >= 3 && bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF) {
+        bomless_input = input.substring_view(3);
+    }
+
+    return bomless_input;
 }
 
 void UTF16BEDecoder::process(const StringView& input, Function<void(u32)> on_code_point)
@@ -222,8 +228,14 @@ void UTF16BEDecoder::process(const StringView& input, Function<void(u32)> on_cod
 
 String UTF16BEDecoder::to_utf8(const StringView& input)
 {
-    StringBuilder builder(input.length() / 2);
-    process(input, [&builder](u32 c) { builder.append_code_point(c); });
+    // Discard the BOM
+    auto bomless_input = input;
+    if (auto bytes = input.bytes(); bytes.size() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF) {
+        bomless_input = input.substring_view(2);
+    }
+
+    StringBuilder builder(bomless_input.length() / 2);
+    process(bomless_input, [&builder](u32 c) { builder.append_code_point(c); });
     return builder.to_string();
 }
author	Sam Atkins <atkinssj@serenityos.org>	2021-09-15 15:33:30 +0100
committer	Andreas Kling <kling@serenityos.org>	2021-09-15 17:00:18 +0200
commit	d7ffa5142498095644055f11aba99bad351221d5 (patch)
tree	c91197780be26e9b2d583b84bba7b95e720af0fd /Userland/Libraries/LibTextCodec
parent	9ec02e7137dc7b8b075cbf8189182a591ff50f6c (diff)
download	serenity-d7ffa5142498095644055f11aba99bad351221d5.zip