diff options
author | Sam Atkins <atkinssj@serenityos.org> | 2021-09-15 15:33:30 +0100 |
---|---|---|
committer | Andreas Kling <kling@serenityos.org> | 2021-09-15 17:00:18 +0200 |
commit | d7ffa5142498095644055f11aba99bad351221d5 (patch) | |
tree | c91197780be26e9b2d583b84bba7b95e720af0fd /Userland/Libraries/LibTextCodec | |
parent | 9ec02e7137dc7b8b075cbf8189182a591ff50f6c (diff) | |
download | serenity-d7ffa5142498095644055f11aba99bad351221d5.zip |
LibTextCodec: Ignore BYTE ORDER MARK at the start of utf8/16 strings
Before, this was getting included as part of the output text, which was
confusing the HTML parser. Nobody needs the BOM after we have identified
the codec, so now we remove it when converting to UTF-8.
Diffstat (limited to 'Userland/Libraries/LibTextCodec')
-rw-r--r-- | Userland/Libraries/LibTextCodec/Decoder.cpp | 18 |
1 files changed, 15 insertions, 3 deletions
diff --git a/Userland/Libraries/LibTextCodec/Decoder.cpp b/Userland/Libraries/LibTextCodec/Decoder.cpp
index 262489ee7a..7a1bc03c01 100644
--- a/Userland/Libraries/LibTextCodec/Decoder.cpp
+++ b/Userland/Libraries/LibTextCodec/Decoder.cpp
@@ -208,7 +208,13 @@ void UTF8Decoder::process(const StringView& input, Function<void(u32)> on_code_p
 
 String UTF8Decoder::to_utf8(const StringView& input)
 {
-    return input;
+    // Discard the BOM
+    auto bomless_input = input;
+    if (auto bytes = input.bytes(); bytes.size() >= 3 && bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF) {
+        bomless_input = input.substring_view(3);
+    }
+
+    return bomless_input;
 }
 
 void UTF16BEDecoder::process(const StringView& input, Function<void(u32)> on_code_point)
@@ -222,8 +228,14 @@ void UTF16BEDecoder::process(const StringView& input, Function<void(u32)> on_cod
 
 String UTF16BEDecoder::to_utf8(const StringView& input)
 {
-    StringBuilder builder(input.length() / 2);
-    process(input, [&builder](u32 c) { builder.append_code_point(c); });
+    // Discard the BOM
+    auto bomless_input = input;
+    if (auto bytes = input.bytes(); bytes.size() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF) {
+        bomless_input = input.substring_view(2);
+    }
+
+    StringBuilder builder(bomless_input.length() / 2);
+    process(bomless_input, [&builder](u32 c) { builder.append_code_point(c); });
     return builder.to_string();
 }
 