diff options
author | Matthew Olsson <matthewcolsson@gmail.com> | 2021-05-23 21:27:17 -0700 |
---|---|---|
committer | Ali Mohammad Pur <Ali.mpfard@gmail.com> | 2021-05-25 00:24:09 +0430 |
commit | 67b65dffa838bb657067f3a40ff6b7f8a600fe1b (patch) | |
tree | f7c2ea003be32b16f44da0b25ce6ffbee5d88e20 /Userland/Libraries/LibPDF | |
parent | a08922d2f6fa3d86aa8af0bbbfba95d657bea634 (diff) | |
download | serenity-67b65dffa838bb657067f3a40ff6b7f8a600fe1b.zip |
LibPDF: Handle string encodings
Strings can be encoded in either UTF-16BE or UTF-8. In either case,
the string starts with a few bytes that identify the encoding; these
bytes must be checked for and then removed from the final string.
Diffstat (limited to 'Userland/Libraries/LibPDF')
-rw-r--r-- | Userland/Libraries/LibPDF/CMakeLists.txt | 2 | ||||
-rw-r--r-- | Userland/Libraries/LibPDF/Parser.cpp | 25 |
2 files changed, 23 insertions, 4 deletions
diff --git a/Userland/Libraries/LibPDF/CMakeLists.txt b/Userland/Libraries/LibPDF/CMakeLists.txt
index 6d52363eb6..aa8b86d850 100644
--- a/Userland/Libraries/LibPDF/CMakeLists.txt
+++ b/Userland/Libraries/LibPDF/CMakeLists.txt
@@ -8,4 +8,4 @@ set(SOURCES
 )
 
 serenity_lib(LibPDF pdf)
-target_link_libraries(LibPDF LibC LibCore LibIPC LibGfx)
+target_link_libraries(LibPDF LibC LibCore LibIPC LibGfx LibTextCodec)
diff --git a/Userland/Libraries/LibPDF/Parser.cpp b/Userland/Libraries/LibPDF/Parser.cpp
index f78f5211c0..7aa434c102 100644
--- a/Userland/Libraries/LibPDF/Parser.cpp
+++ b/Userland/Libraries/LibPDF/Parser.cpp
@@ -9,6 +9,7 @@
 #include <LibPDF/Document.h>
 #include <LibPDF/Filter.h>
 #include <LibPDF/Parser.h>
+#include <LibTextCodec/Decoder.h>
 #include <ctype.h>
 #include <math.h>
 
@@ -422,9 +423,27 @@ NonnullRefPtr<StringObject> Parser::parse_string()
 {
     ScopeGuard guard([&] { consume_whitespace(); });
 
-    if (m_reader.matches('('))
-        return make_object<StringObject>(parse_literal_string(), false);
-    return make_object<StringObject>(parse_hex_string(), true);
+    String string;
+    bool is_binary_string;
+
+    if (m_reader.matches('(')) {
+        string = parse_literal_string();
+        is_binary_string = false;
+    } else {
+        string = parse_hex_string();
+        is_binary_string = true;
+    }
+
+    if (string.bytes().starts_with(Array<u8, 2> { 0xfe, 0xff })) {
+        // The string is encoded in UTF16-BE
+        string = TextCodec::decoder_for("utf-16be")->to_utf8(string.substring(2));
+    } else if (string.bytes().starts_with(Array<u8, 3> { 239, 187, 191 })) {
+        // The string is encoded in UTF-8. This is the default anyways, but if these bytes
+        // are explicitly included, we have to trim them
+        string = string.substring(3);
+    }
+
+    return make_object<StringObject>(string, is_binary_string);
 }
 
 String Parser::parse_literal_string()