summary | refs | log | tree | commit | diff
path: root/Userland/Libraries/LibPDF
diff options
context:
space:
mode:
author: Matthew Olsson <matthewcolsson@gmail.com> 2021-05-23 21:27:17 -0700
committer: Ali Mohammad Pur <Ali.mpfard@gmail.com> 2021-05-25 00:24:09 +0430
commit: 67b65dffa838bb657067f3a40ff6b7f8a600fe1b (patch)
tree: f7c2ea003be32b16f44da0b25ce6ffbee5d88e20 /Userland/Libraries/LibPDF
parent: a08922d2f6fa3d86aa8af0bbbfba95d657bea634 (diff)
download: serenity-67b65dffa838bb657067f3a40ff6b7f8a600fe1b.zip
LibPDF: Handle string encodings
Strings can be encoded in either UTF-16BE or UTF-8. In either case, the string begins with a few marker bytes (a byte order mark: 0xFE 0xFF for UTF-16BE, 0xEF 0xBB 0xBF for UTF-8) that identify the encoding; these bytes must be checked and then removed from the final string.
Diffstat (limited to 'Userland/Libraries/LibPDF')
-rw-r--r-- Userland/Libraries/LibPDF/CMakeLists.txt | 2
-rw-r--r-- Userland/Libraries/LibPDF/Parser.cpp | 25
2 files changed, 23 insertions, 4 deletions
diff --git a/Userland/Libraries/LibPDF/CMakeLists.txt b/Userland/Libraries/LibPDF/CMakeLists.txt
index 6d52363eb6..aa8b86d850 100644
--- a/Userland/Libraries/LibPDF/CMakeLists.txt
+++ b/Userland/Libraries/LibPDF/CMakeLists.txt
@@ -8,4 +8,4 @@ set(SOURCES
)
serenity_lib(LibPDF pdf)
-target_link_libraries(LibPDF LibC LibCore LibIPC LibGfx)
+target_link_libraries(LibPDF LibC LibCore LibIPC LibGfx LibTextCodec)
diff --git a/Userland/Libraries/LibPDF/Parser.cpp b/Userland/Libraries/LibPDF/Parser.cpp
index f78f5211c0..7aa434c102 100644
--- a/Userland/Libraries/LibPDF/Parser.cpp
+++ b/Userland/Libraries/LibPDF/Parser.cpp
@@ -9,6 +9,7 @@
#include <LibPDF/Document.h>
#include <LibPDF/Filter.h>
#include <LibPDF/Parser.h>
+#include <LibTextCodec/Decoder.h>
#include <ctype.h>
#include <math.h>
@@ -422,9 +423,27 @@ NonnullRefPtr<StringObject> Parser::parse_string()
{
ScopeGuard guard([&] { consume_whitespace(); });
- if (m_reader.matches('('))
- return make_object<StringObject>(parse_literal_string(), false);
- return make_object<StringObject>(parse_hex_string(), true);
+ String string;
+ bool is_binary_string;
+
+ if (m_reader.matches('(')) {
+ string = parse_literal_string();
+ is_binary_string = false;
+ } else {
+ string = parse_hex_string();
+ is_binary_string = true;
+ }
+
+ if (string.bytes().starts_with(Array<u8, 2> { 0xfe, 0xff })) {
+ // The string is encoded in UTF16-BE
+ string = TextCodec::decoder_for("utf-16be")->to_utf8(string.substring(2));
+ } else if (string.bytes().starts_with(Array<u8, 3> { 239, 187, 191 })) {
+ // The string is encoded in UTF-8. This is the default anyways, but if these bytes
+ // are explicitly included, we have to trim them
+ string = string.substring(3);
+ }
+
+ return make_object<StringObject>(string, is_binary_string);
}
String Parser::parse_literal_string()