From 04cb00dc9a0104b2ad8cd0d52748997bdb52b8a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Julian=20Offenh=C3=A4user?= Date: Fri, 19 Aug 2022 19:46:30 +0200 Subject: LibPDF: Fix handling of differences array in custom encodings When looking up differences in the specified encoding, we previously didn't recognize a lot of characters, namely those that are referred to by a string in the PDF itself, like "/germandbls". We now create a mapping of those characters to the code points they are referring to, and correctly look them up when needed. --- Userland/Libraries/LibPDF/Encoding.cpp | 47 +++++++++++++++------------------- Userland/Libraries/LibPDF/Encoding.h | 1 + 2 files changed, 22 insertions(+), 26 deletions(-) diff --git a/Userland/Libraries/LibPDF/Encoding.cpp b/Userland/Libraries/LibPDF/Encoding.cpp index a7f2ce7a23..ad02369f2e 100644 --- a/Userland/Libraries/LibPDF/Encoding.cpp +++ b/Userland/Libraries/LibPDF/Encoding.cpp @@ -1,5 +1,6 @@ /* * Copyright (c) 2022, Matthew Olsson + * Copyright (c) 2022, Julian Offenhäuser * * SPDX-License-Identifier: BSD-2-Clause */ @@ -36,17 +37,10 @@ PDFErrorOr> Encoding::from_object(Document* document, No } auto encoding = adopt_ref(*new Encoding()); - - // Build a String -> Character mapping for handling the differences map - HashMap base_encoding_name_mapping; - - for (auto& [code_point, descriptor] : base_encoding->descriptors()) { + for (auto& [code_point, descriptor] : base_encoding->descriptors()) encoding->m_descriptors.set(code_point, descriptor); - base_encoding_name_mapping.set(descriptor.name, descriptor); - } auto differences_array = TRY(dict->get_array(document, CommonNames::Differences)); - HashMap differences_map; u16 current_code_point = 0; bool first = true; @@ -61,12 +55,7 @@ PDFErrorOr> Encoding::from_object(Document* document, No auto& object = item.get>(); auto name = object->cast()->name(); - auto character = base_encoding_name_mapping.get(name); - // FIXME: This should always have a value. This does cause crashes in certain - // documents, so we must be missing something here. - if (character.has_value()) - encoding->m_descriptors.set(current_code_point, character.value()); - + encoding->m_descriptors.set(current_code_point, { name, base_encoding->m_name_mapping.ensure(name) }); current_code_point++; } } @@ -78,9 +67,10 @@ NonnullRefPtr Encoding::standard_encoding() { static NonnullRefPtr encoding = adopt_ref(*new Encoding()); if (encoding->m_descriptors.is_empty()) { -#define ENUMERATE(string, name, standard_code, mac_code, win_code, pdf_code) \ - auto name##_code_point = *Utf8View(string##sv).begin(); \ - encoding->m_descriptors.set(standard_code, { string, name##_code_point }); +#define ENUMERATE(string, name, standard_code, mac_code, win_code, pdf_code) \ + auto name##_code_point = *Utf8View(string##sv).begin(); \ + encoding->m_descriptors.set(standard_code, { string, name##_code_point }); \ + encoding->m_name_mapping.set(#name, name##_code_point); ENUMERATE_LATIN_CHARACTER_SET(ENUMERATE) #undef ENUMERATE } @@ -94,7 +84,8 @@ NonnullRefPtr Encoding::mac_encoding() if (encoding->m_descriptors.is_empty()) { #define ENUMERATE(string, name, standard_code, mac_code, win_code, pdf_code) \ auto name##_code_point = *Utf8View(string##sv).begin(); \ - encoding->m_descriptors.set(mac_code, { string, name##_code_point }); + encoding->m_descriptors.set(mac_code, { string, name##_code_point }); \ + encoding->m_name_mapping.set(#name, name##_code_point); ENUMERATE_LATIN_CHARACTER_SET(ENUMERATE) #undef ENUMERATE } @@ -108,7 +99,8 @@ NonnullRefPtr Encoding::windows_encoding() if (encoding->m_descriptors.is_empty()) { #define ENUMERATE(string, name, standard_code, mac_code, win_code, pdf_code) \ auto name##_code_point = *Utf8View(string##sv).begin(); \ - encoding->m_descriptors.set(win_code, { string, name##_code_point }); + encoding->m_descriptors.set(win_code, { string, name##_code_point }); \ + encoding->m_name_mapping.set(#name, name##_code_point); ENUMERATE_LATIN_CHARACTER_SET(ENUMERATE) #undef ENUMERATE } @@ -122,7 +114,8 @@ NonnullRefPtr Encoding::pdf_doc_encoding() if (encoding->m_descriptors.is_empty()) { #define ENUMERATE(string, name, standard_code, mac_code, win_code, pdf_code) \ auto name##_code_point = *Utf8View(string##sv).begin(); \ - encoding->m_descriptors.set(pdf_code, { string, name##_code_point }); + encoding->m_descriptors.set(pdf_code, { string, name##_code_point }); \ + encoding->m_name_mapping.set(#name, name##_code_point); ENUMERATE_LATIN_CHARACTER_SET(ENUMERATE) #undef ENUMERATE } @@ -134,9 +127,10 @@ NonnullRefPtr Encoding::symbol_encoding() { static NonnullRefPtr encoding = adopt_ref(*new Encoding()); if (encoding->m_descriptors.is_empty()) { -#define ENUMERATE(string, name, code) \ - auto name##_code_point = *Utf8View(string##sv).begin(); \ - encoding->m_descriptors.set(code, { string, name##_code_point }); +#define ENUMERATE(string, name, code) \ + auto name##_code_point = *Utf8View(string##sv).begin(); \ + encoding->m_descriptors.set(code, { string, name##_code_point }); \ + encoding->m_name_mapping.set(#name, name##_code_point); ENUMERATE_SYMBOL_CHARACTER_SET(ENUMERATE) #undef ENUMERATE } @@ -148,9 +142,10 @@ NonnullRefPtr Encoding::zapf_encoding() { static NonnullRefPtr encoding = adopt_ref(*new Encoding()); if (encoding->m_descriptors.is_empty()) { -#define ENUMERATE(string, name, code) \ - auto name##_code_point = *Utf8View(string##sv).begin(); \ - encoding->m_descriptors.set(code, { string, name##_code_point }); +#define ENUMERATE(string, name, code) \ + auto name##_code_point = *Utf8View(string##sv).begin(); \ + encoding->m_descriptors.set(code, { string, name##_code_point }); \ + encoding->m_name_mapping.set(#name, name##_code_point); ENUMERATE_ZAPF_DINGBATS_CHARACTER_SET(ENUMERATE) #undef ENUMERATE } diff --git a/Userland/Libraries/LibPDF/Encoding.h b/Userland/Libraries/LibPDF/Encoding.h index 38209d9aae..6fd7a5376a 100644 --- a/Userland/Libraries/LibPDF/Encoding.h +++ b/Userland/Libraries/LibPDF/Encoding.h @@ -647,6 +647,7 @@ public: protected: HashMap m_descriptors; + HashMap m_name_mapping; }; } -- cgit v1.2.3