LibPDF: Simplify Encoding to align with simple font requirements

All "Simple Fonts" in PDF (all but Type0 fonts) have the property that glyphs are selected with single-byte character codes. This means that the Encoding objects should use u8 for representing these character codes. Moreover, and as mentioned in a previous commit, there is no need to store the Unicode code point associated with a character (which was in turn wrongly associated with a glyph).

This commit greatly simplifies the Encoding class. Namely, it:

 * Removes the unnecessary CharDescriptor class.
 * Changes the internal maps to be u8 -> FlyString and vice versa, effectively providing two-way lookups.
 * Adds a new method to set a two-way u8 -> FlyString mapping and uses it in all possible places.
 * Simplifies the creation of Encoding objects.
 * Changes how the WinAnsi special treatment for bullet points is implemented.
author: Rodrigo Tobar <rtobarc@gmail.com> 2023-01-23 23:56:43 +0800
committer: Andreas Kling <kling@serenityos.org> 2023-02-02 14:50:38 +0100
commit: 286e3e6872e9612b7e419f6aea3ee0ba5703bc3e (patch)
tree: 64dc7ac688023b123fbed80055e207c539c25b6c /Userland/Libraries/LibPDF
parent: fb0c3a9e18cbd55423b1dc780e5847a5202aa3e5 (diff)
download: serenity-286e3e6872e9612b7e419f6aea3ee0ba5703bc3e.zip
4 files changed, 43 insertions, 64 deletions
diff --git a/Userland/Libraries/LibPDF/Encoding.cpp b/Userland/Libraries/LibPDF/Encoding.cpp
index 9c951da309..c21767afa3 100644
--- a/Userland/Libraries/LibPDF/Encoding.cpp
+++ b/Userland/Libraries/LibPDF/Encoding.cpp
@@ -11,15 +11,9 @@
 
 namespace PDF {
 
-PDFErrorOr<NonnullRefPtr<Encoding>> Encoding::create(HashMap<u16, CharDescriptor> descriptors)
+NonnullRefPtr<Encoding> Encoding::create()
 {
-    auto encoding = adopt_ref(*new Encoding());
-    encoding->m_descriptors = descriptors;
-
-    for (auto& descriptor : descriptors)
-        encoding->m_name_mapping.set(descriptor.value.name, descriptor.value.code_point);
-
-    return encoding;
+    return adopt_ref(*new Encoding());
 }
 
 PDFErrorOr<NonnullRefPtr<Encoding>> Encoding::from_object(Document* document, NonnullRefPtr<Object> const& obj)
@@ -49,8 +43,8 @@ PDFErrorOr<NonnullRefPtr<Encoding>> Encoding::from_object(Document* document, No
 
     auto encoding = adopt_ref(*new Encoding());
 
-    encoding->m_descriptors = base_encoding->descriptors();
-    encoding->m_name_mapping = base_encoding->name_mapping();
+    encoding->m_descriptors = base_encoding->m_descriptors;
+    encoding->m_name_mapping = base_encoding->m_name_mapping;
 
     auto differences_array = TRY(dict->get_array(document, CommonNames::Differences));
 
@@ -66,8 +60,7 @@ PDFErrorOr<NonnullRefPtr<Encoding>> Encoding::from_object(Document* document, No
             VERIFY(!first);
             auto& object = item.get<NonnullRefPtr<Object>>();
             auto name = object->cast<NameObject>()->name();
-
-            encoding->m_descriptors.set(current_code_point, { name, base_encoding->m_name_mapping.ensure(name) });
+            encoding->set(current_code_point, name);
             current_code_point++;
         }
     }
@@ -75,13 +68,18 @@ PDFErrorOr<NonnullRefPtr<Encoding>> Encoding::from_object(Document* document, No
     return encoding;
 }
 
+void Encoding::set(CharCodeType char_code, DeprecatedFlyString const& glyph_name)
+{
+    m_descriptors.set(char_code, glyph_name);
+    m_name_mapping.set(glyph_name, char_code);
+}
+
 NonnullRefPtr<Encoding> Encoding::standard_encoding()
 {
     static NonnullRefPtr<Encoding> encoding = adopt_ref(*new Encoding());
     if (encoding->m_descriptors.is_empty()) {
 #define ENUMERATE(name, standard_code, mac_code, win_code, pdf_code) \
-    encoding->m_descriptors.set(standard_code, { #name, 0 });        \
-    encoding->m_name_mapping.set(#name, standard_code);
+    encoding->set(standard_code, #name);
         ENUMERATE_LATIN_CHARACTER_SET(ENUMERATE)
 #undef ENUMERATE
     }
@@ -94,8 +92,7 @@ NonnullRefPtr<Encoding> Encoding::mac_encoding()
     static NonnullRefPtr<Encoding> encoding = adopt_ref(*new Encoding());
     if (encoding->m_descriptors.is_empty()) {
 #define ENUMERATE(name, standard_code, mac_code, win_code, pdf_code) \
-    encoding->m_descriptors.set(mac_code, { #name, 0 });             \
-    encoding->m_name_mapping.set(#name, mac_code);
+    encoding->set(mac_code, #name);
         ENUMERATE_LATIN_CHARACTER_SET(ENUMERATE)
 #undef ENUMERATE
     }
@@ -108,13 +105,20 @@ NonnullRefPtr<Encoding> Encoding::windows_encoding()
     static NonnullRefPtr<Encoding> encoding = adopt_ref(*new Encoding());
     if (encoding->m_descriptors.is_empty()) {
 #define ENUMERATE(name, standard_code, mac_code, win_code, pdf_code) \
-    encoding->m_descriptors.set(win_code, { #name, 0 });             \
-    encoding->m_name_mapping.set(#name, win_code);
+    encoding->set(win_code, #name);
         ENUMERATE_LATIN_CHARACTER_SET(ENUMERATE)
 #undef ENUMERATE
-        encoding->m_windows = true;
-    }
 
+        // PDF Annex D table D.2, note 3:
+        // In WinAnsiEncoding, all unused codes greater than 40 (octal) map to the bullet character. However, only
+        // code 225 (octal) shall be specifically assigned to the bullet character; other codes are subject to future re-assignment.
+        //
+        // Since CharCodeType is u8 *and* we need to include 255, we iterate in reverse order to have more readable code.
+        for (CharCodeType char_code = 255; char_code > 040; char_code--) {
+            if (!encoding->m_descriptors.contains(char_code))
+                encoding->set(char_code, "bullet");
+        }
+    }
     return encoding;
 }
 
@@ -123,8 +127,7 @@ NonnullRefPtr<Encoding> Encoding::pdf_doc_encoding()
     static NonnullRefPtr<Encoding> encoding = adopt_ref(*new Encoding());
     if (encoding->m_descriptors.is_empty()) {
 #define ENUMERATE(name, standard_code, mac_code, win_code, pdf_code) \
-    encoding->m_descriptors.set(pdf_code, { #name, 0 });             \
-    encoding->m_name_mapping.set(#name, pdf_code);
+    encoding->set(pdf_code, #name);
         ENUMERATE_LATIN_CHARACTER_SET(ENUMERATE)
 #undef ENUMERATE
     }
@@ -136,9 +139,8 @@ NonnullRefPtr<Encoding> Encoding::symbol_encoding()
 {
     static NonnullRefPtr<Encoding> encoding = adopt_ref(*new Encoding());
     if (encoding->m_descriptors.is_empty()) {
-#define ENUMERATE(name, code)                        \
-    encoding->m_descriptors.set(code, { #name, 0 }); \
-    encoding->m_name_mapping.set(#name, code);
+#define ENUMERATE(name, code) \
+    encoding->set(code, #name);
         ENUMERATE_SYMBOL_CHARACTER_SET(ENUMERATE)
 #undef ENUMERATE
     }
@@ -150,21 +152,14 @@ NonnullRefPtr<Encoding> Encoding::zapf_encoding()
 {
     static NonnullRefPtr<Encoding> encoding = adopt_ref(*new Encoding());
     if (encoding->m_descriptors.is_empty()) {
-#define ENUMERATE(name, code)                        \
-    encoding->m_descriptors.set(code, { #name, 0 }); \
-    encoding->m_name_mapping.set(#name, code);
+#define ENUMERATE(name, code) \
+    encoding->set(code, #name);
         ENUMERATE_ZAPF_DINGBATS_CHARACTER_SET(ENUMERATE)
 #undef ENUMERATE
     }
-
     return encoding;
 }
 
-CharDescriptor const& Encoding::get_char_code_descriptor(u16 char_code) const
-{
-    return const_cast<Encoding*>(this)->m_descriptors.ensure(char_code);
-}
-
 u16 Encoding::get_char_code(DeprecatedString const& name) const
 {
     auto code_iterator = m_name_mapping.find(name);
@@ -173,12 +168,4 @@ u16 Encoding::get_char_code(DeprecatedString const& name) const
     return 0;
 }
 
-bool Encoding::should_map_to_bullet(u16 char_code) const
-{
-    // PDF Annex D table D.2, note 3:
-    // In WinAnsiEncoding, all unused codes greater than 40 (octal) map to the bullet character. However, only
-    // code 225 (octal) shall be specifically assigned to the bullet character; other codes are subject to future re-assignment.
-    return m_windows && char_code > 040 && !m_descriptors.contains(char_code);
-}
-
 }
diff --git a/Userland/Libraries/LibPDF/Encoding.h b/Userland/Libraries/LibPDF/Encoding.h
index c2f6af07d8..d25a52b82c 100644
--- a/Userland/Libraries/LibPDF/Encoding.h
+++ b/Userland/Libraries/LibPDF/Encoding.h
@@ -625,14 +625,10 @@
 
 namespace PDF {
 
-struct CharDescriptor {
-    DeprecatedString name;
-    u32 code_point;
-};
-
 class Encoding : public RefCounted<Encoding> {
 public:
-    static PDFErrorOr<NonnullRefPtr<Encoding>> create(HashMap<u16, CharDescriptor> descriptors);
+    using CharCodeType = u8;
+    static NonnullRefPtr<Encoding> create();
     static PDFErrorOr<NonnullRefPtr<Encoding>> from_object(Document*, NonnullRefPtr<Object> const&);
 
     static NonnullRefPtr<Encoding> standard_encoding();
@@ -642,17 +638,14 @@ public:
     static NonnullRefPtr<Encoding> symbol_encoding();
     static NonnullRefPtr<Encoding> zapf_encoding();
 
-    HashMap<u16, CharDescriptor> const& descriptors() const { return m_descriptors; }
-    HashMap<DeprecatedString, u16> const& name_mapping() const { return m_name_mapping; }
+    HashMap<DeprecatedString, CharCodeType> const& name_mapping() const { return m_name_mapping; }
 
     u16 get_char_code(DeprecatedString const&) const;
-    CharDescriptor const& get_char_code_descriptor(u16 char_code) const;
-
-    bool should_map_to_bullet(u16 char_code) const;
+    void set(CharCodeType char_code, DeprecatedFlyString const& glyph_name);
 
 protected:
-    HashMap<u16, CharDescriptor> m_descriptors;
-    HashMap<DeprecatedString, u16> m_name_mapping;
+    HashMap<CharCodeType, DeprecatedFlyString> m_descriptors;
+    HashMap<DeprecatedString, CharCodeType> m_name_mapping;
 
     bool m_windows { false };
 };
diff --git a/Userland/Libraries/LibPDF/Fonts/CFF.cpp b/Userland/Libraries/LibPDF/Fonts/CFF.cpp
index 18e2290787..2aad6a1b07 100644
--- a/Userland/Libraries/LibPDF/Fonts/CFF.cpp
+++ b/Userland/Libraries/LibPDF/Fonts/CFF.cpp
@@ -121,19 +121,19 @@ PDFErrorOr<NonnullRefPtr<CFF>> CFF::create(ReadonlyBytes const& cff_bytes, RefPt
         }
         cff->set_encoding(move(encoding));
     } else {
-        HashMap<u16, CharDescriptor> descriptors;
+        auto encoding = Encoding::create();
         for (size_t i = 0; i < glyphs.size(); i++) {
             if (i == 0) {
                 TRY(cff->add_glyph(0, move(glyphs[0])));
-                descriptors.set(0, CharDescriptor { ".notdef", 0 });
+                encoding->set(0, ".notdef");
                 continue;
             }
             auto code = encoding_codes[i - 1];
             auto char_name = charset[i - 1];
             TRY(cff->add_glyph(code, move(glyphs[i])));
-            descriptors.set(code, CharDescriptor { char_name, code });
+            encoding->set(code, char_name);
         }
-        cff->set_encoding(TRY(Encoding::create(descriptors)));
+        cff->set_encoding(move(encoding));
     }
 
     return cff;
diff --git a/Userland/Libraries/LibPDF/Fonts/PS1FontProgram.cpp b/Userland/Libraries/LibPDF/Fonts/PS1FontProgram.cpp
index 99e70ba799..534ccdd77a 100644
--- a/Userland/Libraries/LibPDF/Fonts/PS1FontProgram.cpp
+++ b/Userland/Libraries/LibPDF/Fonts/PS1FontProgram.cpp
@@ -36,19 +36,18 @@ PDFErrorOr<NonnullRefPtr<Type1FontProgram>> PS1FontProgram::create(ReadonlyBytes
         if (TRY(parse_word(reader)) == "StandardEncoding") {
             font_program->set_encoding(Encoding::standard_encoding());
         } else {
-            HashMap<u16, CharDescriptor> descriptors;
-
+            auto encoding = Encoding::create();
             while (reader.remaining()) {
                 auto word = TRY(parse_word(reader));
                 if (word == "readonly") {
                     break;
                 } else if (word == "dup") {
-                    u32 char_code = TRY(parse_int(reader));
+                    u8 char_code = TRY(parse_int(reader));
                     auto name = TRY(parse_word(reader));
-                    descriptors.set(char_code, { name.starts_with('/') ? name.substring_view(1) : name.view(), char_code });
+                    encoding->set(char_code, name.starts_with('/') ? name.substring_view(1) : name.view());
                 }
             }
-            font_program->set_encoding(TRY(Encoding::create(descriptors)));
+            font_program->set_encoding(move(encoding));
         }
     }
author	Rodrigo Tobar <rtobarc@gmail.com>	2023-01-23 23:56:43 +0800
committer	Andreas Kling <kling@serenityos.org>	2023-02-02 14:50:38 +0100
commit	286e3e6872e9612b7e419f6aea3ee0ba5703bc3e (patch)
tree	64dc7ac688023b123fbed80055e207c539c25b6c /Userland/Libraries/LibPDF
parent	fb0c3a9e18cbd55423b1dc780e5847a5202aa3e5 (diff)
download	serenity-286e3e6872e9612b7e419f6aea3ee0ba5703bc3e.zip