ソースを参照

LibPDF: Stop converting encodings in object parser

Per 1.7 spec 3.8.1, there are multiple logical text string types:
* text strings
* ASCII strings
* byte strings

Text strings can be in UTF-16BE, PDFDocEncoding, or (since PDF 2.0)
UTF-8.

Byte strings, however, shouldn't be converted; they should be treated as
binary data.

This makes us no longer convert strings used for drawing page text.
TABLE 5.6 "Text-showing operators" lists the operands for text-showing
operators as just "string", not "text string" (even though these strings
are, confusingly, called "text strings" in the body text), so leaving
them unconverted is correct (and matches other viewers).

We also no longer incorrectly convert strings used for crypto data
(such as passwords) if they start with a UTF-16BE or UTF-8 marker.

No behavior change for outlines and info dict entries.

https://pdfa.org/understanding-utf-8-in-pdf-2-0/ has a good overview of
this.

(ASCII strings only contain ASCII characters and behave the same
either way.)
Nico Weber 1 年間 前
コミット
e39a790c82

+ 30 - 7
Userland/Libraries/LibPDF/Document.cpp

@@ -7,6 +7,7 @@
 #include <LibPDF/CommonNames.h>
 #include <LibPDF/CommonNames.h>
 #include <LibPDF/Document.h>
 #include <LibPDF/Document.h>
 #include <LibPDF/Parser.h>
 #include <LibPDF/Parser.h>
+#include <LibTextCodec/Decoder.h>
 
 
 namespace PDF {
 namespace PDF {
 
 
@@ -36,32 +37,32 @@ DeprecatedString OutlineItem::to_deprecated_string(int indent) const
 
 
 PDFErrorOr<Optional<DeprecatedString>> InfoDict::title() const
 PDFErrorOr<Optional<DeprecatedString>> InfoDict::title() const
 {
 {
-    return get(CommonNames::Title);
+    return get_text(CommonNames::Title);
 }
 }
 
 
 PDFErrorOr<Optional<DeprecatedString>> InfoDict::author() const
 PDFErrorOr<Optional<DeprecatedString>> InfoDict::author() const
 {
 {
-    return get(CommonNames::Author);
+    return get_text(CommonNames::Author);
 }
 }
 
 
 PDFErrorOr<Optional<DeprecatedString>> InfoDict::subject() const
 PDFErrorOr<Optional<DeprecatedString>> InfoDict::subject() const
 {
 {
-    return get(CommonNames::Subject);
+    return get_text(CommonNames::Subject);
 }
 }
 
 
 PDFErrorOr<Optional<DeprecatedString>> InfoDict::keywords() const
 PDFErrorOr<Optional<DeprecatedString>> InfoDict::keywords() const
 {
 {
-    return get(CommonNames::Keywords);
+    return get_text(CommonNames::Keywords);
 }
 }
 
 
 PDFErrorOr<Optional<DeprecatedString>> InfoDict::creator() const
 PDFErrorOr<Optional<DeprecatedString>> InfoDict::creator() const
 {
 {
-    return get(CommonNames::Creator);
+    return get_text(CommonNames::Creator);
 }
 }
 
 
 PDFErrorOr<Optional<DeprecatedString>> InfoDict::producer() const
 PDFErrorOr<Optional<DeprecatedString>> InfoDict::producer() const
 {
 {
-    return get(CommonNames::Producer);
+    return get_text(CommonNames::Producer);
 }
 }
 
 
 PDFErrorOr<Optional<DeprecatedString>> InfoDict::creation_date() const
 PDFErrorOr<Optional<DeprecatedString>> InfoDict::creation_date() const
@@ -74,6 +75,28 @@ PDFErrorOr<Optional<DeprecatedString>> InfoDict::modification_date() const
     return get(CommonNames::ModDate);
     return get(CommonNames::ModDate);
 }
 }
 
 
+PDFErrorOr<Optional<DeprecatedString>> InfoDict::get_text(DeprecatedFlyString const& name) const
+{
+    return TRY(get(name)).map(Document::text_string_to_utf8);
+}
+
+DeprecatedString Document::text_string_to_utf8(DeprecatedString const& text_string)
+{
+    if (text_string.bytes().starts_with(Array<u8, 2> { 0xfe, 0xff })) {
+        // The string is encoded in UTF16-BE
+        return TextCodec::decoder_for("utf-16be"sv)->to_utf8(text_string).release_value_but_fixme_should_propagate_errors().to_deprecated_string();
+    }
+
+    if (text_string.bytes().starts_with(Array<u8, 3> { 239, 187, 191 })) {
+        // The string is encoded in UTF-8.
+        return text_string.substring(3);
+    }
+
+    // FIXME: Convert from PDFDocEncoding to UTF-8.
+
+    return text_string;
+}
+
 PDFErrorOr<NonnullRefPtr<Document>> Document::create(ReadonlyBytes bytes)
 PDFErrorOr<NonnullRefPtr<Document>> Document::create(ReadonlyBytes bytes)
 {
 {
     auto parser = adopt_ref(*new DocumentParser({}, bytes));
     auto parser = adopt_ref(*new DocumentParser({}, bytes));
@@ -544,7 +567,7 @@ PDFErrorOr<NonnullRefPtr<OutlineItem>> Document::build_outline_item(NonnullRefPt
         outline_item->children = move(children);
         outline_item->children = move(children);
     }
     }
 
 
-    outline_item->title = TRY(outline_item_dict->get_string(this, CommonNames::Title))->string();
+    outline_item->title = text_string_to_utf8(TRY(outline_item_dict->get_string(this, CommonNames::Title))->string());
 
 
     if (outline_item_dict->contains(CommonNames::Count))
     if (outline_item_dict->contains(CommonNames::Count))
         outline_item->count = outline_item_dict->get_value(CommonNames::Count).get<int>();
         outline_item->count = outline_item_dict->get_value(CommonNames::Count).get<int>();

+ 8 - 1
Userland/Libraries/LibPDF/Document.h

@@ -39,7 +39,7 @@ struct Destination {
 struct OutlineItem final : public RefCounted<OutlineItem> {
 struct OutlineItem final : public RefCounted<OutlineItem> {
     RefPtr<OutlineItem> parent;
     RefPtr<OutlineItem> parent;
     Vector<NonnullRefPtr<OutlineItem>> children;
     Vector<NonnullRefPtr<OutlineItem>> children;
-    DeprecatedString title;
+    DeprecatedString title; // Already converted to UTF-8.
     i32 count { 0 };
     i32 count { 0 };
     Destination dest;
     Destination dest;
     Gfx::Color color { Color::NamedColor::Black }; // 'C' in the PDF spec
     Gfx::Color color { Color::NamedColor::Black }; // 'C' in the PDF spec
@@ -66,6 +66,8 @@ public:
     {
     {
     }
     }
 
 
+    // These all return strings that are already converted to UTF-8.
+
     PDFErrorOr<Optional<DeprecatedString>> title() const;
     PDFErrorOr<Optional<DeprecatedString>> title() const;
     PDFErrorOr<Optional<DeprecatedString>> author() const;
     PDFErrorOr<Optional<DeprecatedString>> author() const;
     PDFErrorOr<Optional<DeprecatedString>> subject() const;
     PDFErrorOr<Optional<DeprecatedString>> subject() const;
@@ -89,6 +91,8 @@ private:
         return TRY(m_info_dict->get_string(m_document, name))->string();
         return TRY(m_info_dict->get_string(m_document, name))->string();
     }
     }
 
 
+    PDFErrorOr<Optional<DeprecatedString>> get_text(DeprecatedFlyString const& name) const;
+
     WeakPtr<Document> m_document;
     WeakPtr<Document> m_document;
     NonnullRefPtr<DictObject> m_info_dict;
     NonnullRefPtr<DictObject> m_info_dict;
 };
 };
@@ -97,6 +101,9 @@ class Document final
     : public RefCounted<Document>
     : public RefCounted<Document>
     , public Weakable<Document> {
     , public Weakable<Document> {
 public:
 public:
+    // Converts a text string (PDF 1.7 spec, 3.8.1. "String Types") to UTF-8.
+    static DeprecatedString text_string_to_utf8(DeprecatedString const&);
+
     static PDFErrorOr<NonnullRefPtr<Document>> create(ReadonlyBytes bytes);
     static PDFErrorOr<NonnullRefPtr<Document>> create(ReadonlyBytes bytes);
 
 
     // If a security handler is present, it is the caller's responsibility to ensure
     // If a security handler is present, it is the caller's responsibility to ensure

+ 0 - 12
Userland/Libraries/LibPDF/Parser.cpp

@@ -9,7 +9,6 @@
 #include <LibPDF/Document.h>
 #include <LibPDF/Document.h>
 #include <LibPDF/Filter.h>
 #include <LibPDF/Filter.h>
 #include <LibPDF/Parser.h>
 #include <LibPDF/Parser.h>
-#include <LibTextCodec/Decoder.h>
 #include <ctype.h>
 #include <ctype.h>
 
 
 namespace PDF {
 namespace PDF {
@@ -262,17 +261,6 @@ PDFErrorOr<NonnullRefPtr<StringObject>> Parser::parse_string()
     if (m_document->security_handler() && m_enable_encryption)
     if (m_document->security_handler() && m_enable_encryption)
         m_document->security_handler()->decrypt(string_object, m_current_reference_stack.last());
         m_document->security_handler()->decrypt(string_object, m_current_reference_stack.last());
 
 
-    auto unencrypted_string = string_object->string();
-
-    if (unencrypted_string.bytes().starts_with(Array<u8, 2> { 0xfe, 0xff })) {
-        // The string is encoded in UTF16-BE
-        string_object->set_string(TextCodec::decoder_for("utf-16be"sv)->to_utf8(unencrypted_string).release_value_but_fixme_should_propagate_errors().to_deprecated_string());
-    } else if (unencrypted_string.bytes().starts_with(Array<u8, 3> { 239, 187, 191 })) {
-        // The string is encoded in UTF-8. This is the default anyways, but if these bytes
-        // are explicitly included, we have to trim them
-        string_object->set_string(unencrypted_string.substring(3));
-    }
-
     return string_object;
     return string_object;
 }
 }