LibPDF: Implement PNG predictor decoding for flate filter

For Flate and LZW filters, the data can be transformed by a predictor
function before compression to make it compress better. For us this means
that we have to undo this step after decompressing in order to get the
right result.

Although this feature is meant for images, I found at least a few
documents that use it all over the place, making this step very
important.
This commit is contained in:
Julian Offenhäuser 2022-11-10 23:32:41 +01:00 committed by Andreas Kling
parent 4bd79338e8
commit c2ad29c85f
Notes: sideshowbarker 2024-07-17 04:19:25 +09:00
4 changed files with 137 additions and 15 deletions

View file

@ -18,6 +18,7 @@
A(BM) \
A(BaseEncoding) \
A(BaseFont) \
A(BitsPerComponent) \
A(BlackPoint) \
A(C) \
A(CA) \
@ -25,7 +26,9 @@
A(CalRGB) \
A(CIDSystemInfo) \
A(CIDToGIDMap) \
A(Colors) \
A(ColorSpace) \
A(Columns) \
A(Contents) \
A(Count) \
A(CropBox) \
@ -33,6 +36,7 @@
A(D) \
A(DW) \
A(DCTDecode) \
A(DecodeParms) \
A(DescendantFonts) \
A(Dest) \
A(Dests) \
@ -101,6 +105,7 @@
A(Pages) \
A(Parent) \
A(Pattern) \
A(Predictor) \
A(Prev) \
A(R) \
A(RI) \

View file

@ -12,8 +12,24 @@
namespace PDF {
ErrorOr<ByteBuffer> Filter::decode(ReadonlyBytes bytes, FlyString const& encoding_type)
ErrorOr<ByteBuffer> Filter::decode(ReadonlyBytes bytes, FlyString const& encoding_type, RefPtr<DictObject> decode_parms)
{
int predictor = 1;
int columns = 1;
int colors = 1;
int bits_per_component = 8;
if (decode_parms) {
if (decode_parms->contains(CommonNames::Predictor))
predictor = decode_parms->get_value(CommonNames::Predictor).get<int>();
if (decode_parms->contains(CommonNames::Columns))
columns = decode_parms->get_value(CommonNames::Columns).get<int>();
if (decode_parms->contains(CommonNames::Colors))
colors = decode_parms->get_value(CommonNames::Colors).get<int>();
if (decode_parms->contains(CommonNames::BitsPerComponent))
bits_per_component = decode_parms->get_value(CommonNames::BitsPerComponent).get<int>();
}
if (encoding_type == CommonNames::ASCIIHexDecode)
return decode_ascii_hex(bytes);
if (encoding_type == CommonNames::ASCII85Decode)
@ -21,7 +37,7 @@ ErrorOr<ByteBuffer> Filter::decode(ReadonlyBytes bytes, FlyString const& encodin
if (encoding_type == CommonNames::LZWDecode)
return decode_lzw(bytes);
if (encoding_type == CommonNames::FlateDecode)
return decode_flate(bytes);
return decode_flate(bytes, predictor, columns, colors, bits_per_component);
if (encoding_type == CommonNames::RunLengthDecode)
return decode_run_length(bytes);
if (encoding_type == CommonNames::CCITTFaxDecode)
@ -35,7 +51,7 @@ ErrorOr<ByteBuffer> Filter::decode(ReadonlyBytes bytes, FlyString const& encodin
if (encoding_type == CommonNames::Crypt)
return decode_crypt(bytes);
return Error::from_string_literal("Unrecognized filter encoding");
return AK::Error::from_string_literal("Unrecognized filter encoding");
}
ErrorOr<ByteBuffer> Filter::decode_ascii_hex(ReadonlyBytes bytes)
@ -50,11 +66,11 @@ ErrorOr<ByteBuffer> Filter::decode_ascii_hex(ReadonlyBytes bytes)
for (size_t i = 0; i < bytes.size() / 2; ++i) {
auto const c1 = decode_hex_digit(static_cast<char>(bytes[i * 2]));
if (c1 >= 16)
return Error::from_string_literal("Hex string contains invalid digit");
return AK::Error::from_string_literal("Hex string contains invalid digit");
auto const c2 = decode_hex_digit(static_cast<char>(bytes[i * 2 + 1]));
if (c2 >= 16)
return Error::from_string_literal("Hex string contains invalid digit");
return AK::Error::from_string_literal("Hex string contains invalid digit");
output[i] = (c1 << 4) + c2;
}
@ -120,20 +136,95 @@ ErrorOr<ByteBuffer> Filter::decode_ascii85(ReadonlyBytes bytes)
return ByteBuffer::copy(buff.span());
};
// Undoes PNG-style row prediction on `bytes`.
//
// `bytes` is treated as consecutive rows of `bytes_per_row` bytes each. The first
// byte of every row is an algorithm tag (0-4), the remaining bytes are the filtered
// data. Rows are reconstructed in place (so `bytes` is modified), and the data bytes
// of every row, without the tag, are appended to the returned buffer.
//
// Returns an error if `bytes_per_row` is not positive or a row carries an unknown tag.
//
// NOTE(review): PNG defines the "left" neighbour as the byte one whole pixel earlier.
// This function is not told the pixel size and always looks one byte back, which
// matches the PNG definition only for pixels of at most one byte -- confirm whether
// callers need wider pixels.
ErrorOr<ByteBuffer> Filter::decode_png_prediction(Bytes bytes, int bytes_per_row)
{
    // Guard the division below and the row walk against a nonsensical row size.
    if (bytes_per_row <= 0)
        return AK::Error::from_string_literal("Invalid PNG predictor row size");

    int number_of_rows = bytes.size() / bytes_per_row;

    // Every row loses its tag byte in the output.
    ByteBuffer decoded;
    decoded.ensure_capacity(bytes.size() - number_of_rows);

    // The row "above" the first row is defined to be all zeroes.
    auto empty_row = TRY(ByteBuffer::create_zeroed(bytes_per_row));
    auto previous_row = empty_row.data();

    for (int row_index = 0; row_index < number_of_rows; ++row_index) {
        auto row = bytes.data() + row_index * bytes_per_row;

        u8 algorithm_tag = row[0];
        switch (algorithm_tag) {
        case 0:
            // None: the row is stored as-is.
            break;
        case 1:
            // Sub: each byte is stored relative to its left neighbour.
            // The first data byte (index 1) has no left neighbour and stays unchanged.
            for (int i = 2; i < bytes_per_row; ++i)
                row[i] += row[i - 1];
            break;
        case 2:
            // Up: each byte is stored relative to the byte above it.
            for (int i = 1; i < bytes_per_row; ++i)
                row[i] += previous_row[i];
            break;
        case 3:
            // Average: each byte is stored relative to the mean of left and above.
            for (int i = 1; i < bytes_per_row; ++i) {
                u8 left = 0;
                if (i > 1)
                    left = row[i - 1];
                u8 above = previous_row[i];
                row[i] += (left + above) / 2;
            }
            break;
        case 4:
            // Paeth: each byte is stored relative to whichever of left, above and
            // upper-left is closest to the estimate (left + above - upper_left).
            for (int i = 1; i < bytes_per_row; ++i) {
                u8 left = 0;
                u8 upper_left = 0;
                if (i > 1) {
                    left = row[i - 1];
                    upper_left = previous_row[i - 1];
                }
                u8 above = previous_row[i];

                // The estimate can leave the 0-255 range, so it must not be truncated to u8.
                int estimate = left + above - upper_left;
                int left_distance = abs(estimate - left);
                int above_distance = abs(estimate - above);
                int upper_left_distance = abs(estimate - upper_left);

                // Ties are broken in the order left, above, upper-left.
                u8 paeth = upper_left;
                if (left_distance <= above_distance && left_distance <= upper_left_distance)
                    paeth = left;
                else if (above_distance <= upper_left_distance)
                    paeth = above;

                row[i] += paeth;
            }
            break;
        default:
            return AK::Error::from_string_literal("Unknown PNG algorithm tag");
        }

        // The reconstructed row becomes the reference for the next one.
        previous_row = row;
        decoded.append(row + 1, bytes_per_row - 1);
    }

    return decoded;
}
// LZW decoding is not implemented.
//
// Always logs a debug message and returns an error, so that a document using
// LZWDecode is reported as a decoding failure to the caller instead of taking
// down the whole process.
ErrorOr<ByteBuffer> Filter::decode_lzw(ReadonlyBytes)
{
    dbgln("LZW decoding is not supported");
    return AK::Error::from_string_literal("LZW decoding is not supported");
}
// Decompresses Flate-encoded stream data and, if requested, undoes the predictor
// transformation that was applied before compression.
//
// `predictor` selects the transformation: 1 means none, 2 (TIFF) is rejected as
// unsupported, 10-15 select PNG prediction, anything else is an error.
// `columns`, `colors` and `bits_per_component` describe the length of one row and
// are only used when PNG prediction is active.
//
// Returns the decoded data, or an error if the input is too short, cannot be
// decompressed, or does not match the given predictor parameters.
ErrorOr<ByteBuffer> Filter::decode_flate(ReadonlyBytes bytes, int predictor, int columns, int colors, int bits_per_component)
{
    // FIXME: The spec says Flate decoding is "based on" zlib, does that mean they
    //        aren't exactly the same?

    // The first two bytes are skipped below (presumably the zlib stream header),
    // so make sure they exist before slicing.
    if (bytes.size() < 2)
        return AK::Error::from_string_literal("Flate input data is too short");

    auto maybe_buff = Compress::DeflateDecompressor::decompress_all(bytes.slice(2));
    if (!maybe_buff.has_value())
        return AK::Error::from_string_literal("Failed to decompress flate data");
    auto buff = maybe_buff.release_value();

    if (predictor == 1)
        return buff;

    // Check if we are dealing with a PNG prediction
    if (predictor == 2)
        return AK::Error::from_string_literal("The TIFF predictor is not supported");
    if (predictor < 10 || predictor > 15)
        return AK::Error::from_string_literal("Invalid predictor value");

    // These values come straight from the document, so they cannot be trusted.
    if (columns <= 0 || colors <= 0 || bits_per_component <= 0 || bits_per_component > 16)
        return AK::Error::from_string_literal("Invalid predictor parameters");

    // No data means no rows to reconstruct.
    if (buff.is_empty())
        return buff;

    // Do the row size arithmetic in 64 bits. Two positive ints cannot overflow a u64
    // when multiplied, and a row that needs more samples than the data has bits can
    // never divide the data evenly, which also bounds the next multiplication.
    u64 samples_per_row = static_cast<u64>(columns) * static_cast<u64>(colors);
    if (samples_per_row > static_cast<u64>(buff.size()) * 8)
        return AK::Error::from_string_literal("Flate input data is not divisible into columns");

    // Rows are always a whole number of bytes long, starting with an algorithm tag
    u64 bytes_per_row = (samples_per_row * static_cast<u64>(bits_per_component) + 7) / 8 + 1;

    // decode_png_prediction() takes the row size as an int (assumed to be 32 bits wide).
    if (bytes_per_row > 0x7fffffff)
        return AK::Error::from_string_literal("Flate input data is not divisible into columns");

    if (buff.size() % bytes_per_row)
        return AK::Error::from_string_literal("Flate input data is not divisible into columns");

    return decode_png_prediction(buff, static_cast<int>(bytes_per_row));
}
ErrorOr<ByteBuffer> Filter::decode_run_length(ReadonlyBytes)

View file

@ -9,18 +9,20 @@
#include <AK/ByteBuffer.h>
#include <AK/Error.h>
#include <AK/FlyString.h>
#include <LibPDF/ObjectDerivatives.h>
namespace PDF {
class Filter {
public:
static ErrorOr<ByteBuffer> decode(ReadonlyBytes bytes, FlyString const& encoding_type);
static ErrorOr<ByteBuffer> decode(ReadonlyBytes bytes, FlyString const& encoding_type, RefPtr<DictObject> decode_parms);
private:
static ErrorOr<ByteBuffer> decode_ascii_hex(ReadonlyBytes bytes);
static ErrorOr<ByteBuffer> decode_ascii85(ReadonlyBytes bytes);
static ErrorOr<ByteBuffer> decode_png_prediction(Bytes bytes, int bytes_per_row);
static ErrorOr<ByteBuffer> decode_lzw(ReadonlyBytes bytes);
static ErrorOr<ByteBuffer> decode_flate(ReadonlyBytes bytes);
static ErrorOr<ByteBuffer> decode_flate(ReadonlyBytes bytes, int predictor, int columns, int colors, int bits_per_component);
static ErrorOr<ByteBuffer> decode_run_length(ReadonlyBytes bytes);
static ErrorOr<ByteBuffer> decode_ccitt(ReadonlyBytes bytes);
static ErrorOr<ByteBuffer> decode_jbig2(ReadonlyBytes bytes);

View file

@ -488,8 +488,32 @@ PDFErrorOr<NonnullRefPtr<StreamObject>> Parser::parse_stream(NonnullRefPtr<DictO
filters.append(filter_object->cast<NameObject>()->name());
}
for (auto const& filter_type : filters)
stream_object->buffer() = TRY(Filter::decode(stream_object->bytes(), filter_type));
// Every filter may get its own parameter dictionary
Vector<RefPtr<DictObject>> decode_parms_vector;
RefPtr<Object> decode_parms_object;
if (dict->contains(CommonNames::DecodeParms)) {
decode_parms_object = TRY(dict->get_object(m_document, CommonNames::DecodeParms));
if (decode_parms_object->is<ArrayObject>()) {
auto decode_parms_array = decode_parms_object->cast<ArrayObject>();
for (size_t i = 0; i < decode_parms_array->size(); ++i) {
// FIXME: This entry may be the null object instead
RefPtr<DictObject> decode_parms = decode_parms_array->at(i).get<NonnullRefPtr<Object>>()->cast<DictObject>();
decode_parms_vector.append(decode_parms);
}
} else {
decode_parms_vector.append(decode_parms_object->cast<DictObject>());
}
}
VERIFY(decode_parms_vector.is_empty() || decode_parms_vector.size() == filters.size());
for (size_t i = 0; i < filters.size(); ++i) {
RefPtr<DictObject> decode_parms;
if (!decode_parms_vector.is_empty())
decode_parms = decode_parms_vector.at(i);
stream_object->buffer() = TRY(Filter::decode(stream_object->bytes(), filters.at(i), decode_parms));
}
}
return stream_object;