2023-10-20 14:00:27 +00:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
|
|
|
'''Runs `pdf --debugging-stats` on a bunch of PDF files in parallel.
|
|
|
|
|
|
|
|
Give it one or more folders containing PDF files, and the optional -n flag
|
|
|
|
to pick a random subset of n PDFs:
|
|
|
|
|
|
|
|
test_pdf.py -n 200 ~/Downloads/0000 ~/src/pdffiles
|
|
|
|
|
|
|
|
https://pdfa.org/new-large-scale-pdf-corpus-now-publicly-available/ has
|
|
|
|
8 TB of test PDFs, organized in a bunch of zip files with 1000 PDFs each.
|
|
|
|
One of those zip files, unzipped, makes for a good input folder.
|
|
|
|
'''
|
|
|
|
|
|
|
|
import argparse
|
|
|
|
import collections
|
|
|
|
import glob
|
|
|
|
import multiprocessing
|
|
|
|
import os
|
|
|
|
import random
|
|
|
|
import re
|
|
|
|
import subprocess
|
|
|
|
|
|
|
|
|
|
|
|
# Outcome of one `pdf` invocation: the input path, the process exit status,
# and the captured stdout / stderr byte strings.
Result = collections.namedtuple(
    'Result', 'filename returncode stdout stderr')
|
|
|
|
|
|
|
|
|
|
|
|
def elide_aslr(s):
    '''Return the bytes `s` with every lowercase `0x...` hex literal masked.

    Each whole-word match of `0x` followed by lowercase hex digits is
    replaced by the fixed placeholder `0xc3ns0r3d`, so outputs that differ
    only in such numbers (e.g. addresses in a stack trace) become identical.
    '''
    hex_literal = re.compile(rb'\b0x[0-9a-f]+\b')
    return hex_literal.sub(b'0xc3ns0r3d', s)
|
|
|
|
|
|
|
|
|
2023-10-21 15:49:39 +00:00
|
|
|
def elide_parser_offset(s):
    '''Return the bytes `s` with the offset removed from parser error lines.

    Every `Parser error at offset <digits>:` prefix is shortened to
    `Parser error:`, so messages that differ only in the offset compare
    equal.
    '''
    offset_prefix = re.compile(rb'\bParser error at offset [0-9]+:')
    return offset_prefix.sub(b'Parser error:', s)
|
|
|
|
|
|
|
|
|
2023-10-20 14:00:27 +00:00
|
|
|
def test_pdf(filename):
    '''Run `pdf --debugging-stats` on `filename` and return a `Result`.

    The `pdf` binary is looked up at `../Build/lagom/bin/pdf` relative to
    this script's directory. stdout is returned as captured; stderr is
    passed through `elide_aslr` and `elide_parser_offset` first, so that
    it can be compared across runs and files.
    '''
    script_dir = os.path.dirname(__file__)
    pdf_binary = os.path.join(script_dir, '../Build/lagom/bin/pdf')
    command = [pdf_binary, '--debugging-stats', filename]
    completed = subprocess.run(command, capture_output=True)

    normalized_stderr = elide_parser_offset(elide_aslr(completed.stderr))
    return Result(
        filename=filename,
        returncode=completed.returncode,
        stdout=completed.stdout,
        stderr=normalized_stderr,
    )
|
2023-10-20 14:00:27 +00:00
|
|
|
|
|
|
|
|
|
|
|
def main():
    '''Run `test_pdf` over the PDFs in the given folders and print a report.

    Command line: one or more input directories, plus an optional `-n` to
    test a random subset of at most n of the `*.pdf` files found in them.

    For every file the filename and the tool's stdout are printed. Results
    are then classified by exit status:

    * 0: the tool ran; counted as "without issues" if its stdout contains
      `no issues found`.
    * 1: reported as "failed to open".
    * anything else: counted as a crash and grouped by (normalized) stderr.

    The report lists the five most common crash outputs, followed by the
    totals and percentages for each category.
    '''
    parser = argparse.ArgumentParser(
        epilog=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
    )
    parser.add_argument('input', nargs='+', help='input directories')
    parser.add_argument('-n', type=int, help='render at most n pdfs')
    args = parser.parse_args()
    if args.n is not None and args.n < 0:
        # random.sample() would otherwise fail with a bare ValueError.
        parser.error('-n must not be negative')

    files = []
    for input_directory in args.input:
        files += glob.glob(os.path.join(input_directory, '*.pdf'))
    if args.n is not None:
        # glob returns files in filesystem order, so sort first; otherwise
        # the fixed seed would not select the same subset on every run.
        files.sort()
        random.seed(42)
        # "At most n": clamp so that asking for more files than exist
        # tests all of them instead of raising ValueError.
        files = random.sample(files, k=min(args.n, len(files)))

    if not files:
        # Nothing to run; also avoids dividing by zero in the summary.
        print('No PDF files to test.')
        return

    # The context manager shuts the worker processes down once map() has
    # returned all results.
    with multiprocessing.Pool() as pool:
        results = pool.map(test_pdf, files)
    total = len(results)

    num_files_without_issues = 0
    failed_files = []
    num_crashes = 0
    stack_to_files = {}
    for r in results:
        print(r.filename)
        # The tool's output is not guaranteed to be valid UTF-8; escape
        # undecodable bytes rather than aborting the whole report.
        print(r.stdout.decode('utf-8', 'backslashreplace'))
        if r.returncode == 0:
            if b'no issues found' in r.stdout:
                num_files_without_issues += 1
            continue
        if r.returncode == 1:
            failed_files.append(r.filename)
        else:
            num_crashes += 1
            stack_to_files.setdefault(r.stderr, []).append(r.filename)

    print('Top 5 crashiest stacks')
    keys = sorted(stack_to_files,
                  key=lambda x: len(stack_to_files[x]), reverse=True)
    # Print the top five in ascending order so that the most frequent
    # stack is printed last, right above the summary.
    for stack in reversed(keys[:5]):
        files = stack_to_files[stack]
        print(stack.decode('utf-8', 'backslashreplace'), end='')
        print(f'In {len(files)} files:')
        for file in files:
            print(f'    {file}')
        print()

    percent = 100 * num_files_without_issues / total
    print(f'{num_files_without_issues} files without issues ({percent:.1f}%)')
    print()

    percent = 100 * num_crashes / total
    print(f'{num_crashes} crashes ({percent:.1f}%)')
    print(f'{len(keys)} distinct crash stacks')

    percent = 100 * len(failed_files) / total
    print()
    print(f'{len(failed_files)} failed to open ({percent:.1f}%)')
    for f in failed_files:
        print(f'    {f}')
|
|
|
|
|
2023-10-20 14:00:27 +00:00
|
|
|
|
|
|
|
# The guard keeps main() from running when this module is imported rather
# than executed, e.g. by multiprocessing workers that re-import it.
if __name__ == '__main__':
    main()
|