test_pdf.py: Output tweaks

* Count files that are password-protected
* Use `pdf --json` to group by missing feature and then by file
  instead of by file first, feature second
* Count files that render with issues
* Print number of files without issues last
* Always print all crash stacks
This commit is contained in:
Nico Weber 2023-11-09 22:02:54 +01:00 committed by Andreas Kling
parent ed7549c64f
commit 8f4966fc5c
Notes: sideshowbarker 2024-07-17 05:02:42 +09:00

View file

@ -14,7 +14,9 @@ One of those zip files in unzipped makes for a good input folder.
import argparse import argparse
import collections import collections
import dataclasses
import glob import glob
import json
import multiprocessing import multiprocessing
import os import os
import random import random
@ -26,6 +28,14 @@ Result = collections.namedtuple(
'Result', ['filename', 'returncode', 'stdout', 'stderr']) 'Result', ['filename', 'returncode', 'stdout', 'stderr'])
@dataclasses.dataclass
class Issues:
    """Aggregated occurrences of a single rendering issue across all files.

    One instance is kept per distinct issue string; the caller fills it in
    while walking the per-file JSON results.
    """

    # Names of the files in which this issue was reported, in encounter order.
    filenames: list = dataclasses.field(default_factory=list)
    # Maps a filename to that file's entries for this issue. Each entry is
    # unpacked by the reporting code as a two-element (page, count) pair.
    filename_to_issues: dict = dataclasses.field(default_factory=dict)
    # Running total of per-file entries, printed as "on N pages".
    num_pages: int = 0
    # Running total accumulated by the caller, printed as "N times".
    count: int = 0
def elide_aslr(s):
    """Return the bytes *s* with every hexadecimal address censored.

    Each word-delimited token of the form ``0x`` followed by one or more
    lowercase hex digits is replaced with the fixed placeholder
    ``0xc3ns0r3d``. The crash output is used as a grouping key, so stacks
    that differ only in their addresses must compare equal.
    """
    return re.sub(rb'\b0x[0-9a-f]+\b', b'0xc3ns0r3d', s)
@ -36,7 +46,7 @@ def elide_parser_offset(s):
def test_pdf(filename):
    """Run the Lagom `pdf` tool on *filename* and capture the outcome.

    The tool is looked up relative to this script's directory and invoked
    with `--debugging-stats --json`; the caller parses stdout as JSON.

    Returns a `Result` holding the filename, the process return code, the
    raw stdout bytes, and stderr after it has been passed through
    `elide_aslr` and `elide_parser_offset`.
    """
    pdf_path = os.path.join(os.path.dirname(__file__), '../Build/lagom/bin/pdf')
    r = subprocess.run([pdf_path, '--debugging-stats', '--json', filename],
                       capture_output=True)
    return Result(filename, r.returncode, r.stdout,
                  elide_parser_offset(elide_aslr(r.stderr)))
@ -60,15 +70,29 @@ def main():
results = multiprocessing.Pool().map(test_pdf, files) results = multiprocessing.Pool().map(test_pdf, files)
num_files_without_issues = 0 num_files_without_issues = 0
num_files_with_password = 0
num_files_with_issues = 0
failed_files = [] failed_files = []
num_crashes = 0 num_crashes = 0
stack_to_files = {} stack_to_files = {}
issues = {}
for r in results: for r in results:
print(r.filename)
print(r.stdout.decode('utf-8'))
if r.returncode == 0: if r.returncode == 0:
if b'no issues found' in r.stdout: if b'PDF requires password' in r.stderr:
num_files_with_password += 1
continue
j = json.loads(r.stdout.decode('utf-8'))
if not j['issues']:
num_files_without_issues += 1 num_files_without_issues += 1
else:
num_files_with_issues += 1
for diag in j['issues']:
issue = issues.setdefault(diag, Issues([], {}, 0, 0))
issue.filenames.append(r.filename)
issue.filename_to_issues[r.filename] = j['issues'][diag]
issue.num_pages += len(j['issues'][diag])
issue.count += sum(a * b for (a, b) in j['issues'][diag])
continue continue
if r.returncode == 1: if r.returncode == 1:
failed_files.append(r.filename) failed_files.append(r.filename)
@ -76,10 +100,26 @@ def main():
num_crashes += 1 num_crashes += 1
stack_to_files.setdefault(r.stderr, []).append(r.filename) stack_to_files.setdefault(r.stderr, []).append(r.filename)
print('Top 5 crashiest stacks') percent = 100 * num_files_with_issues / len(results)
print(f'{len(issues)} distinct issues, in {num_files_with_issues} files ({percent}%):')
issue_keys = list(issues.keys())
issue_keys.sort(reverse=True, key=lambda x: len(issues[x].filenames))
for issue_key in issue_keys:
issue = issues[issue_key]
print(issue_key, end='')
print(f', in {len(issue.filenames)} files, on {issue.num_pages} pages, {issue.count} times')
filenames = sorted(issue.filenames, reverse=True, key=lambda x: len(issue.filename_to_issues[x]))
for filename in filenames:
page_counts = issue.filename_to_issues[filename]
page_counts = ' '.join([f'{page} ({count}x)' if count > 1 else f'{page}' for (page, count) in page_counts])
print(f' {filename} {page_counts}')
print()
print()
print('Stacks:')
keys = list(stack_to_files.keys()) keys = list(stack_to_files.keys())
keys.sort(key=lambda x: len(stack_to_files[x]), reverse=True) keys.sort(key=lambda x: len(stack_to_files[x]), reverse=True)
for stack in reversed(keys[:5]): for stack in reversed(keys):
files = stack_to_files[stack] files = stack_to_files[stack]
print(stack.decode('utf-8', 'backslashreplace'), end='') print(stack.decode('utf-8', 'backslashreplace'), end='')
print(f'In {len(files)} files:') print(f'In {len(files)} files:')
@ -87,10 +127,6 @@ def main():
print(f' {file}') print(f' {file}')
print() print()
percent = 100 * num_files_without_issues / len(results)
print(f'{num_files_without_issues} files without issues ({percent:.1f}%)')
print()
percent = 100 * num_crashes / len(results) percent = 100 * num_crashes / len(results)
print(f'{num_crashes} crashes ({percent:.1f}%)') print(f'{num_crashes} crashes ({percent:.1f}%)')
print(f'{len(keys)} distinct crash stacks') print(f'{len(keys)} distinct crash stacks')
@ -100,6 +136,15 @@ def main():
print(f'{len(failed_files)} failed to open ({percent:.1f}%)') print(f'{len(failed_files)} failed to open ({percent:.1f}%)')
for f in failed_files: for f in failed_files:
print(f' {f}') print(f' {f}')
print()
percent = 100 * num_files_with_password / len(results)
print(f'{num_files_with_password} files with password ({percent:.1f}%)')
print()
percent = 100 * num_files_without_issues / len(results)
print(f'{num_files_without_issues} files without issues ({percent:.1f}%)')
print()
if __name__ == '__main__': if __name__ == '__main__':