diff --git a/Meta/test_pdf.py b/Meta/test_pdf.py index c33fdc454cc..45df83249d8 100755 --- a/Meta/test_pdf.py +++ b/Meta/test_pdf.py @@ -14,7 +14,9 @@ One of those zip files in unzipped makes for a good input folder. import argparse import collections +import dataclasses import glob +import json import multiprocessing import os import random @@ -26,6 +28,14 @@ Result = collections.namedtuple( 'Result', ['filename', 'returncode', 'stdout', 'stderr']) +@dataclasses.dataclass +class Issues: + filenames: [str] + filename_to_issues: {str: [int]} + num_pages: int + count: int + + def elide_aslr(s): return re.sub(rb'\b0x[0-9a-f]+\b', b'0xc3ns0r3d', s) @@ -36,7 +46,7 @@ def elide_parser_offset(s): def test_pdf(filename): pdf_path = os.path.join(os.path.dirname(__file__), '../Build/lagom/bin/pdf') - r = subprocess.run([pdf_path, '--debugging-stats', filename], + r = subprocess.run([pdf_path, '--debugging-stats', '--json', filename], capture_output=True) return Result(filename, r.returncode, r.stdout, elide_parser_offset(elide_aslr(r.stderr))) @@ -60,15 +70,29 @@ def main(): results = multiprocessing.Pool().map(test_pdf, files) num_files_without_issues = 0 + num_files_with_password = 0 + num_files_with_issues = 0 failed_files = [] num_crashes = 0 stack_to_files = {} + issues = {} for r in results: - print(r.filename) - print(r.stdout.decode('utf-8')) if r.returncode == 0: - if b'no issues found' in r.stdout: + if b'PDF requires password' in r.stderr: + num_files_with_password += 1 + continue + + j = json.loads(r.stdout.decode('utf-8')) + if not j['issues']: num_files_without_issues += 1 + else: + num_files_with_issues += 1 + for diag in j['issues']: + issue = issues.setdefault(diag, Issues([], {}, 0, 0)) + issue.filenames.append(r.filename) + issue.filename_to_issues[r.filename] = j['issues'][diag] + issue.num_pages += len(j['issues'][diag]) + issue.count += sum(a * b for (a, b) in j['issues'][diag]) continue if r.returncode == 1: failed_files.append(r.filename) @@ -76,10 +100,26 @@ def main(): num_crashes += 1 stack_to_files.setdefault(r.stderr, []).append(r.filename) - print('Top 5 crashiest stacks') + percent = 100 * num_files_with_issues / len(results) + print(f'{len(issues)} distinct issues, in {num_files_with_issues} files ({percent}%):') + issue_keys = list(issues.keys()) + issue_keys.sort(reverse=True, key=lambda x: len(issues[x].filenames)) + for issue_key in issue_keys: + issue = issues[issue_key] + print(issue_key, end='') + print(f', in {len(issue.filenames)} files, on {issue.num_pages} pages, {issue.count} times') + filenames = sorted(issue.filenames, reverse=True, key=lambda x: len(issue.filename_to_issues[x])) + for filename in filenames: + page_counts = issue.filename_to_issues[filename] + page_counts = ' '.join([f'{page} ({count}x)' if count > 1 else f'{page}' for (page, count) in page_counts]) + print(f' {filename} {page_counts}') + print() + print() + + print('Stacks:') keys = list(stack_to_files.keys()) keys.sort(key=lambda x: len(stack_to_files[x]), reverse=True) - for stack in reversed(keys[:5]): + for stack in reversed(keys): files = stack_to_files[stack] print(stack.decode('utf-8', 'backslashreplace'), end='') print(f'In {len(files)} files:') @@ -87,10 +127,6 @@ def main(): print(f' {file}') print() - percent = 100 * num_files_without_issues / len(results) - print(f'{num_files_without_issues} files without issues ({percent:.1f}%)') - print() - percent = 100 * num_crashes / len(results) print(f'{num_crashes} crashes ({percent:.1f}%)') print(f'{len(keys)} distinct crash stacks') @@ -100,6 +136,15 @@ def main(): print(f'{len(failed_files)} failed to open ({percent:.1f}%)') for f in failed_files: print(f' {f}') + print() + + percent = 100 * num_files_with_password / len(results) + print(f'{num_files_with_password} files with password ({percent:.1f}%)') + print() + + percent = 100 * num_files_without_issues / len(results) + print(f'{num_files_without_issues} files without issues ({percent:.1f}%)') + print() if __name__ == '__main__':