From a76fd2d8f927171fddb563e879875f0a68d2667f Mon Sep 17 00:00:00 2001
From: Daoud Clarke
Date: Tue, 7 Dec 2021 22:56:46 +0000
Subject: [PATCH] Use multiprocessing

---
 extract_local.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/extract_local.py b/extract_local.py
index 5064dbd..33de505 100644
--- a/extract_local.py
+++ b/extract_local.py
@@ -1,5 +1,6 @@
 import gzip
 import json
+import multiprocessing
 import os
 from glob import glob
 from itertools import islice
@@ -20,9 +21,16 @@ def get_records():
         yield json.loads(line)
 
 
+def process(record):
+    return list(fetch_process_warc_records([record]))
+
+
 def run():
-    records = get_records()
-    processed = fetch_process_warc_records(islice(records, 50))
+    records = islice(get_records(), 1000)
+
+    with multiprocessing.Pool(20) as pool:
+        processed = pool.map(process, records)
+
     with gzip.open(EXTRACTS_PATH / 'data.json.gz', 'wt') as output_file:
         for row in processed:
             output_file.write(json.dumps(row) + '\n')
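
The patch swaps the sequential call to fetch_process_warc_records for a 20-worker
process pool mapped over the first 1000 records. Below is a minimal, self-contained
sketch of that pattern (multiprocessing.Pool.map over an islice of a generator);
the record source and the worker body are placeholders for illustration, not the
repository's fetch_process_warc_records pipeline.

    import json
    import multiprocessing
    from itertools import islice


    def get_records():
        # Placeholder generator; in extract_local.py this yields JSON records
        # parsed from gzipped input files.
        for i in range(10_000):
            yield {'id': i}


    def process(record):
        # Placeholder worker; in the patch, each record is wrapped in a
        # single-item list and run through the real extraction pipeline.
        return {'id': record['id'], 'doubled': record['id'] * 2}


    def run():
        # Take the first 1000 records and fan them out to 20 worker processes,
        # mirroring the structure the patch introduces.
        records = islice(get_records(), 1000)
        with multiprocessing.Pool(20) as pool:
            processed = pool.map(process, records)
        for row in processed[:3]:
            print(json.dumps(row))


    if __name__ == '__main__':
        run()

One consequence worth noting: because process() wraps each record and returns
list(fetch_process_warc_records([record])), each element of processed after the
patch is itself a list, so each line written to data.json.gz is a JSON array
rather than a single record as before.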