Use multiprocessing

Daoud Clarke 2021-12-07 22:56:46 +00:00
parent 2d554b14e7
commit a76fd2d8f9


@@ -1,5 +1,6 @@
 import gzip
 import json
+import multiprocessing
 import os
 from glob import glob
 from itertools import islice
@@ -20,9 +21,16 @@ def get_records():
             yield json.loads(line)
 
+
+def process(record):
+    return list(fetch_process_warc_records([record]))
+
+
 def run():
-    records = get_records()
-    processed = fetch_process_warc_records(islice(records, 50))
+    records = islice(get_records(), 1000)
+    with multiprocessing.Pool(20) as pool:
+        processed = pool.map(process, records)
+
 
     with gzip.open(EXTRACTS_PATH / 'data.json.gz', 'wt') as output_file:
         for row in processed:
             output_file.write(json.dumps(row) + '\n')
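
For reference, here is a minimal self-contained sketch of the pattern the new run() uses: cap a generator with islice, then fan the items out to worker processes with multiprocessing.Pool.map. The names squared() and numbers() are hypothetical stand-ins for process() and get_records(); the real worker, fetch_process_warc_records, is defined elsewhere in this repo and not shown here.

# A minimal sketch of the fan-out pattern adopted in this commit, using
# hypothetical stand-ins: squared() plays the role of process(), and
# numbers() the role of get_records().
import multiprocessing
from itertools import islice


def squared(n):
    # Must be a top-level function: Pool pickles it to send to workers.
    return n * n


def numbers():
    # Unbounded generator standing in for get_records().
    i = 0
    while True:
        yield i
        i += 1


if __name__ == '__main__':  # guard required when workers are spawned
    # islice caps the unbounded stream, mirroring islice(get_records(), 1000);
    # pool.map consumes the iterable and returns the full result list.
    with multiprocessing.Pool(4) as pool:
        results = pool.map(squared, islice(numbers(), 1000))
    print(results[:5])  # -> [0, 1, 4, 9, 16]

Two details of the diff worth noting: pool.map materializes its whole result as an in-memory list, which is one reason the input is capped at 1000 records; and because process() wraps each record in a list, each element of processed (and so each line written to data.json.gz) is a JSON array of the records extracted from one input record, rather than a single record as before.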