Use multiprocessing
commit a76fd2d8f9 (parent 2d554b14e7)
1 changed file with 10 additions and 2 deletions
@@ -1,5 +1,6 @@
 import gzip
 import json
+import multiprocessing
 import os
 from glob import glob
 from itertools import islice
@@ -20,9 +21,16 @@ def get_records():
             yield json.loads(line)
 
+
+def process(record):
+    return list(fetch_process_warc_records([record]))
+
+
 def run():
-    records = get_records()
-    processed = fetch_process_warc_records(islice(records, 50))
+    records = islice(get_records(), 1000)
+
+    with multiprocessing.Pool(20) as pool:
+        processed = pool.map(process, records)
 
     with gzip.open(EXTRACTS_PATH / 'data.json.gz', 'wt') as output_file:
         for row in processed:
             output_file.write(json.dumps(row) + '\n')
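For context, here is a minimal, self-contained sketch of the pattern this commit introduces. get_records, fetch_process_warc_records, and EXTRACTS_PATH are defined elsewhere in the repository, so the stubs below are hypothetical stand-ins that just pass records through; the run() body mirrors the diff.

import gzip
import json
import multiprocessing
from itertools import islice
from pathlib import Path

# Hypothetical stand-in; the real value lives elsewhere in the repo.
EXTRACTS_PATH = Path('.')


def get_records():
    # Stand-in generator; the real one yields records parsed from disk.
    for i in range(10_000):
        yield {'id': i}


def fetch_process_warc_records(records):
    # Stand-in for the real WARC processing; echoes records back.
    for record in records:
        yield record


def process(record):
    # Each pool task handles exactly one record, so the per-record
    # generator is drained into a picklable list for the return trip.
    return list(fetch_process_warc_records([record]))


def run():
    records = islice(get_records(), 1000)

    # Pool.map consumes the iterator, fans records out to 20 worker
    # processes, and blocks until all results are back, in input order.
    with multiprocessing.Pool(20) as pool:
        processed = pool.map(process, records)

    with gzip.open(EXTRACTS_PATH / 'data.json.gz', 'wt') as output_file:
        for row in processed:
            output_file.write(json.dumps(row) + '\n')


if __name__ == '__main__':  # required under the spawn start method
    run()

Two design notes: Pool.map materializes every result in memory before any output is written, and because process() wraps each record in a list, each line of data.json.gz holds a JSON array of that record's extracts rather than a single object. Swapping pool.map for pool.imap would let the writer stream rows as workers finish, if memory became a concern.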