Run multiple processes in parallel
parent 34dc50a6ed · commit 16a8356a23
2 changed files with 35 additions and 7 deletions
First changed file (filename not captured in this view):

@@ -2,7 +2,9 @@ import gzip
 import json
 import os
 from glob import glob
+from multiprocessing import Process, Lock
 from pathlib import Path
+from time import sleep
 
 from extract_process import fetch_process_warc_records
 from fsqueue import FSQueue, GzipJsonRowSerializer
@@ -12,6 +14,8 @@ EXTRACTS_PATH = DATA_DIR / 'extracts'
 
 ARCHIVE_INFO_GLOB = 'outputs/records/*.gz'
 
+NUM_PROCESSES = 8
+
 
 def get_records():
     for path in glob(ARCHIVE_INFO_GLOB):
@@ -25,17 +29,18 @@ def process(record):
     return list(fetch_process_warc_records([record]))
 
 
-def run():
+def run(lock: Lock):
     input_queue = FSQueue(DATA_DIR, 'records', GzipJsonRowSerializer())
     output_queue = FSQueue(DATA_DIR, 'search-items', GzipJsonRowSerializer())
 
-    input_queue.unlock_all()
-
     while True:
-        queue_item = input_queue.get()
+        with lock:
+            queue_item = input_queue.get()
         if queue_item is None:
             print("All finished, stopping:", os.getpid())
             break
         item_id, records = queue_item
         print("Got item: ", item_id, os.getpid())
         search_items = []
         for record in records:
             search_items += list(fetch_process_warc_records([record]))
@@ -44,5 +49,19 @@ def run():
         input_queue.done(item_id)
 
 
+def run_multiprocessing():
+    input_queue = FSQueue(DATA_DIR, 'records', GzipJsonRowSerializer())
+    input_queue.unlock_all()
+    processes = []
+    lock = Lock()
+    for i in range(NUM_PROCESSES):
+        new_process = Process(target=run, args=(lock,))
+        new_process.start()
+        processes.append(new_process)
+
+    for running_process in processes:
+        running_process.join()
+
+
 if __name__ == '__main__':
-    run()
+    run_multiprocessing()
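The shape of the change: run() becomes a worker loop that any process can execute, run_multiprocessing() does the one-time setup (unlock_all, creating the Lock, spawning and joining the workers), and the Lock serializes FSQueue.get() so that no two workers claim the same queue item; that is presumably also why unlock_all() now lives in run_multiprocessing(), running once before any worker starts. Below is a minimal, self-contained sketch of the same pattern; the directory-of-files queue and the names WORK_DIR, take_next and worker are illustrative stand-ins, not this project's FSQueue API.

import os
from multiprocessing import Process, Lock
from pathlib import Path

WORK_DIR = Path('toy-queue')  # hypothetical stand-in for the FSQueue data directory
NUM_PROCESSES = 4


def take_next(lock):
    # The lock makes the list-then-rename sequence atomic across workers,
    # mirroring the commit's `with lock: queue_item = input_queue.get()`.
    with lock:
        pending = sorted(WORK_DIR.glob('*.pending'))
        if not pending:
            return None
        claimed = pending[0].with_suffix('.locked')
        pending[0].rename(claimed)
        return claimed


def worker(lock):
    while True:
        item = take_next(lock)
        if item is None:
            print("All finished, stopping:", os.getpid())
            break
        print("Got item:", item.name, os.getpid())
        item.rename(item.with_suffix('.done'))  # real processing would happen here


if __name__ == '__main__':
    WORK_DIR.mkdir(exist_ok=True)
    for i in range(10):  # seed some toy work items
        (WORK_DIR / f'item-{i}.pending').touch()

    lock = Lock()
    processes = [Process(target=worker, args=(lock,)) for _ in range(NUM_PROCESSES)]
    for p in processes:
        p.start()
    for p in processes:
        p.join()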
fsqueue.py (13 changes):
@@ -91,13 +91,22 @@ class FSQueue:
         Get the next priority item from the queue, returning the item ID and the object
         """
 
-        paths = sorted(Path(self._get_dir(FSState.READY)).iterdir(), key=os.path.getmtime)
+        directory = self._get_dir(FSState.READY)
+        print("Getting directory", directory)
+        paths = list(Path(directory).iterdir())
+        print("Top paths", paths[:10])
 
         for path in paths:
             # Try and lock the file
-            self._move(path.name, FSState.READY, FSState.LOCKED)
+            try:
+                print("Moving file", path.name)
+                self._move(path.name, FSState.READY, FSState.LOCKED)
+            except FileNotFoundError:
+                print("File not found", path.name)
+                continue
 
             with open(self._get_path(FSState.LOCKED, path.name), 'rb') as item_file:
+                print("Opening file", path.name)
                 return path.name, self.serializer.deserialize(item_file.read())
 
     def done(self, item_id: str):
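This is the FSQueue.get() side of the same race: with several workers scanning the READY directory at once, a file can vanish between being listed and being renamed into LOCKED (and the old sorted(..., key=os.path.getmtime) could likewise hit a vanished file mid-sort), so losing the race now surfaces as FileNotFoundError and the loop simply tries the next candidate. Here is a sketch of the claim-by-rename idea, assuming _move renames files between per-state directories as the READY/LOCKED names suggest; READY_DIR, LOCKED_DIR and try_claim are illustrative names, not FSQueue's API.

import os
from pathlib import Path

READY_DIR = Path('queue/ready')
LOCKED_DIR = Path('queue/locked')


def try_claim(name):
    try:
        # os.rename is atomic on POSIX: when several processes race to claim
        # the same file, exactly one rename succeeds and the losers see
        # FileNotFoundError because the source path is already gone.
        os.rename(READY_DIR / name, LOCKED_DIR / name)
    except FileNotFoundError:
        return None  # another worker claimed it first
    return LOCKED_DIR / name


def next_item():
    for path in READY_DIR.iterdir():
        claimed = try_claim(path.name)
        if claimed is None:
            continue
        with open(claimed, 'rb') as item_file:
            return path.name, item_file.read()
    return None  # queue drained

Because the rename itself is atomic, at most one process can win a given file even without the caller's Lock; the Lock in the extraction script and this try/except guard against the same double-claim problem from two directions.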