|
@@ -3,6 +3,7 @@ Retrieve remote batches and store them in Postgres locally
|
|
"""
|
|
"""
|
|
import gzip
|
|
import gzip
|
|
import json
|
|
import json
|
|
|
|
+import traceback
|
|
from multiprocessing.pool import ThreadPool
|
|
from multiprocessing.pool import ThreadPool
|
|
from time import sleep
|
|
from time import sleep
|
|
|
|
|
|
@@ -10,10 +11,11 @@ import requests
|
|
|
|
|
|
from mwmbl.crawler.app import HashedBatch
|
|
from mwmbl.crawler.app import HashedBatch
|
|
from mwmbl.database import Database
|
|
from mwmbl.database import Database
|
|
-from mwmbl.indexdb import IndexDatabase, BatchStatus
|
|
|
|
|
|
+from mwmbl.indexer.indexdb import IndexDatabase, BatchStatus
|
|
|
|
+from mwmbl.retry import retry_requests
|
|
from mwmbl.tinysearchengine.indexer import Document
|
|
from mwmbl.tinysearchengine.indexer import Document
|
|
|
|
|
|
-NUM_THREADS = 10
|
|
|
|
|
|
+NUM_THREADS = 5
|
|
|
|
|
|
|
|
|
|
def retrieve_batches():
|
|
def retrieve_batches():
|
|
@@ -25,7 +27,7 @@ def retrieve_batches():
|
|
index_db = IndexDatabase(db.connection)
|
|
index_db = IndexDatabase(db.connection)
|
|
batches = index_db.get_batches_by_status(BatchStatus.REMOTE)
|
|
batches = index_db.get_batches_by_status(BatchStatus.REMOTE)
|
|
print("Batches", batches)
|
|
print("Batches", batches)
|
|
- urls = [batch.url for batch in batches][:10]
|
|
|
|
|
|
+ urls = [batch.url for batch in batches]
|
|
pool = ThreadPool(NUM_THREADS)
|
|
pool = ThreadPool(NUM_THREADS)
|
|
results = pool.imap_unordered(retrieve_batch, urls)
|
|
results = pool.imap_unordered(retrieve_batch, urls)
|
|
for result in results:
|
|
for result in results:
|
|
@@ -37,13 +39,13 @@ def retrieve_batches():
|
|
|
|
|
|
|
|
|
|
def retrieve_batch(url):
|
|
def retrieve_batch(url):
|
|
- data = json.loads(gzip.decompress(requests.get(url).content))
|
|
|
|
|
|
+ data = json.loads(gzip.decompress(retry_requests.get(url).content))
|
|
batch = HashedBatch.parse_obj(data)
|
|
batch = HashedBatch.parse_obj(data)
|
|
- queue_batch(url, batch)
|
|
|
|
|
|
+ queue_batch(batch)
|
|
return len(batch.items)
|
|
return len(batch.items)
|
|
|
|
|
|
|
|
|
|
-def queue_batch(batch_url: str, batch: HashedBatch):
|
|
|
|
|
|
+def queue_batch(batch: HashedBatch):
|
|
# TODO get the score from the URLs database
|
|
# TODO get the score from the URLs database
|
|
documents = [Document(item.content.title, item.url, item.content.extract, 1)
|
|
documents = [Document(item.content.title, item.url, item.content.extract, 1)
|
|
for item in batch.items if item.content is not None]
|
|
for item in batch.items if item.content is not None]
|
|
@@ -54,7 +56,11 @@ def queue_batch(batch_url: str, batch: HashedBatch):
|
|
|
|
|
|
def run():
|
|
def run():
|
|
while True:
|
|
while True:
|
|
- retrieve_batches()
|
|
|
|
|
|
+ try:
|
|
|
|
+ retrieve_batches()
|
|
|
|
+ except Exception as e:
|
|
|
|
+ print("Exception retrieving batch")
|
|
|
|
+ traceback.print_exception(type(e), e, e.__traceback__)
|
|
sleep(10)
|
|
sleep(10)
|
|
|
|
|
|
|
|
|