Queue new batches for indexing

This commit is contained in:
Daoud Clarke 2022-06-29 22:49:24 +01:00
parent b8c495bda8
commit 2b52b50569
3 changed files with 15 additions and 14 deletions

View file

@@ -17,6 +17,8 @@ from mwmbl.crawler.batch import Batch, NewBatchRequest, HashedBatch
from mwmbl.crawler.urls import URLDatabase, FoundURL, URLStatus
from mwmbl.database import Database
from mwmbl.hn_top_domains_filtered import DOMAINS
from mwmbl.indexer.indexdb import IndexDatabase
from mwmbl.tinysearchengine.indexer import Document
APPLICATION_KEY = os.environ['MWMBL_APPLICATION_KEY']
KEY_ID = os.environ['MWMBL_KEY_ID']
@@ -93,6 +95,7 @@ def create_batch(batch: Batch):
upload(data, filename)
record_urls_in_database(batch, user_id_hash, now)
queue_batch(batch)
global last_batch
last_batch = hashed_batch
@@ -271,3 +274,13 @@ def status():
return {
'status': 'ok'
}
def queue_batch(batch: HashedBatch):
    """Queue the documents from a crawled batch for later indexing.

    Items whose ``content`` is ``None`` (nothing was extracted) are
    skipped; every queued document currently gets a fixed score of 1.
    """
    # TODO: get the score from the URLs database
    # TODO: also queue documents for batches sent through the API
    queued_docs = []
    for item in batch.items:
        if item.content is None:
            continue
        queued_docs.append(
            Document(item.content.title, item.url, item.content.extract, 1))
    with Database() as db:
        IndexDatabase(db.connection).queue_documents(queued_docs)

View file

@@ -96,7 +96,7 @@ class URLDatabase:
urls_to_insert = new_urls | locked_urls
if len(urls_to_insert) != len(input_urls):
print(f"Only got {len(locked_urls)} instead of {len(input_urls)}")
print(f"Only got {len(locked_urls)} instead of {len(input_urls)} - {len(new_urls)} new")
sorted_urls = sorted(found_urls, key=lambda x: x.url)
data = [

View file

@@ -7,15 +7,13 @@ import traceback
from multiprocessing.pool import ThreadPool
from time import sleep
import requests
from pydantic import ValidationError
from mwmbl.crawler.app import create_historical_batch
from mwmbl.crawler.app import create_historical_batch, queue_batch
from mwmbl.crawler.batch import HashedBatch
from mwmbl.database import Database
from mwmbl.indexer.indexdb import IndexDatabase, BatchStatus
from mwmbl.retry import retry_requests
from mwmbl.tinysearchengine.indexer import Document
NUM_THREADS = 5
@@ -56,16 +54,6 @@ def retrieve_batch(url):
return len(batch.items)
def queue_batch(batch: HashedBatch):
    """Queue the documents in *batch* for indexing.

    Items without extracted content are skipped; each queued document
    gets a fixed score of 1.

    NOTE(review): this definition is the diff-removal side — the same
    function now lives in mwmbl/crawler/app.py (see the addition hunk
    in this commit).
    """
    # TODO: get the score from the URLs database
    # TODO: also queue documents for batches sent through the API
    documents = [Document(item.content.title, item.url, item.content.extract, 1)
                 for item in batch.items if item.content is not None]
    with Database() as db:
        index_db = IndexDatabase(db.connection)
        index_db.queue_documents(documents)
def run():
while True:
try: