Delete documents from the database once they have been preprocessed, to save space
This commit is contained in:
parent
f9fefa0b62
commit
9482ae5028
3 changed files with 15 additions and 0 deletions
|
@ -2,6 +2,7 @@
|
|||
Script that updates data in a background process.
|
||||
"""
|
||||
from logging import getLogger
|
||||
from time import sleep
|
||||
|
||||
from mwmbl.indexer import historical
|
||||
from mwmbl.indexer.preprocess import run_preprocessing
|
||||
|
@ -26,3 +27,4 @@ def run(index_path: str):
|
|||
run_update(index_path)
|
||||
except Exception:
|
||||
logger.exception("Error running index update")
|
||||
sleep(10)
|
||||
|
|
|
@ -128,6 +128,15 @@ class IndexDatabase:
|
|||
results = cursor.fetchall()
|
||||
return [Document(title, url, extract, score) for url, title, extract, score in results]
|
||||
|
||||
def clear_documents_for_preprocessing(self) -> int:
    """
    Delete every row of ``documents`` whose status equals
    ``DocumentStatus.PREPROCESSING``.

    Returns:
        The ``rowcount`` reported by the cursor after the DELETE ran.
    """
    preprocessing_status = DocumentStatus.PREPROCESSING.value
    delete_statement = f"""
        DELETE FROM documents WHERE status = {preprocessing_status}
        """

    with self.connection.cursor() as cur:
        cur.execute(delete_statement)
        # Read the count while the cursor is still open.
        num_deleted = cur.rowcount
    return num_deleted
|
||||
|
||||
def queue_documents_for_page(self, urls_and_page_indexes: list[tuple[str, int]]):
|
||||
sql = """
|
||||
INSERT INTO document_pages (url, page) values %s
|
||||
|
|
|
@ -33,6 +33,10 @@ def run_update(index_path):
|
|||
break
|
||||
print(f"Not enough space, adding {len(documents)}")
|
||||
index_db.clear_queued_documents_for_page(i)
|
||||
# All preprocessed documents should now have been indexed
|
||||
# Clear documents that have now been preprocessed and indexed
|
||||
num_cleared = index_db.clear_documents_for_preprocessing()
|
||||
print(f"Indexed {num_cleared} documents")
|
||||
|
||||
|
||||
def run(index_path):
|
||||
|
|
Loading…
Reference in a new issue