Delete documents that have been preprocessed from the database to save space

This commit is contained in:
Daoud Clarke 2022-07-03 09:44:51 +01:00
parent f9fefa0b62
commit 9482ae5028
3 changed files with 15 additions and 0 deletions

View file

@ -2,6 +2,7 @@
Script that updates data in a background process.
"""
from logging import getLogger
from time import sleep
from mwmbl.indexer import historical
from mwmbl.indexer.preprocess import run_preprocessing
@ -26,3 +27,4 @@ def run(index_path: str):
run_update(index_path)
except Exception:
logger.exception("Error running index update")
sleep(10)

View file

@ -128,6 +128,15 @@ class IndexDatabase:
results = cursor.fetchall()
return [Document(title, url, extract, score) for url, title, extract, score in results]
def clear_documents_for_preprocessing(self) -> int:
    """Delete every row of ``documents`` whose status is PREPROCESSING.

    The status compared against is ``DocumentStatus.PREPROCESSING.value``,
    interpolated directly into the statement text.

    Returns:
        The cursor's ``rowcount`` after the DELETE has executed.
    """
    preprocessing_status = DocumentStatus.PREPROCESSING.value
    # The surrounding newlines are kept so the statement text sent to the
    # database is unchanged.
    delete_statement = f"\nDELETE FROM documents WHERE status = {preprocessing_status}\n"

    with self.connection.cursor() as cursor:
        cursor.execute(delete_statement)
        # Read the count while the cursor is still open.
        num_deleted = cursor.rowcount
    return num_deleted
def queue_documents_for_page(self, urls_and_page_indexes: list[tuple[str, int]]):
sql = """
INSERT INTO document_pages (url, page) values %s

View file

@ -33,6 +33,10 @@ def run_update(index_path):
break
print(f"Not enough space, adding {len(documents)}")
index_db.clear_queued_documents_for_page(i)
# All preprocessed documents should now have been indexed
# Clear documents that have now been preprocessed and indexed
num_cleared = index_db.clear_documents_for_preprocessing()
print(f"Indexed {num_cleared} documents")
def run(index_path):