Browse Source

Merge pull request #57 from mwmbl/clear-indexed-documents

Delete documents that have been preprocessed from the database to sav…
Daoud Clarke 3 years ago
parent
commit
4b5df76ca5
3 changed files with 15 additions and 0 deletions
  1. 2 0
      mwmbl/background.py
  2. 9 0
      mwmbl/indexer/indexdb.py
  3. 4 0
      mwmbl/indexer/update_pages.py

+ 2 - 0
mwmbl/background.py

@@ -2,6 +2,7 @@
 Script that updates data in a background process.
 """
 from logging import getLogger
+from time import sleep
 
 from mwmbl.indexer import historical
 from mwmbl.indexer.preprocess import run_preprocessing
@@ -26,3 +27,4 @@ def run(index_path: str):
             run_update(index_path)
         except Exception:
             logger.exception("Error running index update")
+        sleep(10)

+ 9 - 0
mwmbl/indexer/indexdb.py

@@ -128,6 +128,15 @@ class IndexDatabase:
             results = cursor.fetchall()
             return [Document(title, url, extract, score) for url, title, extract, score in results]
 
+    def clear_documents_for_preprocessing(self) -> int:  # Delete every `documents` row whose status is PREPROCESSING; returns the cursor's rowcount.
+        sql = f"""
+        DELETE FROM documents WHERE status = {DocumentStatus.PREPROCESSING.value}
+        """
+
+        with self.connection.cursor() as cursor:  # cursor is used for this single statement only
+            cursor.execute(sql)  # sql interpolates only the enum constant above; this method takes no caller-supplied values
+            return cursor.rowcount  # per DB-API, the number of rows affected by the DELETE
+
     def queue_documents_for_page(self, urls_and_page_indexes: list[tuple[str, int]]):
         sql = """
         INSERT INTO document_pages (url, page) values %s

+ 4 - 0
mwmbl/indexer/update_pages.py

@@ -33,6 +33,10 @@ def run_update(index_path):
                                 break
                             print(f"Not enough space, adding {len(documents)}")
                 index_db.clear_queued_documents_for_page(i)
+            # All preprocessed documents should now have been indexed
+            # Clear documents that have now been preprocessed and indexed
+            num_cleared = index_db.clear_documents_for_preprocessing()
+            print(f"Indexed {num_cleared} documents")
 
 
 def run(index_path):