From 218d87365442ec9ae2e97e1428ca8a946804bd62 Mon Sep 17 00:00:00 2001 From: Daoud Clarke Date: Sat, 30 Jul 2022 09:27:44 +0100 Subject: [PATCH] Delete unused SQL --- mwmbl/indexer/historical.py | 2 +- mwmbl/indexer/index_batches.py | 2 +- mwmbl/indexer/indexdb.py | 29 ----------------------------- 3 files changed, 2 insertions(+), 31 deletions(-) diff --git a/mwmbl/indexer/historical.py b/mwmbl/indexer/historical.py index 8398f0a..9386680 100644 --- a/mwmbl/indexer/historical.py +++ b/mwmbl/indexer/historical.py @@ -4,7 +4,7 @@ from mwmbl.crawler.app import get_batches_for_date from mwmbl.database import Database from mwmbl.indexer.indexdb import BatchInfo, BatchStatus, IndexDatabase -DAYS = 10 +DAYS = 20 def run(): diff --git a/mwmbl/indexer/index_batches.py b/mwmbl/indexer/index_batches.py index 8256bef..eabc567 100644 --- a/mwmbl/indexer/index_batches.py +++ b/mwmbl/indexer/index_batches.py @@ -36,7 +36,7 @@ def run(batch_cache: BatchCache, index_path: str): index_db = IndexDatabase(db.connection) logger.info("Getting local batches") - batches = index_db.get_batches_by_status(BatchStatus.LOCAL, 1000) + batches = index_db.get_batches_by_status(BatchStatus.LOCAL, 10000) logger.info(f"Got {len(batches)} batch urls") if len(batches) == 0: return diff --git a/mwmbl/indexer/indexdb.py b/mwmbl/indexer/indexdb.py index 2c02317..8bff0ec 100644 --- a/mwmbl/indexer/indexdb.py +++ b/mwmbl/indexer/indexdb.py @@ -13,11 +13,6 @@ class BatchStatus(Enum): INDEXED = 2 -class DocumentStatus(Enum): - NEW = 0 - PREPROCESSING = 1 - - @dataclass class BatchInfo: url: str @@ -38,32 +33,8 @@ class IndexDatabase: ) """ - documents_sql = """ - CREATE TABLE IF NOT EXISTS documents ( - url VARCHAR PRIMARY KEY, - title VARCHAR NOT NULL, - extract VARCHAR NOT NULL, - score FLOAT NOT NULL, - status INT NOT NULL - ) - """ - - document_pages_sql = """ - CREATE TABLE IF NOT EXISTS document_pages ( - url VARCHAR NOT NULL, - page INT NOT NULL - ) - """ - - document_pages_index_sql = """ - CREATE INDEX IF NOT EXISTS document_pages_page_index ON document_pages (page) - """ - with self.connection.cursor() as cursor: cursor.execute(batches_sql) - cursor.execute(documents_sql) - cursor.execute(document_pages_sql) - cursor.execute(document_pages_index_sql) def record_batches(self, batch_infos: list[BatchInfo]): sql = """