From 218d87365442ec9ae2e97e1428ca8a946804bd62 Mon Sep 17 00:00:00 2001 From: Daoud Clarke Date: Sat, 30 Jul 2022 09:27:44 +0100 Subject: [PATCH] Delete unused SQL --- mwmbl/indexer/historical.py | 2 +- mwmbl/indexer/index_batches.py | 2 +- mwmbl/indexer/indexdb.py | 29 ----------------------------- 3 files changed, 2 insertions(+), 31 deletions(-) diff --git a/mwmbl/indexer/historical.py b/mwmbl/indexer/historical.py index 8398f0a3f8ae9f0093610737e70f0fe332367cde..938668041a1cb46be3d123ec6142c281cca3ebf2 100644 --- a/mwmbl/indexer/historical.py +++ b/mwmbl/indexer/historical.py @@ -4,7 +4,7 @@ from mwmbl.crawler.app import get_batches_for_date from mwmbl.database import Database from mwmbl.indexer.indexdb import BatchInfo, BatchStatus, IndexDatabase -DAYS = 10 +DAYS = 20 def run(): diff --git a/mwmbl/indexer/index_batches.py b/mwmbl/indexer/index_batches.py index 8256befa9ba294720220a8bf48982d2836007a4a..eabc567e3c777156f5d1241c296cba4cf831570f 100644 --- a/mwmbl/indexer/index_batches.py +++ b/mwmbl/indexer/index_batches.py @@ -36,7 +36,7 @@ def run(batch_cache: BatchCache, index_path: str): index_db = IndexDatabase(db.connection) logger.info("Getting local batches") - batches = index_db.get_batches_by_status(BatchStatus.LOCAL, 1000) + batches = index_db.get_batches_by_status(BatchStatus.LOCAL, 10000) logger.info(f"Got {len(batches)} batch urls") if len(batches) == 0: return diff --git a/mwmbl/indexer/indexdb.py b/mwmbl/indexer/indexdb.py index 2c023173f0cd5429a59171f04a7ed8fedbab10fc..8bff0ec27685dba4942a750320868202e4a9acfe 100644 --- a/mwmbl/indexer/indexdb.py +++ b/mwmbl/indexer/indexdb.py @@ -13,11 +13,6 @@ class BatchStatus(Enum): INDEXED = 2 -class DocumentStatus(Enum): - NEW = 0 - PREPROCESSING = 1 - - @dataclass class BatchInfo: url: str @@ -38,32 +33,8 @@ class IndexDatabase: ) """ - documents_sql = """ - CREATE TABLE IF NOT EXISTS documents ( - url VARCHAR PRIMARY KEY, - title VARCHAR NOT NULL, - extract VARCHAR NOT NULL, - score FLOAT NOT NULL, - status INT NOT NULL - ) - """ - - document_pages_sql = """ - CREATE TABLE IF NOT EXISTS document_pages ( - url VARCHAR NOT NULL, - page INT NOT NULL - ) - """ - - document_pages_index_sql = """ - CREATE INDEX IF NOT EXISTS document_pages_page_index ON document_pages (page) - """ - with self.connection.cursor() as cursor: cursor.execute(batches_sql) - cursor.execute(documents_sql) - cursor.execute(document_pages_sql) - cursor.execute(document_pages_index_sql) def record_batches(self, batch_infos: list[BatchInfo]): sql = """