diff --git a/Dockerfile b/Dockerfile
index 094b6ab..7377746 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -41,4 +41,4 @@ VOLUME ["/data"]
 EXPOSE 5000
 
 # Using the mwmbl-tinysearchengine binary/entrypoint which comes packaged with mwmbl
-CMD ["/venv/bin/mwmbl-tinysearchengine"]
+CMD ["/venv/bin/mwmbl-tinysearchengine", "--num-pages", "10240000", "--background"]
diff --git a/README.md b/README.md
index d45e819..3b15282 100644
--- a/README.md
+++ b/README.md
@@ -120,15 +120,27 @@ Development
 ===========
 
 ### Using Docker
+
+WARNING: this uses production settings and creates a **very** large index (around 40Gb)
+
 1. Create a new folder called `data` in the root of the repository
-2. Download the [index file](https://storage.googleapis.com/mwmbl/index.tinysearch) and place it the new data folder
-3. Run `$ docker build . -t mwmbl`
-4. Run `$ docker run -p 8080:8080 mwmbl`
+2. Run `$ docker build . -t mwmbl`
+3. Run `$ docker run -p 8080:8080 mwmbl`
 
 ### Local Testing
-1. Create and activate a python (3.10) environment using any tool you like e.g. poetry,venv, conda etc.
-2. Run `$ pip install .`
-3. Run `$ mwmbl-tinysearchengine --config config/tinysearchengine.yaml`
+
+This will run against a local test database without running background
+tasks to update batches etc.
+
+This is the simplest way to configure postgres, but you can set it up
+how you like as long as the `DATABASE_URL` you give is correct for
+your configuration.
+
+1. Install postgres and create a user for your current username
+2. Install [poetry](https://python-poetry.org/docs/#installation)
+3. Run `poetry install` to install dependencies
+4. Run `poetry shell` in the root directory to enter the virtual environment
+5. Run `$ DATABASE_URL="postgres://username@" python -m mwmbl.main` replacing "username" with your username.
 
 Frequently Asked Question
 =========================
diff --git a/devdata/index-v2.tinysearch b/devdata/index-v2.tinysearch
new file mode 100644
index 0000000..f00dfd6
Binary files /dev/null and b/devdata/index-v2.tinysearch differ
diff --git a/mwmbl/indexer/batch_cache.py b/mwmbl/indexer/batch_cache.py
index e173e23..8081c28 100644
--- a/mwmbl/indexer/batch_cache.py
+++ b/mwmbl/indexer/batch_cache.py
@@ -6,6 +6,7 @@ We store them in a directory on the local machine.
 import gzip
 import json
 import os
+from logging import getLogger
 from multiprocessing.pool import ThreadPool
 from pathlib import Path
 from tempfile import NamedTemporaryFile
@@ -19,6 +20,9 @@ from mwmbl.indexer.indexdb import IndexDatabase, BatchStatus
 from mwmbl.retry import retry_requests
 
 
+logger = getLogger(__name__)
+
+
 class BatchCache:
     num_threads = 20
 
@@ -30,7 +34,11 @@ class BatchCache:
         batches = {}
         for url in batch_urls:
             path = self.get_path_from_url(url)
-            data = gzip.GzipFile(path).read()
+            try:
+                data = gzip.GzipFile(path).read()
+            except FileNotFoundError:
+                logger.exception(f"Missing batch file: {path}")
+                continue
             batch = HashedBatch.parse_raw(data)
             batches[url] = batch
         return batches
diff --git a/mwmbl/indexer/index_crawl.py b/mwmbl/indexer/index_crawl.py
deleted file mode 100644
index a22f0d0..0000000
--- a/mwmbl/indexer/index_crawl.py
+++ /dev/null
@@ -1,49 +0,0 @@
-"""
-Index data crawled through the Mwmbl crawler.
-""" -import json -from logging import getLogger - -import spacy - -from mwmbl.indexer.fsqueue import FSQueue, GzipJsonBlobSerializer, FSQueueError -from mwmbl.indexer.index import index_titles_urls_and_extracts -from mwmbl.indexer.paths import INDEX_PATH, MWMBL_CRAWL_TERMS_PATH, TINYSEARCH_DATA_DIR, LINK_COUNT_PATH -from mwmbl.tinysearchengine.indexer import TinyIndex, Document, NUM_PAGES, PAGE_SIZE - - -logger = getLogger(__name__) - - -def index_mwmbl_crawl_data(): - nlp = spacy.load("en_core_web_sm") - titles_urls_and_extracts = get_mwmbl_crawl_titles_urls_and_extracts() - link_counts = json.load(open(LINK_COUNT_PATH)) - - TinyIndex.create(Document, INDEX_PATH, NUM_PAGES, PAGE_SIZE) - with TinyIndex(Document, INDEX_PATH, 'w') as indexer: - index_titles_urls_and_extracts(indexer, nlp, titles_urls_and_extracts, link_counts, MWMBL_CRAWL_TERMS_PATH) - - -def get_mwmbl_crawl_titles_urls_and_extracts(): - input_queue = FSQueue(TINYSEARCH_DATA_DIR, 'mwmbl-search-items', GzipJsonBlobSerializer()) - input_queue.unlock_all() - while True: - try: - next_item = input_queue.get() - except FSQueueError as e: - logger.exception(f'Error with item {e.item_id}') - input_queue.error(e.item_id) - continue - if next_item is None: - logger.info('Not more items to process, stopping') - break - item_id, item_data = next_item - logger.info(f'Processing item {item_id}') - for item in item_data['items']: - yield item['title'], item['url'], item['extract'] - input_queue.done(item_id) - - -if __name__ == '__main__': - index_mwmbl_crawl_data() diff --git a/mwmbl/indexer/process_batch.py b/mwmbl/indexer/process_batch.py index f9e12a7..f8a67bd 100644 --- a/mwmbl/indexer/process_batch.py +++ b/mwmbl/indexer/process_batch.py @@ -26,4 +26,4 @@ def run(batch_cache: BatchCache, start_status: BatchStatus, end_status: BatchSta process(batch_data.values()) - index_db.update_batch_status([batch.url for batch in batches], end_status) + index_db.update_batch_status(list(batch_data.keys()), end_status) diff --git a/mwmbl/main.py b/mwmbl/main.py index d1231d9..d5b450f 100644 --- a/mwmbl/main.py +++ b/mwmbl/main.py @@ -14,7 +14,7 @@ from mwmbl.indexer.batch_cache import BatchCache from mwmbl.indexer.paths import INDEX_NAME, BATCH_DIR_NAME from mwmbl.tinysearchengine import search from mwmbl.tinysearchengine.completer import Completer -from mwmbl.tinysearchengine.indexer import TinyIndex, Document, NUM_PAGES, PAGE_SIZE +from mwmbl.tinysearchengine.indexer import TinyIndex, Document, PAGE_SIZE from mwmbl.tinysearchengine.rank import HeuristicRanker logging.basicConfig(stream=sys.stdout, level=logging.INFO) @@ -24,9 +24,10 @@ MODEL_PATH = Path(__file__).parent / 'resources' / 'model.pickle' def setup_args(): - parser = argparse.ArgumentParser(description="mwmbl-tinysearchengine") - parser.add_argument("--data", help="Path to the tinysearchengine index file", default="/app/storage/") - parser.add_argument("--no-background", help="Disable running the background script to collect data", + parser = argparse.ArgumentParser(description="Mwmbl API server and background task processor") + parser.add_argument("--num-pages", help="Number of pages of memory (4096 bytes) to use for the index", default=2560) + parser.add_argument("--data", help="Path to the data folder for storing index and cached batches", default="./devdata") + parser.add_argument("--background", help="Enable running the background tasks to process batches", action='store_true') args = parser.parse_args() return args @@ -38,7 +39,7 @@ def run(): index_path = Path(args.data) / 
     try:
         existing_index = TinyIndex(item_factory=Document, index_path=index_path)
-        if existing_index.page_size != PAGE_SIZE or existing_index.num_pages != NUM_PAGES:
+        if existing_index.page_size != PAGE_SIZE or existing_index.num_pages != args.num_pages:
             print(f"Existing index page sizes ({existing_index.page_size}) and number of pages "
                   f"({existing_index.num_pages}) does not match - removing.")
             os.remove(index_path)
@@ -48,11 +49,11 @@ def run():
 
     if existing_index is None:
         print("Creating a new index")
-        TinyIndex.create(item_factory=Document, index_path=index_path, num_pages=NUM_PAGES, page_size=PAGE_SIZE)
+        TinyIndex.create(item_factory=Document, index_path=index_path, num_pages=args.num_pages, page_size=PAGE_SIZE)
 
     url_queue = Queue()
 
-    if not args.no_background:
+    if args.background:
         Process(target=background.run, args=(args.data, url_queue)).start()
 
     completer = Completer()
diff --git a/mwmbl/tinysearchengine/indexer.py b/mwmbl/tinysearchengine/indexer.py
index 253faab..8826426 100644
--- a/mwmbl/tinysearchengine/indexer.py
+++ b/mwmbl/tinysearchengine/indexer.py
@@ -13,7 +13,6 @@ VERSION = 1
 METADATA_CONSTANT = b'mwmbl-tiny-search'
 METADATA_SIZE = 4096
 
-NUM_PAGES = 10_240_000
 PAGE_SIZE = 4096
 
 
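Usage sketch (illustrative, not part of the patch). A local development run follows the updated README and the new setup_args() defaults: --data ./devdata and a 2560-page index (2560 x 4096 bytes, roughly 10MB), with background tasks disabled. Here "username" is a placeholder for the local postgres user created in README step 1:

$ DATABASE_URL="postgres://username@" python -m mwmbl.main

For comparison, the updated Dockerfile CMD runs the packaged entrypoint at production scale (10,240,000 pages, roughly 40GB) with background batch processing enabled:

$ /venv/bin/mwmbl-tinysearchengine --num-pages 10240000 --background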