Make it easier to run mwmbl locally

Daoud Clarke 2022-12-07 20:01:31 +00:00
parent c0f89ba6c3
commit a50bc28436
8 changed files with 37 additions and 66 deletions


@@ -41,4 +41,4 @@ VOLUME ["/data"]
 EXPOSE 5000

 # Using the mwmbl-tinysearchengine binary/entrypoint which comes packaged with mwmbl
-CMD ["/venv/bin/mwmbl-tinysearchengine"]
+CMD ["/venv/bin/mwmbl-tinysearchengine", "--num-pages", "10240000", "--background"]


@@ -120,15 +120,27 @@ Development
 ===========

 ### Using Docker
-WARNING: this uses production settings and creates a **very** large index (around 40Gb)
 1. Create a new folder called `data` in the root of the repository
-2. Download the [index file](https://storage.googleapis.com/mwmbl/index.tinysearch) and place it the new data folder
-3. Run `$ docker build . -t mwmbl`
-4. Run `$ docker run -p 8080:8080 mwmbl`
+2. Run `$ docker build . -t mwmbl`
+3. Run `$ docker run -p 8080:8080 mwmbl`

 ### Local Testing
-1. Create and activate a python (3.10) environment using any tool you like e.g. poetry,venv, conda etc.
-2. Run `$ pip install .`
-3. Run `$ mwmbl-tinysearchengine --config config/tinysearchengine.yaml`
+This will run against a local test database without running background
+tasks to update batches etc.
+
+This is the simplest way to configure postgres, but you can set it up
+how you like as long as the `DATABASE_URL` you give is correct for
+your configuration.
+
+1. Install postgres and create a user for your current username
+2. Install [poetry](https://python-poetry.org/docs/#installation)
+3. Run `poetry install` to install dependencies
+4. Run `poetry shell` in the root directory to enter the virtual environment
+5. Run `$ DATABASE_URL="postgres://username@" python -m mwmbl.main`, replacing "username" with your username.
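One way to do step 1 on a typical Linux install (an assumption; the exact commands vary by platform) is `$ sudo -u postgres createuser $USER` followed by `$ sudo -u postgres createdb $USER`. A role and database named after your login are what let the `postgres://username@` URL in step 5 connect locally without a password.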
 Frequently Asked Questions
 =========================

BIN devdata/index-v2.tinysearch
Binary file not shown.


@@ -6,6 +6,7 @@ We store them in a directory on the local machine.
 import gzip
 import json
 import os
+from logging import getLogger
 from multiprocessing.pool import ThreadPool
 from pathlib import Path
 from tempfile import NamedTemporaryFile
@@ -19,6 +20,9 @@ from mwmbl.indexer.indexdb import IndexDatabase, BatchStatus
 from mwmbl.retry import retry_requests

+logger = getLogger(__name__)
+
+
 class BatchCache:
     num_threads = 20
@@ -30,7 +34,11 @@ class BatchCache:
         batches = {}
         for url in batch_urls:
             path = self.get_path_from_url(url)
-            data = gzip.GzipFile(path).read()
+            try:
+                data = gzip.GzipFile(path).read()
+            except FileNotFoundError:
+                logger.exception(f"Missing batch file: {path}")
+                continue
             batch = HashedBatch.parse_raw(data)
             batches[url] = batch
         return batches
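A standalone sketch of the new behaviour (the helper name and file paths below are hypothetical; only the try/except pattern comes from this change): a missing cache file is now logged and skipped rather than failing the whole load.

```python
import gzip
from logging import getLogger

logger = getLogger(__name__)


def read_cached(paths: list[str]) -> dict[str, bytes]:
    """Read gzipped batch files, skipping any that have gone missing."""
    batches = {}
    for path in paths:
        try:
            data = gzip.GzipFile(path).read()
        except FileNotFoundError:
            # One missing file no longer aborts the whole retrieval.
            logger.exception(f"Missing batch file: {path}")
            continue
        batches[path] = data
    return batches
```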


@@ -1,49 +0,0 @@
-"""
-Index data crawled through the Mwmbl crawler.
-"""
-import json
-from logging import getLogger
-
-import spacy
-
-from mwmbl.indexer.fsqueue import FSQueue, GzipJsonBlobSerializer, FSQueueError
-from mwmbl.indexer.index import index_titles_urls_and_extracts
-from mwmbl.indexer.paths import INDEX_PATH, MWMBL_CRAWL_TERMS_PATH, TINYSEARCH_DATA_DIR, LINK_COUNT_PATH
-from mwmbl.tinysearchengine.indexer import TinyIndex, Document, NUM_PAGES, PAGE_SIZE
-
-logger = getLogger(__name__)
-
-
-def index_mwmbl_crawl_data():
-    nlp = spacy.load("en_core_web_sm")
-    titles_urls_and_extracts = get_mwmbl_crawl_titles_urls_and_extracts()
-    link_counts = json.load(open(LINK_COUNT_PATH))
-
-    TinyIndex.create(Document, INDEX_PATH, NUM_PAGES, PAGE_SIZE)
-    with TinyIndex(Document, INDEX_PATH, 'w') as indexer:
-        index_titles_urls_and_extracts(indexer, nlp, titles_urls_and_extracts, link_counts, MWMBL_CRAWL_TERMS_PATH)
-
-
-def get_mwmbl_crawl_titles_urls_and_extracts():
-    input_queue = FSQueue(TINYSEARCH_DATA_DIR, 'mwmbl-search-items', GzipJsonBlobSerializer())
-    input_queue.unlock_all()
-
-    while True:
-        try:
-            next_item = input_queue.get()
-        except FSQueueError as e:
-            logger.exception(f'Error with item {e.item_id}')
-            input_queue.error(e.item_id)
-            continue
-        if next_item is None:
-            logger.info('Not more items to process, stopping')
-            break
-        item_id, item_data = next_item
-        logger.info(f'Processing item {item_id}')
-        for item in item_data['items']:
-            yield item['title'], item['url'], item['extract']
-        input_queue.done(item_id)
-
-
-if __name__ == '__main__':
-    index_mwmbl_crawl_data()


@@ -26,4 +26,4 @@ def run(batch_cache: BatchCache, start_status: BatchStatus, end_status: BatchStatus
     process(batch_data.values())

-    index_db.update_batch_status([batch.url for batch in batches], end_status)
+    index_db.update_batch_status(list(batch_data.keys()), end_status)


@@ -14,7 +14,7 @@ from mwmbl.indexer.batch_cache import BatchCache
 from mwmbl.indexer.paths import INDEX_NAME, BATCH_DIR_NAME
 from mwmbl.tinysearchengine import search
 from mwmbl.tinysearchengine.completer import Completer
-from mwmbl.tinysearchengine.indexer import TinyIndex, Document, NUM_PAGES, PAGE_SIZE
+from mwmbl.tinysearchengine.indexer import TinyIndex, Document, PAGE_SIZE
 from mwmbl.tinysearchengine.rank import HeuristicRanker

 logging.basicConfig(stream=sys.stdout, level=logging.INFO)
@@ -24,9 +24,10 @@ MODEL_PATH = Path(__file__).parent / 'resources' / 'model.pickle'

 def setup_args():
-    parser = argparse.ArgumentParser(description="mwmbl-tinysearchengine")
-    parser.add_argument("--data", help="Path to the tinysearchengine index file", default="/app/storage/")
-    parser.add_argument("--no-background", help="Disable running the background script to collect data",
+    parser = argparse.ArgumentParser(description="Mwmbl API server and background task processor")
+    parser.add_argument("--num-pages", help="Number of pages of memory (4096 bytes) to use for the index", default=2560, type=int)
+    parser.add_argument("--data", help="Path to the data folder for storing index and cached batches", default="./devdata")
+    parser.add_argument("--background", help="Enable running the background tasks to process batches",
                         action='store_true')
     args = parser.parse_args()
     return args
@@ -38,7 +39,7 @@ def run():
     index_path = Path(args.data) / INDEX_NAME
     try:
         existing_index = TinyIndex(item_factory=Document, index_path=index_path)
-        if existing_index.page_size != PAGE_SIZE or existing_index.num_pages != NUM_PAGES:
+        if existing_index.page_size != PAGE_SIZE or existing_index.num_pages != args.num_pages:
             print(f"Existing index page size ({existing_index.page_size}) and number of pages "
                   f"({existing_index.num_pages}) do not match - removing.")
             os.remove(index_path)
@@ -48,11 +49,11 @@ def run():
     if existing_index is None:
         print("Creating a new index")
-        TinyIndex.create(item_factory=Document, index_path=index_path, num_pages=NUM_PAGES, page_size=PAGE_SIZE)
+        TinyIndex.create(item_factory=Document, index_path=index_path, num_pages=args.num_pages, page_size=PAGE_SIZE)

     url_queue = Queue()

-    if not args.no_background:
+    if args.background:
         Process(target=background.run, args=(args.data, url_queue)).start()

     completer = Completer()
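Taken together with the README, a full local invocation using the new flags might look like `$ DATABASE_URL="postgres://username@" python -m mwmbl.main --num-pages 2560 --background` (illustrative only: `--num-pages 2560` just makes the default explicit, and `--background` enables batch processing).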


@@ -13,7 +13,6 @@ VERSION = 1
 METADATA_CONSTANT = b'mwmbl-tiny-search'
 METADATA_SIZE = 4096

-NUM_PAGES = 10_240_000
 PAGE_SIZE = 4096
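Removing the hard-coded NUM_PAGES (it now arrives via `--num-pages`) is what makes the small local index possible: with a fixed 4096-byte page, the index file size is simply num_pages × PAGE_SIZE. Checking the two values used in this commit:

```python
PAGE_SIZE = 4096  # bytes per index page, fixed in indexer.py

# Production value from the Dockerfile CMD: ~39 GiB, the "around 40Gb" index.
print(10_240_000 * PAGE_SIZE / 1024**3)  # 39.0625

# New local default for --num-pages: a 10 MiB index.
print(2_560 * PAGE_SIZE / 1024**2)       # 10.0
```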