Don't depend on existing data

2022-06-17 23:12:22 +01:00 · 2022-06-17 23:12:22 +01:00 · 1c7420e5fb
commit 1c7420e5fb
parent a003914e91
3 changed files with 179510 additions and 9 deletions
--- a/7
+++ b/7
@@ -33,9 +33,8 @@ FROM base as final
 # Copy only the required /venv directory from the builder image that contains mwmbl and its dependencies
 COPY --from=builder /venv /venv

-# Working directory is /app
-# Copying data and config into /app so that relative (default) paths in the config work
-# COPY data /app/data
+# Set up a volume where the data will live
+VOLUME ["/data"]

 # Using the mwmbl-tinysearchengine binary/entrypoint which comes packaged with mwmbl
-CMD ["/venv/bin/mwmbl-tinysearchengine", "--index", "data/index.tinysearch", "--terms", "data/mwmbl-crawl-terms.csv"]
+CMD ["/venv/bin/mwmbl-tinysearchengine"]
--- a/mwmbl/main.py
+++ b/mwmbl/main.py
@@ -1,5 +1,6 @@
 import argparse
 import logging
+from pathlib import Path

 import pandas as pd
 import uvicorn
@@ -9,17 +10,18 @@ from starlette.middleware.cors import CORSMiddleware
 from mwmbl.crawler.app import router as crawler_router
 from mwmbl.tinysearchengine import search
 from mwmbl.tinysearchengine.completer import Completer
-from mwmbl.tinysearchengine.indexer import TinyIndex, Document
+from mwmbl.tinysearchengine.indexer import TinyIndex, Document, NUM_PAGES, PAGE_SIZE
 from mwmbl.tinysearchengine.rank import HeuristicRanker

 logging.basicConfig()


+TERMS_PATH = Path(__file__).parent.parent / 'resources' / 'mwmbl-crawl-terms.csv'
+
+
 def setup_args():
-    """Read all the args."""
    parser = argparse.ArgumentParser(description="mwmbl-tinysearchengine")
-    parser.add_argument("--index", help="Path to the tinysearchengine index file", required=True)
-    parser.add_argument("--terms", help="Path to the tinysearchengine terms CSV file", required=True)
+    parser.add_argument("--index", help="Path to the tinysearchengine index file", default="/data/index.tinysearch")
    args = parser.parse_args()
    return args

@@ -35,8 +37,13 @@ def run():
    """
    args = setup_args()

+    try:
+        TinyIndex.create(item_factory=Document, index_path=args.index, num_pages=NUM_PAGES, page_size=PAGE_SIZE)
+    except FileExistsError:
+        print("Index already exists")
+
    # Load term data
-    terms = pd.read_csv(args.terms)
+    terms = pd.read_csv(TERMS_PATH)
    completer = Completer(terms)

    with TinyIndex(item_factory=Document, index_path=args.index) as tiny_index:
--- a/resources/mwmbl-crawl-terms.csv
+++ b/resources/mwmbl-crawl-terms.csv