Don't depend on existing data

This commit is contained in:
Daoud Clarke 2022-06-17 23:12:22 +01:00
parent a003914e91
commit 1c7420e5fb
3 changed files with 179510 additions and 9 deletions

View file

@ -33,9 +33,8 @@ FROM base as final
# Copy only the required /venv directory from the builder image that contains mwmbl and its dependencies
COPY --from=builder /venv /venv
# Working directory is /app
# Copying data and config into /app so that relative (default) paths in the config work
# COPY data /app/data
# Set up a volume where the data will live
VOLUME ["/data"]
# Using the mwmbl-tinysearchengine binary/entrypoint which comes packaged with mwmbl
CMD ["/venv/bin/mwmbl-tinysearchengine", "--index", "data/index.tinysearch", "--terms", "data/mwmbl-crawl-terms.csv"]
CMD ["/venv/bin/mwmbl-tinysearchengine"]

View file

@ -1,5 +1,6 @@
import argparse
import logging
from pathlib import Path
import pandas as pd
import uvicorn
@ -9,17 +10,18 @@ from starlette.middleware.cors import CORSMiddleware
from mwmbl.crawler.app import router as crawler_router
from mwmbl.tinysearchengine import search
from mwmbl.tinysearchengine.completer import Completer
from mwmbl.tinysearchengine.indexer import TinyIndex, Document
from mwmbl.tinysearchengine.indexer import TinyIndex, Document, NUM_PAGES, PAGE_SIZE
from mwmbl.tinysearchengine.rank import HeuristicRanker
# Install the default logging configuration on the root logger.
logging.basicConfig()
# Location of the terms CSV shipped with the project: the `resources`
# directory that sits one level above the directory containing this module.
TERMS_PATH = Path(__file__).parent.parent / 'resources' / 'mwmbl-crawl-terms.csv'
def setup_args():
    """Parse the command-line arguments for mwmbl-tinysearchengine.

    Both options are optional so the server can start with no arguments:

    * ``--index``: path to the index file; defaults to
      ``/data/index.tinysearch``.
    * ``--terms``: path to a terms CSV file; when omitted, the bundled
      ``TERMS_PATH`` file is used instead.

    Returns:
        argparse.Namespace with ``index`` and ``terms`` attributes.
    """
    parser = argparse.ArgumentParser(description="mwmbl-tinysearchengine")
    # Each option string must be registered exactly once; argparse raises
    # an error when the same flag is added twice.
    parser.add_argument(
        "--index",
        help="Path to the tinysearchengine index file",
        default="/data/index.tinysearch",
    )
    # Kept (but no longer required) so existing invocations that still pass
    # --terms continue to work.
    parser.add_argument(
        "--terms",
        help="Path to the tinysearchengine terms CSV file",
        default=None,
    )
    args = parser.parse_args()
    if args.terms is None:
        # Fall back to the terms file shipped with the package.
        args.terms = TERMS_PATH
    return args
@ -35,8 +37,13 @@ def run():
"""
args = setup_args()
try:
TinyIndex.create(item_factory=Document, index_path=args.index, num_pages=NUM_PAGES, page_size=PAGE_SIZE)
except FileExistsError:
print("Index already exists")
# Load term data
terms = pd.read_csv(args.terms)
terms = pd.read_csv(TERMS_PATH)
completer = Completer(terms)
with TinyIndex(item_factory=Document, index_path=args.index) as tiny_index:

179495
resources/mwmbl-crawl-terms.csv Normal file

File diff suppressed because it is too large Load diff