Don't depend on existing data
This commit is contained in:
parent
a003914e91
commit
1c7420e5fb
3 changed files with 179510 additions and 9 deletions
|
@ -33,9 +33,8 @@ FROM base as final
|
||||||
# Copy only the required /venv directory from the builder image that contains mwmbl and its dependencies
|
# Copy only the required /venv directory from the builder image that contains mwmbl and its dependencies
|
||||||
COPY --from=builder /venv /venv
|
COPY --from=builder /venv /venv
|
||||||
|
|
||||||
# Working directory is /app
|
# Set up a volume where the data will live
|
||||||
# Copying data and config into /app so that relative (default) paths in the config work
|
VOLUME ["/data"]
|
||||||
# COPY data /app/data
|
|
||||||
|
|
||||||
# Using the mwmbl-tinysearchengine binary/entrypoint which comes packaged with mwmbl
|
# Using the mwmbl-tinysearchengine binary/entrypoint which comes packaged with mwmbl
|
||||||
CMD ["/venv/bin/mwmbl-tinysearchengine", "--index", "data/index.tinysearch", "--terms", "data/mwmbl-crawl-terms.csv"]
|
CMD ["/venv/bin/mwmbl-tinysearchengine"]
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
import argparse
|
import argparse
|
||||||
import logging
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import uvicorn
|
import uvicorn
|
||||||
|
@ -9,17 +10,18 @@ from starlette.middleware.cors import CORSMiddleware
|
||||||
from mwmbl.crawler.app import router as crawler_router
|
from mwmbl.crawler.app import router as crawler_router
|
||||||
from mwmbl.tinysearchengine import search
|
from mwmbl.tinysearchengine import search
|
||||||
from mwmbl.tinysearchengine.completer import Completer
|
from mwmbl.tinysearchengine.completer import Completer
|
||||||
from mwmbl.tinysearchengine.indexer import TinyIndex, Document
|
from mwmbl.tinysearchengine.indexer import TinyIndex, Document, NUM_PAGES, PAGE_SIZE
|
||||||
from mwmbl.tinysearchengine.rank import HeuristicRanker
|
from mwmbl.tinysearchengine.rank import HeuristicRanker
|
||||||
|
|
||||||
logging.basicConfig()
|
logging.basicConfig()
|
||||||
|
|
||||||
|
|
||||||
|
TERMS_PATH = Path(__file__).parent.parent / 'resources' / 'mwmbl-crawl-terms.csv'
|
||||||
|
|
||||||
|
|
||||||
def setup_args():
|
def setup_args():
|
||||||
"""Read all the args."""
|
|
||||||
parser = argparse.ArgumentParser(description="mwmbl-tinysearchengine")
|
parser = argparse.ArgumentParser(description="mwmbl-tinysearchengine")
|
||||||
parser.add_argument("--index", help="Path to the tinysearchengine index file", required=True)
|
parser.add_argument("--index", help="Path to the tinysearchengine index file", default="/data/index.tinysearch")
|
||||||
parser.add_argument("--terms", help="Path to the tinysearchengine terms CSV file", required=True)
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
return args
|
return args
|
||||||
|
|
||||||
|
@ -35,8 +37,13 @@ def run():
|
||||||
"""
|
"""
|
||||||
args = setup_args()
|
args = setup_args()
|
||||||
|
|
||||||
|
try:
|
||||||
|
TinyIndex.create(item_factory=Document, index_path=args.index, num_pages=NUM_PAGES, page_size=PAGE_SIZE)
|
||||||
|
except FileExistsError:
|
||||||
|
print("Index already exists")
|
||||||
|
|
||||||
# Load term data
|
# Load term data
|
||||||
terms = pd.read_csv(args.terms)
|
terms = pd.read_csv(TERMS_PATH)
|
||||||
completer = Completer(terms)
|
completer = Completer(terms)
|
||||||
|
|
||||||
with TinyIndex(item_factory=Document, index_path=args.index) as tiny_index:
|
with TinyIndex(item_factory=Document, index_path=args.index) as tiny_index:
|
||||||
|
|
179495
resources/mwmbl-crawl-terms.csv
Normal file
179495
resources/mwmbl-crawl-terms.csv
Normal file
File diff suppressed because it is too large
Load diff
Loading…
Reference in a new issue