Fixes to mwmbl API for changes to the index

This commit is contained in:
Daoud Clarke 2022-02-22 22:27:02 +00:00
parent ae3b334a7f
commit 04a33a134b
3 changed files with 12 additions and 74 deletions

View file

@ -1,12 +0,0 @@
# Config for bootstrapping tinysearchengine.
# Follows the schema/model defined by mwmbl.tinysearchengine.config.ConfigModel

# Server settings (ServerConfigModel).
server_config:
  host: "0.0.0.0"
  port: 8080
  log_level: "info"

# Index settings (IndexConfigModel).
index_config:
  index_path: data/index.tinysearch
  num_pages: 76800
  page_size: 4096

View file

@ -1,13 +1,12 @@
import logging
import argparse
import logging
import pandas as pd
import uvicorn
from mwmbl.tinysearchengine import create_app
from mwmbl.tinysearchengine.completer import Completer
from mwmbl.tinysearchengine.indexer import TinyIndex, NUM_PAGES, PAGE_SIZE, Document
from mwmbl.tinysearchengine.config import parse_config_file
from mwmbl.tinysearchengine.indexer import TinyIndex, Document
from mwmbl.tinysearchengine.rank import Ranker
logging.basicConfig()
@ -16,7 +15,8 @@ logging.basicConfig()
def setup_args(argv=None):
    """Parse the command-line arguments for the tinysearchengine server.

    Args:
        argv: Optional list of argument strings to parse instead of the
            process command line. ``None`` (the default) makes argparse read
            ``sys.argv[1:]``, which is what existing callers get.

    Returns:
        argparse.Namespace with ``config``, ``index`` and ``terms``
        attributes. Every option is declared ``required=True``, so argparse
        exits with a usage error when one is missing.
    """
    # NOTE(review): this listing reads like a diff whose +/- markers were
    # stripped, so --config may be a leftover of the previous revision --
    # confirm before relying on it.
    parser = argparse.ArgumentParser(description="mwmbl-tinysearchengine")
    parser.add_argument("--config", help="Path to tinysearchengine's yaml config.", required=True)
    parser.add_argument("--index", help="Path to the tinysearchengine index file", required=True)
    parser.add_argument("--terms", help="Path to the tinysearchengine terms CSV file", required=True)
    return parser.parse_args(argv)
@ -30,30 +30,20 @@ def main():
* Initialize a FastAPI app instance
* Starts uvicorn server using app instance
"""
config, tiny_index = get_config_and_index()
args = setup_args()
# Load term data
terms = pd.read_csv(config.terms_path)
terms = pd.read_csv(args.terms)
completer = Completer(terms)
ranker = Ranker(tiny_index, completer)
with TinyIndex(item_factory=Document, index_path=args.index) as tiny_index:
ranker = Ranker(tiny_index, completer)
# Initialize FastApi instance
app = create_app.create(ranker)
# Initialize FastApi instance
app = create_app.create(ranker)
# Initialize uvicorn server using global app instance and server config params
uvicorn.run(app, **config.server_config.dict())
def get_config_and_index():
    """Parse the config file named on the command line and build the index.

    Returns:
        A ``(config, tiny_index)`` tuple: the object returned by
        ``parse_config_file`` and a ``TinyIndex`` constructed from that
        config's ``index_config`` section.
    """
    cli_args = setup_args()
    config = parse_config_file(config_filename=cli_args.config)

    # The index_config section supplies the TinyIndex constructor keywords.
    index_kwargs = config.index_config.dict()
    return config, TinyIndex(item_factory=Document, **index_kwargs)
# Initialize uvicorn server using global app instance and server config params
uvicorn.run(app, host="0.0.0.0", port=8080)
if __name__ == "__main__":

View file

@ -1,40 +0,0 @@
import pathlib
import yaml
from pydantic import BaseModel, StrictInt, StrictStr, Field
class ServerConfigModel(BaseModel):
    """Server section of the config; every field has a default."""

    # Address the server binds to.
    host: StrictStr = "0.0.0.0"
    # Port the server listens on.
    port: StrictInt = 8080
    # Log level name.
    log_level: StrictStr = "info"
class IndexConfigModel(BaseModel):
    """Index section of the config; every field has a default."""

    # Location of the index file.
    index_path: StrictStr = "data/index.tinysearch"
    # Number of pages in the index.
    num_pages: StrictInt = 25600
    # Size of one page -- presumably in bytes; TODO confirm against TinyIndex.
    page_size: StrictInt = 4096
class ConfigModel(BaseModel):
    """Top-level config: server section, index section and terms file path."""

    # Sections missing from the input are built from their own defaults.
    server_config: ServerConfigModel = Field(default_factory=ServerConfigModel)
    index_config: IndexConfigModel = Field(default_factory=IndexConfigModel)
    # Path of the terms CSV file.
    terms_path: StrictStr = "data/mwmbl-crawl-terms.csv"
def parse_config_file(config_filename: str) -> ConfigModel:
    """Read a yaml config file and turn it into a ConfigModel.

    Args:
        config_filename: Path to the yaml config file.

    Returns:
        ConfigModel built from the file's top-level mapping. An empty (or
        comment-only) file yields a ConfigModel made entirely of defaults.

    Raises:
        ValueError: If ``config_filename`` does not point to an existing file.
    """
    if not pathlib.Path(config_filename).is_file():
        raise ValueError(
            f"config_filename: {config_filename} is not a file. Please check if it exists."
        )

    # NOTE(review): yaml.Loader can construct arbitrary Python objects. That
    # is acceptable for an operator-supplied config, but switch to
    # yaml.safe_load if this path can ever come from an untrusted source.
    with open(config_filename) as f:
        config = yaml.load(f, yaml.Loader)

    # An empty document loads as None, which cannot be **-unpacked; treat it
    # as "no overrides" so the model defaults apply instead of a TypeError.
    if config is None:
        config = {}
    return ConfigModel(**config)
if __name__ == "__main__":
    # Manual check: run from the root of the repo with
    #   python -m mwmbl.tinysearchengine.config
    # so that the relative config path below resolves.
    settings_dict = parse_config_file(config_filename="config/tinysearchengine.yaml").dict()
    print(settings_dict)