From a72a08a7d9cb44213c6ef2c5c203a912cf64bc36 Mon Sep 17 00:00:00 2001 From: nitred Date: Wed, 29 Dec 2021 15:18:02 +0100 Subject: [PATCH] added config and binary/entrypoint for mwmbl.tinysearchengine - using pydantic to validate the config - added a default bootstrap config at config/tinysearchengine.yaml - refactored app.py to include parsing CLI argument using argparse - refactored app.py to use fewer global variables - added "mwmbl-tinysearchengine" binary/entrypoint in pyproject.toml - updated Dockerfile to work with these changes and added comments to it --- Dockerfile | 30 ++++++++++++-------- README.md | 10 +++++-- config/tinysearchengine.yaml | 12 ++++++++ mwmbl/tinysearchengine/app.py | 47 ++++++++++++++++++++++++++++---- mwmbl/tinysearchengine/config.py | 39 ++++++++++++++++++++++++++ poetry.lock | 45 +++++++++++++++++++++++++++++- pyproject.toml | 5 ++++ 7 files changed, 168 insertions(+), 20 deletions(-) create mode 100644 config/tinysearchengine.yaml create mode 100644 mwmbl/tinysearchengine/config.py diff --git a/Dockerfile b/Dockerfile index 3f525b8..533ef6d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -13,22 +13,30 @@ ENV PIP_DEFAULT_TIMEOUT=100 \ PIP_NO_CACHE_DIR=1 \ POETRY_VERSION=1.1.12 -# RUN apk add --no-cache gcc libffi-dev musl-dev postgresql-dev -RUN pip install "poetry==$POETRY_VERSION" +# Create a /venv directory & environment. +# This directory will be copied into the final stage of docker build. RUN python -m venv /venv -COPY pyproject.toml poetry.lock ./ -RUN poetry export -f requirements.txt | /venv/bin/pip install -r /dev/stdin +# Copy only the necessary files to build/install the python package +COPY pyproject.toml poetry.lock /app/ +COPY mwmbl /app/mwmbl -COPY . . -RUN poetry build && /venv/bin/pip install dist/*.whl +# Working directory is /app +# Use pip to install the mwmbl python package +# PEP 518, PEP 517 and others have allowed for a standardized python packaging API, which allows +# pip to be able to install poetry packages. 
+RUN /venv/bin/pip install pip --upgrade && \ + /venv/bin/pip install . FROM base as final -#RUN apk add --no-cache libffi libpq +# Copy only the required /venv directory from the builder image that contains mwmbl and its dependencies COPY --from=builder /venv /venv -COPY data /data -#COPY docker-entrypoint.sh wsgi.py ./ -#CMD ["./docker-entrypoint.sh"] -CMD ["/venv/bin/python", "-m", "mwmbl.tinysearchengine.app", "/data/index.tinysearch"] +# Working directory is /app +# Copying data and config into /app so that relative (default) paths in the config work +COPY data /app/data +COPY config /app/config + +# Using the mwmbl-tinysearchengine binary/entrypoint which comes packaged with mwmbl +CMD ["/venv/bin/mwmbl-tinysearchengine", "--config", "config/tinysearchengine.yaml"] diff --git a/README.md b/README.md index b5751a4..a5c2f63 100644 --- a/README.md +++ b/README.md @@ -115,9 +115,13 @@ author (email address is in the git commit history). Development =========== -Using Docker: - +### Using Docker 1. Create a new folder called `data` in the root of the repository 2. Download the [index file](https://storage.googleapis.com/mwmbl/index.tinysearch) and place it the new data folder 3. Run `$ docker build . -t mwmbl` -4. Run `$ docker run -p 8080:8080 mwmbl` \ No newline at end of file +4. Run `$ docker run -p 8080:8080 mwmbl` + +### Local Testing +1. Create and activate a Python (3.9) environment using any tool you like e.g. poetry, venv, conda, etc. +2. Run `$ pip install .` +3. Run `$ mwmbl-tinysearchengine --config config/tinysearchengine.yaml` \ No newline at end of file diff --git a/config/tinysearchengine.yaml b/config/tinysearchengine.yaml new file mode 100644 index 0000000..e4d29b3 --- /dev/null +++ b/config/tinysearchengine.yaml @@ -0,0 +1,12 @@ +# Config for bootstrapping tinysearchengine. 
+# Follows the schema/model defined by mwmbl.tinysearchengine.config.ConfigModel
+
+server_config:
+  host: "0.0.0.0"
+  port: 8080
+  log_level: "info"
+
+index_config:
+  index_path: data/index.tinysearch
+  num_pages: 25600
+  page_size: 4096
\ No newline at end of file
diff --git a/mwmbl/tinysearchengine/app.py b/mwmbl/tinysearchengine/app.py
index daaca87..b9c5a0c 100644
--- a/mwmbl/tinysearchengine/app.py
+++ b/mwmbl/tinysearchengine/app.py
@@ -1,17 +1,54 @@
 import logging
 import sys
-
+from typing import Optional
+import argparse
+from fastapi import FastAPI
 import uvicorn
 
 from mwmbl.tinysearchengine import create_app
 from mwmbl.tinysearchengine.indexer import TinyIndex, NUM_PAGES, PAGE_SIZE, Document
+from mwmbl.tinysearchengine.config import parse_config_file
 
 logging.basicConfig()
 
+app: Optional[FastAPI] = None
+
+
+def setup_args():
+    """Parse the CLI arguments; `--config` (path to the YAML config file) is required."""
+    parser = argparse.ArgumentParser(description="mwmbl-tinysearchengine")
+    parser.add_argument("--config", help="Path to tinysearchengine's yaml config.", required=True)
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    """Main entrypoint for tinysearchengine.
+
+    * Parses CLI args, then parses and validates the config file
+    * Initializes TinyIndex from the index_config section
+    * Stores the FastAPI app in the module-level `app` variable
+    * Runs uvicorn with that app object and the server_config section
+    """
+    args = setup_args()
+    config = parse_config_file(config_filename=args.config)
+
+    # Initialize TinyIndex using index config params
+    tiny_index = TinyIndex(
+        item_factory=Document,
+        **config.index_config.dict()
+    )
+
+    # Update global app variable
+    global app
+    app = create_app.create(tiny_index)
+
+    # Pass the app object itself: under `python -m` this module is __main__, so the import string "mwmbl.tinysearchengine.app:app" would re-import it with app still None
+    uvicorn.run(
+        app,
+        **config.server_config.dict()
+    )
 
-index_path = sys.argv[1]
-tiny_index = TinyIndex(Document, index_path, NUM_PAGES, PAGE_SIZE)
-app = create_app.create(tiny_index)
 
 if __name__ == "__main__":
-    uvicorn.run("mwmbl.tinysearchengine.app:app", host="0.0.0.0", port=8080, log_level="info")
+    main()
diff --git a/mwmbl/tinysearchengine/config.py b/mwmbl/tinysearchengine/config.py
new file mode 100644
index 0000000..c506117
--- /dev/null
+++ b/mwmbl/tinysearchengine/config.py
@@ -0,0 +1,39 @@
+import pathlib
+import yaml
+from pydantic import BaseModel, StrictInt, StrictStr, Field
+
+
+class ServerConfigModel(BaseModel):  # fields are forwarded as keyword arguments to uvicorn.run() by app.main()
+    host: StrictStr = "0.0.0.0"
+    port: StrictInt = 8080
+    log_level: StrictStr = "info"
+
+
+class IndexConfigModel(BaseModel):  # fields are forwarded as keyword arguments to TinyIndex() by app.main()
+    index_path: StrictStr = "data/index.tinysearch"
+    num_pages: StrictInt = 25600
+    page_size: StrictInt = 4096
+
+
+class ConfigModel(BaseModel):  # top-level schema of the YAML config; a missing section falls back to its defaults
+    server_config: ServerConfigModel = Field(default_factory=ServerConfigModel)
+    index_config: IndexConfigModel = Field(default_factory=IndexConfigModel)
+
+
+def parse_config_file(config_filename: str) -> ConfigModel:
+    """Load the YAML file at config_filename into a ConfigModel; raise ValueError if it is not a file."""
+    if not pathlib.Path(config_filename).is_file():
+        raise ValueError(
+            f"config_filename: {config_filename} is not a file. Please check if it exists."
+        )
+
+    with open(config_filename) as f:
+        config = yaml.safe_load(f)  # safe_load builds only plain YAML types, never arbitrary Python objects
+
+    return ConfigModel(**(config or {}))  # an empty file parses to None; fall back to the model defaults
+
+
+if __name__ == "__main__":
+    # Call this from the root of the repo using "python -m mwmbl.tinysearchengine.config"
+    config_model = parse_config_file(config_filename="config/tinysearchengine.yaml")
+    print(config_model.dict())
diff --git a/poetry.lock b/poetry.lock
index 9079903..8d5b756 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -460,6 +460,14 @@ category = "main"
 optional = false
 python-versions = "*"
 
+[[package]]
+name = "pyyaml"
+version = "6.0"
+description = "YAML parser and emitter for Python"
+category = "main"
+optional = false
+python-versions = ">=3.6"
+
 [[package]]
 name = "rapidfuzz"
 version = "1.8.3"
@@ -794,7 +802,7 @@ indexer = ["botocore", "boto3", "ujson", "warcio", "idna", "beautifulsoup4", "lx
 [metadata]
 lock-version = "1.1"
 python-versions = "^3.9"
-content-hash = "ed117b25eca3bb999bb98376c8c4e1ff1e9288a9c3bb35c8c0b1e9233ba0b5cc"
+content-hash = "b45f9def8dcadfaa6ce23560b51bdee7f81c335598f6cc84d11fd3d596e3da5b"
 
 [metadata.files]
 anyio = [
@@ -1359,6 +1367,41 @@ pytz = [
     {file = "pytz-2021.3-py2.py3-none-any.whl", hash = "sha256:3672058bc3453457b622aab7a1c3bfd5ab0bdae451512f6cf25f64ed37f5b87c"},
     {file = "pytz-2021.3.tar.gz", hash = "sha256:acad2d8b20a1af07d4e4c9d2e9285c5ed9104354062f275f3fcd88dcef4f1326"},
 ]
+pyyaml = [
+    {file = "PyYAML-6.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d4db7c7aef085872ef65a8fd7d6d09a14ae91f691dec3e87ee5ee0539d516f53"},
+    {file = "PyYAML-6.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9df7ed3b3d2e0ecfe09e14741b857df43adb5a3ddadc919a2d94fbdf78fea53c"},
+    {file = "PyYAML-6.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:77f396e6ef4c73fdc33a9157446466f1cff553d979bd00ecb64385760c6babdc"},
+    {file = "PyYAML-6.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a80a78046a72361de73f8f395f1f1e49f956c6be882eed58505a15f3e430962b"},
+    {file
= "PyYAML-6.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f84fbc98b019fef2ee9a1cb3ce93e3187a6df0b2538a651bfb890254ba9f90b5"}, + {file = "PyYAML-6.0-cp310-cp310-win32.whl", hash = "sha256:2cd5df3de48857ed0544b34e2d40e9fac445930039f3cfe4bcc592a1f836d513"}, + {file = "PyYAML-6.0-cp310-cp310-win_amd64.whl", hash = "sha256:daf496c58a8c52083df09b80c860005194014c3698698d1a57cbcfa182142a3a"}, + {file = "PyYAML-6.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:897b80890765f037df3403d22bab41627ca8811ae55e9a722fd0392850ec4d86"}, + {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50602afada6d6cbfad699b0c7bb50d5ccffa7e46a3d738092afddc1f9758427f"}, + {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:48c346915c114f5fdb3ead70312bd042a953a8ce5c7106d5bfb1a5254e47da92"}, + {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:98c4d36e99714e55cfbaaee6dd5badbc9a1ec339ebfc3b1f52e293aee6bb71a4"}, + {file = "PyYAML-6.0-cp36-cp36m-win32.whl", hash = "sha256:0283c35a6a9fbf047493e3a0ce8d79ef5030852c51e9d911a27badfde0605293"}, + {file = "PyYAML-6.0-cp36-cp36m-win_amd64.whl", hash = "sha256:07751360502caac1c067a8132d150cf3d61339af5691fe9e87803040dbc5db57"}, + {file = "PyYAML-6.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:819b3830a1543db06c4d4b865e70ded25be52a2e0631ccd2f6a47a2822f2fd7c"}, + {file = "PyYAML-6.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:473f9edb243cb1935ab5a084eb238d842fb8f404ed2193a915d1784b5a6b5fc0"}, + {file = "PyYAML-6.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0ce82d761c532fe4ec3f87fc45688bdd3a4c1dc5e0b4a19814b9009a29baefd4"}, + {file = "PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = 
"sha256:231710d57adfd809ef5d34183b8ed1eeae3f76459c18fb4a0b373ad56bedcdd9"}, + {file = "PyYAML-6.0-cp37-cp37m-win32.whl", hash = "sha256:c5687b8d43cf58545ade1fe3e055f70eac7a5a1a0bf42824308d868289a95737"}, + {file = "PyYAML-6.0-cp37-cp37m-win_amd64.whl", hash = "sha256:d15a181d1ecd0d4270dc32edb46f7cb7733c7c508857278d3d378d14d606db2d"}, + {file = "PyYAML-6.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0b4624f379dab24d3725ffde76559cff63d9ec94e1736b556dacdfebe5ab6d4b"}, + {file = "PyYAML-6.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:213c60cd50106436cc818accf5baa1aba61c0189ff610f64f4a3e8c6726218ba"}, + {file = "PyYAML-6.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9fa600030013c4de8165339db93d182b9431076eb98eb40ee068700c9c813e34"}, + {file = "PyYAML-6.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:277a0ef2981ca40581a47093e9e2d13b3f1fbbeffae064c1d21bfceba2030287"}, + {file = "PyYAML-6.0-cp38-cp38-win32.whl", hash = "sha256:d4eccecf9adf6fbcc6861a38015c2a64f38b9d94838ac1810a9023a0609e1b78"}, + {file = "PyYAML-6.0-cp38-cp38-win_amd64.whl", hash = "sha256:1e4747bc279b4f613a09eb64bba2ba602d8a6664c6ce6396a4d0cd413a50ce07"}, + {file = "PyYAML-6.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:055d937d65826939cb044fc8c9b08889e8c743fdc6a32b33e2390f66013e449b"}, + {file = "PyYAML-6.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e61ceaab6f49fb8bdfaa0f92c4b57bcfbea54c09277b1b4f7ac376bfb7a7c174"}, + {file = "PyYAML-6.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d67d839ede4ed1b28a4e8909735fc992a923cdb84e618544973d7dfc71540803"}, + {file = "PyYAML-6.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cba8c411ef271aa037d7357a2bc8f9ee8b58b9965831d9e51baf703280dc73d3"}, + {file = "PyYAML-6.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = 
"sha256:40527857252b61eacd1d9af500c3337ba8deb8fc298940291486c465c8b46ec0"}, + {file = "PyYAML-6.0-cp39-cp39-win32.whl", hash = "sha256:b5b9eccad747aabaaffbc6064800670f0c297e52c12754eb1d976c57e4f74dcb"}, + {file = "PyYAML-6.0-cp39-cp39-win_amd64.whl", hash = "sha256:b3d267842bf12586ba6c734f89d1f5b871df0273157918b0ccefa29deb05c21c"}, + {file = "PyYAML-6.0.tar.gz", hash = "sha256:68fb519c14306fec9720a2a5b45bc9f0c8d1b9c72adf45c37baedfcd949c35a2"}, +] rapidfuzz = [ {file = "rapidfuzz-1.8.3-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:0aa566e46bf1bf8e98e7a009fb0119c6601aece029af2e9566cfdf7662526c20"}, {file = "rapidfuzz-1.8.3-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:6854b2399fa39dbf480a55fe359e1012590b29e683035645dd8d56c8d367ca9b"}, diff --git a/pyproject.toml b/pyproject.toml index 8948e78..0dc2d10 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,6 +11,8 @@ zstandard = "^0.16.0" mmh3 = "^3.0.0" fastapi = "^0.70.1" uvicorn = "^0.16.0" +numpy = "==1.21.1" +pyyaml = "==6.0" # Optional dependencies do not get installed by default. Look under tool.poetry.extras section # to see which extras to use. botocore = {version= "==1.23.20", optional = true} @@ -66,3 +68,6 @@ indexer = [ [build-system] requires = ["poetry-core>=1.0.0"] build-backend = "poetry.core.masonry.api" + +[tool.poetry.scripts] +mwmbl-tinysearchengine = "mwmbl.tinysearchengine.app:main"