Browse Source

Merge pull request #24 from nitred/config-and-entrypoint

added config and binary/entrypoint for mwmbl.tinysearchengine
Daoud Clarke 3 years ago
parent
commit
3d7e655ebc
7 changed files with 168 additions and 20 deletions
  1. 19 11
      Dockerfile
  2. 7 3
      README.md
  3. 12 0
      config/tinysearchengine.yaml
  4. 42 5
      mwmbl/tinysearchengine/app.py
  5. 39 0
      mwmbl/tinysearchengine/config.py
  6. 44 1
      poetry.lock
  7. 5 0
      pyproject.toml

+ 19 - 11
Dockerfile

@@ -13,22 +13,30 @@ ENV PIP_DEFAULT_TIMEOUT=100 \
     PIP_NO_CACHE_DIR=1 \
     PIP_NO_CACHE_DIR=1 \
     POETRY_VERSION=1.1.12
     POETRY_VERSION=1.1.12
 
 
-# RUN apk add --no-cache gcc libffi-dev musl-dev postgresql-dev
-RUN pip install "poetry==$POETRY_VERSION"
+# Create a /venv directory & environment.
+# This directory will be copied into the final stage of docker build.
 RUN python -m venv /venv
 RUN python -m venv /venv
 
 
-COPY pyproject.toml poetry.lock ./
-RUN poetry export -f requirements.txt | /venv/bin/pip install -r /dev/stdin
+# Copy only the necessary files to build/install the python package
+COPY pyproject.toml poetry.lock /app/
+COPY mwmbl /app/mwmbl
 
 
-COPY . .
-RUN poetry build && /venv/bin/pip install dist/*.whl
+# Working directory is /app
+# Use pip to install the mwmbl python package
+# PEP 517 and PEP 518 define a standardized Python build-system API, which lets
+# pip build and install Poetry-based packages directly, without Poetry itself being installed.
+RUN /venv/bin/pip install pip --upgrade && \
+    /venv/bin/pip install .
 
 
 FROM base as final
 FROM base as final
 
 
-#RUN apk add --no-cache libffi libpq
+# Copy only the required /venv directory from the builder image that contains mwmbl and its dependencies
 COPY --from=builder /venv /venv
 COPY --from=builder /venv /venv
-COPY data /data
-#COPY docker-entrypoint.sh wsgi.py ./
-#CMD ["./docker-entrypoint.sh"]
 
 
-CMD ["/venv/bin/python", "-m", "mwmbl.tinysearchengine.app", "/data/index.tinysearch"]
+# Working directory is /app
+# Copying data and config into /app so that relative (default) paths in the config work
+COPY data /app/data
+COPY config /app/config
+
+# Using the mwmbl-tinysearchengine binary/entrypoint which comes packaged with mwmbl
+CMD ["/venv/bin/mwmbl-tinysearchengine", "--config",  "config/tinysearchengine.yaml"]

+ 7 - 3
README.md

@@ -115,9 +115,13 @@ author (email address is in the git commit history).
 Development
 Development
 ===========
 ===========
 
 
-Using Docker:
-
+### Using Docker
 1. Create a new folder called `data` in the root of the repository
 1. Create a new folder called `data` in the root of the repository
 2. Download the [index file](https://storage.googleapis.com/mwmbl/index.tinysearch) and place it in the new data folder
 2. Download the [index file](https://storage.googleapis.com/mwmbl/index.tinysearch) and place it in the new data folder
 3. Run `$ docker build . -t mwmbl`
 3. Run `$ docker build . -t mwmbl`
-4. Run `$ docker run -p 8080:8080 mwmbl`
+4. Run `$ docker run -p 8080:8080 mwmbl`
+
+### Local Testing
+1. Create and activate a Python (3.9) environment using any tool you like, e.g. poetry, venv, conda, etc.
+2. Run `$ pip install .`
+3. Run `$ mwmbl-tinysearchengine --config config/tinysearchengine.yaml`

+ 12 - 0
config/tinysearchengine.yaml

@@ -0,0 +1,12 @@
+# Config for bootstrapping tinysearchengine.
+# Follows the schema/model defined by mwmbl.tinysearchengine.config.ConfigModel
+
+server_config:
+  host: "0.0.0.0"
+  port: 8080
+  log_level: "info"
+
+index_config:
+  index_path: data/index.tinysearch
+  num_pages: 25600
+  page_size: 4096

+ 42 - 5
mwmbl/tinysearchengine/app.py

@@ -1,17 +1,54 @@
 import logging
 import logging
 import sys
 import sys
-
+from typing import Optional
+import argparse
+from fastapi import FastAPI
 import uvicorn
 import uvicorn
 
 
 from mwmbl.tinysearchengine import create_app
 from mwmbl.tinysearchengine import create_app
 from mwmbl.tinysearchengine.indexer import TinyIndex, NUM_PAGES, PAGE_SIZE, Document
 from mwmbl.tinysearchengine.indexer import TinyIndex, NUM_PAGES, PAGE_SIZE, Document
+from mwmbl.tinysearchengine.config import parse_config_file
 
 
 logging.basicConfig()
 logging.basicConfig()
 
 
+app: Optional[FastAPI] = None
+
+
+def setup_args():
+    """Read all the args."""
+    parser = argparse.ArgumentParser(description="mwmbl-tinysearchengine")
+    parser.add_argument("--config", help="Path to tinysearchengine's yaml config.", required=True)
+    args = parser.parse_args()
+    return args
+
+
+def main():
+    """Main entrypoint for tinysearchengine.
+
+    * Parses CLI args
+    * Parses and validates config
+    * Initializes TinyIndex
+    * Populates global app (FastAPI) variable so that uvicorn can run the app server
+    """
+    args = setup_args()
+    config = parse_config_file(config_filename=args.config)
+
+    # Initialize TinyIndex using index config params
+    tiny_index = TinyIndex(
+        item_factory=Document,
+        **config.index_config.dict()
+    )
+
+    # Update global app variable
+    global app
+    app = create_app.create(tiny_index)
+
+    # Initialize uvicorn server using global app instance and server config params
+    uvicorn.run(
+        "mwmbl.tinysearchengine.app:app",
+        **config.server_config.dict()
+    )
 
 
-index_path = sys.argv[1]
-tiny_index = TinyIndex(Document, index_path, NUM_PAGES, PAGE_SIZE)
-app = create_app.create(tiny_index)
 
 
 if __name__ == "__main__":
 if __name__ == "__main__":
-    uvicorn.run("mwmbl.tinysearchengine.app:app", host="0.0.0.0", port=8080, log_level="info")
+    main()

+ 39 - 0
mwmbl/tinysearchengine/config.py

@@ -0,0 +1,39 @@
+import pathlib
+import yaml
+from pydantic import BaseModel, StrictInt, StrictStr, Field
+
+
+class ServerConfigModel(BaseModel):
+    host: StrictStr = "0.0.0.0"
+    port: StrictInt = 8080
+    log_level: StrictStr = "info"
+
+
+class IndexConfigModel(BaseModel):
+    index_path: StrictStr = "data/index.tinysearch"
+    num_pages: StrictInt = 25600
+    page_size: StrictInt = 4096
+
+
+class ConfigModel(BaseModel):
+    server_config: ServerConfigModel = Field(default_factory=ServerConfigModel)
+    index_config: IndexConfigModel = Field(default_factory=IndexConfigModel)
+
+
+def parse_config_file(config_filename: str) -> ConfigModel:
+    """Parse the YAML config file at config_filename and return a validated ConfigModel."""
+    if not pathlib.Path(config_filename).is_file():
+        raise ValueError(
+            f"config_filename: {config_filename} is not a file. Please check if it exists."
+        )
+
+    with open(config_filename) as f:
+        config = yaml.load(f, yaml.Loader)
+
+    return ConfigModel(**config)
+
+
+if __name__ == "__main__":
+    # Call this from the root of the repo using "python -m mwmbl.tinysearchengine.config"
+    config_model = parse_config_file(config_filename="config/tinysearchengine.yaml")
+    print(config_model.dict())

+ 44 - 1
poetry.lock

@@ -460,6 +460,14 @@ category = "main"
 optional = false
 optional = false
 python-versions = "*"
 python-versions = "*"
 
 
+[[package]]
+name = "pyyaml"
+version = "6.0"
+description = "YAML parser and emitter for Python"
+category = "main"
+optional = false
+python-versions = ">=3.6"
+
 [[package]]
 [[package]]
 name = "rapidfuzz"
 name = "rapidfuzz"
 version = "1.8.3"
 version = "1.8.3"
@@ -794,7 +802,7 @@ indexer = ["botocore", "boto3", "ujson", "warcio", "idna", "beautifulsoup4", "lx
 [metadata]
 [metadata]
 lock-version = "1.1"
 lock-version = "1.1"
 python-versions = "^3.9"
 python-versions = "^3.9"
-content-hash = "ed117b25eca3bb999bb98376c8c4e1ff1e9288a9c3bb35c8c0b1e9233ba0b5cc"
+content-hash = "b45f9def8dcadfaa6ce23560b51bdee7f81c335598f6cc84d11fd3d596e3da5b"
 
 
 [metadata.files]
 [metadata.files]
 anyio = [
 anyio = [
@@ -1359,6 +1367,41 @@ pytz = [
     {file = "pytz-2021.3-py2.py3-none-any.whl", hash = "sha256:3672058bc3453457b622aab7a1c3bfd5ab0bdae451512f6cf25f64ed37f5b87c"},
     {file = "pytz-2021.3-py2.py3-none-any.whl", hash = "sha256:3672058bc3453457b622aab7a1c3bfd5ab0bdae451512f6cf25f64ed37f5b87c"},
     {file = "pytz-2021.3.tar.gz", hash = "sha256:acad2d8b20a1af07d4e4c9d2e9285c5ed9104354062f275f3fcd88dcef4f1326"},
     {file = "pytz-2021.3.tar.gz", hash = "sha256:acad2d8b20a1af07d4e4c9d2e9285c5ed9104354062f275f3fcd88dcef4f1326"},
 ]
 ]
+pyyaml = [
+    {file = "PyYAML-6.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d4db7c7aef085872ef65a8fd7d6d09a14ae91f691dec3e87ee5ee0539d516f53"},
+    {file = "PyYAML-6.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9df7ed3b3d2e0ecfe09e14741b857df43adb5a3ddadc919a2d94fbdf78fea53c"},
+    {file = "PyYAML-6.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:77f396e6ef4c73fdc33a9157446466f1cff553d979bd00ecb64385760c6babdc"},
+    {file = "PyYAML-6.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a80a78046a72361de73f8f395f1f1e49f956c6be882eed58505a15f3e430962b"},
+    {file = "PyYAML-6.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f84fbc98b019fef2ee9a1cb3ce93e3187a6df0b2538a651bfb890254ba9f90b5"},
+    {file = "PyYAML-6.0-cp310-cp310-win32.whl", hash = "sha256:2cd5df3de48857ed0544b34e2d40e9fac445930039f3cfe4bcc592a1f836d513"},
+    {file = "PyYAML-6.0-cp310-cp310-win_amd64.whl", hash = "sha256:daf496c58a8c52083df09b80c860005194014c3698698d1a57cbcfa182142a3a"},
+    {file = "PyYAML-6.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:897b80890765f037df3403d22bab41627ca8811ae55e9a722fd0392850ec4d86"},
+    {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50602afada6d6cbfad699b0c7bb50d5ccffa7e46a3d738092afddc1f9758427f"},
+    {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:48c346915c114f5fdb3ead70312bd042a953a8ce5c7106d5bfb1a5254e47da92"},
+    {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:98c4d36e99714e55cfbaaee6dd5badbc9a1ec339ebfc3b1f52e293aee6bb71a4"},
+    {file = "PyYAML-6.0-cp36-cp36m-win32.whl", hash = "sha256:0283c35a6a9fbf047493e3a0ce8d79ef5030852c51e9d911a27badfde0605293"},
+    {file = "PyYAML-6.0-cp36-cp36m-win_amd64.whl", hash = "sha256:07751360502caac1c067a8132d150cf3d61339af5691fe9e87803040dbc5db57"},
+    {file = "PyYAML-6.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:819b3830a1543db06c4d4b865e70ded25be52a2e0631ccd2f6a47a2822f2fd7c"},
+    {file = "PyYAML-6.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:473f9edb243cb1935ab5a084eb238d842fb8f404ed2193a915d1784b5a6b5fc0"},
+    {file = "PyYAML-6.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0ce82d761c532fe4ec3f87fc45688bdd3a4c1dc5e0b4a19814b9009a29baefd4"},
+    {file = "PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:231710d57adfd809ef5d34183b8ed1eeae3f76459c18fb4a0b373ad56bedcdd9"},
+    {file = "PyYAML-6.0-cp37-cp37m-win32.whl", hash = "sha256:c5687b8d43cf58545ade1fe3e055f70eac7a5a1a0bf42824308d868289a95737"},
+    {file = "PyYAML-6.0-cp37-cp37m-win_amd64.whl", hash = "sha256:d15a181d1ecd0d4270dc32edb46f7cb7733c7c508857278d3d378d14d606db2d"},
+    {file = "PyYAML-6.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0b4624f379dab24d3725ffde76559cff63d9ec94e1736b556dacdfebe5ab6d4b"},
+    {file = "PyYAML-6.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:213c60cd50106436cc818accf5baa1aba61c0189ff610f64f4a3e8c6726218ba"},
+    {file = "PyYAML-6.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9fa600030013c4de8165339db93d182b9431076eb98eb40ee068700c9c813e34"},
+    {file = "PyYAML-6.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:277a0ef2981ca40581a47093e9e2d13b3f1fbbeffae064c1d21bfceba2030287"},
+    {file = "PyYAML-6.0-cp38-cp38-win32.whl", hash = "sha256:d4eccecf9adf6fbcc6861a38015c2a64f38b9d94838ac1810a9023a0609e1b78"},
+    {file = "PyYAML-6.0-cp38-cp38-win_amd64.whl", hash = "sha256:1e4747bc279b4f613a09eb64bba2ba602d8a6664c6ce6396a4d0cd413a50ce07"},
+    {file = "PyYAML-6.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:055d937d65826939cb044fc8c9b08889e8c743fdc6a32b33e2390f66013e449b"},
+    {file = "PyYAML-6.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e61ceaab6f49fb8bdfaa0f92c4b57bcfbea54c09277b1b4f7ac376bfb7a7c174"},
+    {file = "PyYAML-6.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d67d839ede4ed1b28a4e8909735fc992a923cdb84e618544973d7dfc71540803"},
+    {file = "PyYAML-6.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cba8c411ef271aa037d7357a2bc8f9ee8b58b9965831d9e51baf703280dc73d3"},
+    {file = "PyYAML-6.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:40527857252b61eacd1d9af500c3337ba8deb8fc298940291486c465c8b46ec0"},
+    {file = "PyYAML-6.0-cp39-cp39-win32.whl", hash = "sha256:b5b9eccad747aabaaffbc6064800670f0c297e52c12754eb1d976c57e4f74dcb"},
+    {file = "PyYAML-6.0-cp39-cp39-win_amd64.whl", hash = "sha256:b3d267842bf12586ba6c734f89d1f5b871df0273157918b0ccefa29deb05c21c"},
+    {file = "PyYAML-6.0.tar.gz", hash = "sha256:68fb519c14306fec9720a2a5b45bc9f0c8d1b9c72adf45c37baedfcd949c35a2"},
+]
 rapidfuzz = [
 rapidfuzz = [
     {file = "rapidfuzz-1.8.3-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:0aa566e46bf1bf8e98e7a009fb0119c6601aece029af2e9566cfdf7662526c20"},
     {file = "rapidfuzz-1.8.3-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:0aa566e46bf1bf8e98e7a009fb0119c6601aece029af2e9566cfdf7662526c20"},
     {file = "rapidfuzz-1.8.3-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:6854b2399fa39dbf480a55fe359e1012590b29e683035645dd8d56c8d367ca9b"},
     {file = "rapidfuzz-1.8.3-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:6854b2399fa39dbf480a55fe359e1012590b29e683035645dd8d56c8d367ca9b"},

+ 5 - 0
pyproject.toml

@@ -11,6 +11,8 @@ zstandard = "^0.16.0"
 mmh3 = "^3.0.0"
 mmh3 = "^3.0.0"
 fastapi = "^0.70.1"
 fastapi = "^0.70.1"
 uvicorn = "^0.16.0"
 uvicorn = "^0.16.0"
+numpy = "==1.21.1"
+pyyaml = "==6.0"
 # Optional dependencies do not get installed by default. Look under tool.poetry.extras section
 # Optional dependencies do not get installed by default. Look under tool.poetry.extras section
 # to see which extras to use.
 # to see which extras to use.
 botocore = {version= "==1.23.20", optional = true}
 botocore = {version= "==1.23.20", optional = true}
@@ -66,3 +68,6 @@ indexer = [
 [build-system]
 [build-system]
 requires = ["poetry-core>=1.0.0"]
 requires = ["poetry-core>=1.0.0"]
 build-backend = "poetry.core.masonry.api"
 build-backend = "poetry.core.masonry.api"
+
+[tool.poetry.scripts]
+mwmbl-tinysearchengine = "mwmbl.tinysearchengine.app:main"