Get Dockerfile working

This commit is contained in:
Daoud Clarke 2021-12-23 21:30:51 +00:00
parent 9c65bf3c8f
commit 7e520fb32f
8 changed files with 7885 additions and 11 deletions

7
.dockerignore Normal file
View file

@@ -0,0 +1,7 @@
Dockerfile
README.md
*.pyc
*.pyo
*.pyd
__pycache__
.pytest_cache

1
.gitignore vendored
View file

@@ -1,2 +1,3 @@
./data
.idea
*~

View file

@@ -27,7 +27,8 @@ FROM base as final
#RUN apk add --no-cache libffi libpq
COPY --from=builder /venv /venv
COPY data /data
#COPY docker-entrypoint.sh wsgi.py ./
#CMD ["./docker-entrypoint.sh"]
CMD ["/venv/bin/python", "-m", "tinysearchengine.app"]
CMD ["/venv/bin/python", "-m", "tinysearchengine.app", "/data/index.tinysearch"]

7861
hn-top-domains-filtered.py Normal file

File diff suppressed because it is too large. Load diff

View file

@@ -9,13 +9,13 @@ COMMON_CRAWL_TERMS_PATH = DATA_DIR / 'common-craw-terms.csv'
HN_TOP_PATH = os.path.join(DATA_DIR, 'hn-top.csv')
CRAWL_PREFIX = 'crawl_'
CRAWL_GLOB = os.path.join(DATA_DIR, f"{CRAWL_PREFIX}*")
INDEX_PATH = os.path.join(DATA_DIR, 'index.tinysearch')
TEST_INDEX_PATH = os.path.join(DATA_DIR, 'index-test.tinysearch')
TEST_TERMS_PATH = os.path.join(DATA_DIR, 'index-terms.csv')
WIKI_DATA_PATH = os.path.join(DATA_DIR, 'enwiki-20210301-pages-articles1.xml-p1p41242.bz2')
WIKI_TITLES_PATH = os.path.join(DATA_DIR, 'abstract-titles-sorted.txt.gz')
DOMAINS_QUEUE_NAME = 'domains-queue-fs'
DOMAINS_TITLES_QUEUE_NAME = 'domains-title-queue-fs'
DOMAINS_PATH = os.path.join(DATA_DIR, 'top10milliondomains.csv.gz')
INDEX_PATH = Path(__file__).parent / 'data' / 'index.tinysearch'

View file

@@ -1,17 +1,17 @@
import logging
import sys
import uvicorn
from tinysearchengine import create_app
from tinysearchengine.indexer import TinyIndex, NUM_PAGES, PAGE_SIZE, Document
from paths import INDEX_PATH
tiny_index = TinyIndex(Document, INDEX_PATH, NUM_PAGES, PAGE_SIZE)
app = create_app.create(tiny_index)
logging.basicConfig()
index_path = sys.argv[1]
tiny_index = TinyIndex(Document, index_path, NUM_PAGES, PAGE_SIZE)
app = create_app.create(tiny_index)
if __name__ == "__main__":
uvicorn.run("tinysearchengine.app:app", host="127.0.0.1", port=8080, log_level="info", reload=True)
uvicorn.run("tinysearchengine.app:app", host="127.0.0.1", port=8080, log_level="info")

View file

@@ -1,6 +1,7 @@
import re
from logging import getLogger
from operator import itemgetter
from pathlib import Path
from fastapi import FastAPI
from starlette.responses import FileResponse
@@ -11,6 +12,7 @@ from tinysearchengine.indexer import TinyIndex, Document
logger = getLogger(__name__)
STATIC_FILES_PATH = Path(__file__).parent / 'static'
SCORE_THRESHOLD = 0.25
@@ -107,7 +109,7 @@ def create(tiny_index: TinyIndex):
@app.get('/')
def index():
return FileResponse('tinysearchengine/static/index.html')
return FileResponse(STATIC_FILES_PATH / 'index.html')
app.mount('/', StaticFiles(directory="tinysearchengine/static"), name="static")
app.mount('/', StaticFiles(directory=STATIC_FILES_PATH), name="static")
return app

View file

@@ -2,11 +2,13 @@ import json
import os
from dataclasses import astuple, dataclass
from mmap import mmap, PROT_READ
from pathlib import Path
from typing import TypeVar, Generic, Callable, List
import mmh3
from zstandard import ZstdDecompressor
NUM_PAGES = 25600
PAGE_SIZE = 4096