Get Dockerfile working
parent 9c65bf3c8f
commit 7e520fb32f
8 changed files with 7885 additions and 11 deletions
.dockerignore (new file, +7)

```diff
@@ -0,0 +1,7 @@
+Dockerfile
+README.md
+*.pyc
+*.pyo
+*.pyd
+__pycache__
+.pytest_cache
```
.gitignore (vendored, +1)

```diff
@@ -1,2 +1,3 @@
+./data
 .idea
 *~
```
Dockerfile

```diff
@@ -27,7 +27,8 @@ FROM base as final
 #RUN apk add --no-cache libffi libpq
 COPY --from=builder /venv /venv
+COPY data /data
 #COPY docker-entrypoint.sh wsgi.py ./
 #CMD ["./docker-entrypoint.sh"]
 
-CMD ["/venv/bin/python", "-m", "tinysearchengine.app"]
+CMD ["/venv/bin/python", "-m", "tinysearchengine.app", "/data/index.tinysearch"]
 
```
hn-top-domains-filtered.py (new file, +7861)

File diff suppressed because it is too large.
paths.py (4 changes)

```diff
@@ -9,13 +9,13 @@ COMMON_CRAWL_TERMS_PATH = DATA_DIR / 'common-craw-terms.csv'
 HN_TOP_PATH = os.path.join(DATA_DIR, 'hn-top.csv')
 CRAWL_PREFIX = 'crawl_'
 CRAWL_GLOB = os.path.join(DATA_DIR, f"{CRAWL_PREFIX}*")
-INDEX_PATH = os.path.join(DATA_DIR, 'index.tinysearch')
 TEST_INDEX_PATH = os.path.join(DATA_DIR, 'index-test.tinysearch')
 TEST_TERMS_PATH = os.path.join(DATA_DIR, 'index-terms.csv')
 WIKI_DATA_PATH = os.path.join(DATA_DIR, 'enwiki-20210301-pages-articles1.xml-p1p41242.bz2')
 WIKI_TITLES_PATH = os.path.join(DATA_DIR, 'abstract-titles-sorted.txt.gz')
 
-
 DOMAINS_QUEUE_NAME = 'domains-queue-fs'
 DOMAINS_TITLES_QUEUE_NAME = 'domains-title-queue-fs'
 DOMAINS_PATH = os.path.join(DATA_DIR, 'top10milliondomains.csv.gz')
+
+INDEX_PATH = Path(__file__).parent / 'data' / 'index.tinysearch'
```
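The dropped INDEX_PATH pointed into DATA_DIR, while the replacement resolves relative to paths.py itself, so the index is expected at data/index.tinysearch inside the checkout, which lines up with the Dockerfile's `COPY data /data`. A minimal sketch of the difference, assuming paths.py sits at the repository root; the DATA_DIR stand-in below is illustrative, since its real definition is not shown in this diff:

```python
import os
from pathlib import Path

# Illustrative stand-in for the DATA_DIR defined earlier in paths.py
# (assumed here to live under the user's home directory).
DATA_DIR = Path(os.environ.get("HOME", "/root")) / "tinysearch-data"

OLD_INDEX_PATH = os.path.join(DATA_DIR, "index.tinysearch")            # machine-specific location
NEW_INDEX_PATH = Path(__file__).parent / "data" / "index.tinysearch"   # relative to the checkout

print(OLD_INDEX_PATH)  # e.g. /home/user/tinysearch-data/index.tinysearch
print(NEW_INDEX_PATH)  # e.g. <repo>/data/index.tinysearch
```

Anchoring the path to the repository lets the same data/ directory be built locally and copied into the image without per-machine configuration.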
tinysearchengine/app.py

```diff
@@ -1,17 +1,17 @@
 import logging
+import sys
 
 import uvicorn
 
 from tinysearchengine import create_app
-
 from tinysearchengine.indexer import TinyIndex, NUM_PAGES, PAGE_SIZE, Document
-from paths import INDEX_PATH
-
-tiny_index = TinyIndex(Document, INDEX_PATH, NUM_PAGES, PAGE_SIZE)
-app = create_app.create(tiny_index)
-
 logging.basicConfig()
 
+
+index_path = sys.argv[1]
+tiny_index = TinyIndex(Document, index_path, NUM_PAGES, PAGE_SIZE)
+app = create_app.create(tiny_index)
 
 if __name__ == "__main__":
-    uvicorn.run("tinysearchengine.app:app", host="127.0.0.1", port=8080, log_level="info", reload=True)
+    uvicorn.run("tinysearchengine.app:app", host="127.0.0.1", port=8080, log_level="info")
+
```
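app.py no longer imports INDEX_PATH from paths.py; the index location now comes from the command line, which is what allows the Dockerfile CMD to pass /data/index.tinysearch. A minimal sketch of how the argument reaches the module; the usage guard is an illustrative addition, not part of the commit:

```python
# Locally:        python -m tinysearchengine.app data/index.tinysearch
# In the image:   /venv/bin/python -m tinysearchengine.app /data/index.tinysearch
import sys

if len(sys.argv) < 2:
    # Hypothetical guard, not in the commit: fail fast with a usage message.
    sys.exit("usage: python -m tinysearchengine.app <path/to/index.tinysearch>")

index_path = sys.argv[1]  # '/data/index.tinysearch' when started by the container CMD
print(f"serving index from {index_path}")
```

Dropping reload=True from uvicorn.run also fits a container entrypoint, since auto-reload is only useful during local development.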
tinysearchengine/create_app.py

```diff
@@ -1,6 +1,7 @@
 import re
 from logging import getLogger
 from operator import itemgetter
+from pathlib import Path
 
 from fastapi import FastAPI
 from starlette.responses import FileResponse
@@ -11,6 +12,7 @@ from tinysearchengine.indexer import TinyIndex, Document
 logger = getLogger(__name__)
 
 
+STATIC_FILES_PATH = Path(__file__).parent / 'static'
 SCORE_THRESHOLD = 0.25
 
 
@@ -107,7 +109,7 @@ def create(tiny_index: TinyIndex):
 
     @app.get('/')
     def index():
-        return FileResponse('tinysearchengine/static/index.html')
+        return FileResponse(STATIC_FILES_PATH / 'index.html')
 
-    app.mount('/', StaticFiles(directory="tinysearchengine/static"), name="static")
+    app.mount('/', StaticFiles(directory=STATIC_FILES_PATH), name="static")
     return app
```
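Both hunks replace working-directory-relative paths ('tinysearchengine/static/...') with STATIC_FILES_PATH, anchored at the module's own location, so the static files are found wherever the package is installed and whatever directory the server is started from. A minimal sketch of the same pattern, assuming only the names visible here; the search routes inside create() are omitted:

```python
from pathlib import Path

from fastapi import FastAPI
from starlette.responses import FileResponse
from starlette.staticfiles import StaticFiles

# Resolved next to this module, not relative to the current working directory.
STATIC_FILES_PATH = Path(__file__).parent / 'static'


def create() -> FastAPI:
    # Simplified stand-in for create(tiny_index); the search endpoints are omitted.
    app = FastAPI()

    @app.get('/')
    def index():
        return FileResponse(STATIC_FILES_PATH / 'index.html')

    # Registered last, so explicit routes win and everything else
    # falls through to the static directory.
    app.mount('/', StaticFiles(directory=STATIC_FILES_PATH), name="static")
    return app
```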
tinysearchengine/indexer.py

```
@@ -2,11 +2,13 @@ import json
import os
from dataclasses import astuple, dataclass
from mmap import mmap, PROT_READ
from pathlib import Path
from typing import TypeVar, Generic, Callable, List

import mmh3
from zstandard import ZstdDecompressor


NUM_PAGES = 25600
PAGE_SIZE = 4096

```
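The imports and constants in this hunk outline how TinyIndex stores data: NUM_PAGES fixed-size pages of PAGE_SIZE bytes, memory-mapped read-only and decompressed with zstandard, with mmh3 presumably hashing terms to page numbers. The TinyIndex implementation itself is not part of this diff; the following is only a sketch of how one page of such a file could be read, and the page layout (a length prefix followed by a zstd frame of JSON) is an assumption, not taken from the repo:

```python
import json
from mmap import mmap, PROT_READ
from struct import unpack
from zstandard import ZstdDecompressor

NUM_PAGES = 25600
PAGE_SIZE = 4096


def read_page(index_path: str, page_index: int) -> list:
    """Illustrative reader for a paged, zstd-compressed index.

    Assumed page layout (not taken from the commit): a 4-byte little-endian
    length, then that many bytes of zstd-compressed JSON, zero-padded to PAGE_SIZE.
    """
    assert 0 <= page_index < NUM_PAGES
    with open(index_path, 'rb') as index_file:
        # Map the whole file read-only; pages can then be sliced without reading it all.
        data = mmap(index_file.fileno(), 0, prot=PROT_READ)
    page = data[page_index * PAGE_SIZE:(page_index + 1) * PAGE_SIZE]
    (compressed_length,) = unpack('<I', page[:4])
    decompressed = ZstdDecompressor().decompress(page[4:4 + compressed_length])
    return json.loads(decompressed)
```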