Get Dockerfile working
This commit is contained in:
parent
9c65bf3c8f
commit
7e520fb32f
8 changed files with 7885 additions and 11 deletions
7
.dockerignore
Normal file
7
.dockerignore
Normal file
|
@ -0,0 +1,7 @@
|
||||||
|
Dockerfile
|
||||||
|
README.md
|
||||||
|
*.pyc
|
||||||
|
*.pyo
|
||||||
|
*.pyd
|
||||||
|
__pycache__
|
||||||
|
.pytest_cache
|
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -1,2 +1,3 @@
|
||||||
|
./data
|
||||||
.idea
|
.idea
|
||||||
*~
|
*~
|
||||||
|
|
|
@ -27,7 +27,8 @@ FROM base as final
|
||||||
|
|
||||||
#RUN apk add --no-cache libffi libpq
|
#RUN apk add --no-cache libffi libpq
|
||||||
COPY --from=builder /venv /venv
|
COPY --from=builder /venv /venv
|
||||||
|
COPY data /data
|
||||||
#COPY docker-entrypoint.sh wsgi.py ./
|
#COPY docker-entrypoint.sh wsgi.py ./
|
||||||
#CMD ["./docker-entrypoint.sh"]
|
#CMD ["./docker-entrypoint.sh"]
|
||||||
|
|
||||||
CMD ["/venv/bin/python", "-m", "tinysearchengine.app"]
|
CMD ["/venv/bin/python", "-m", "tinysearchengine.app", "/data/index.tinysearch"]
|
||||||
|
|
7861
hn-top-domains-filtered.py
Normal file
7861
hn-top-domains-filtered.py
Normal file
File diff suppressed because it is too large
Load diff
4
paths.py
4
paths.py
|
@ -9,13 +9,13 @@ COMMON_CRAWL_TERMS_PATH = DATA_DIR / 'common-craw-terms.csv'
|
||||||
HN_TOP_PATH = os.path.join(DATA_DIR, 'hn-top.csv')
|
HN_TOP_PATH = os.path.join(DATA_DIR, 'hn-top.csv')
|
||||||
CRAWL_PREFIX = 'crawl_'
|
CRAWL_PREFIX = 'crawl_'
|
||||||
CRAWL_GLOB = os.path.join(DATA_DIR, f"{CRAWL_PREFIX}*")
|
CRAWL_GLOB = os.path.join(DATA_DIR, f"{CRAWL_PREFIX}*")
|
||||||
INDEX_PATH = os.path.join(DATA_DIR, 'index.tinysearch')
|
|
||||||
TEST_INDEX_PATH = os.path.join(DATA_DIR, 'index-test.tinysearch')
|
TEST_INDEX_PATH = os.path.join(DATA_DIR, 'index-test.tinysearch')
|
||||||
TEST_TERMS_PATH = os.path.join(DATA_DIR, 'index-terms.csv')
|
TEST_TERMS_PATH = os.path.join(DATA_DIR, 'index-terms.csv')
|
||||||
WIKI_DATA_PATH = os.path.join(DATA_DIR, 'enwiki-20210301-pages-articles1.xml-p1p41242.bz2')
|
WIKI_DATA_PATH = os.path.join(DATA_DIR, 'enwiki-20210301-pages-articles1.xml-p1p41242.bz2')
|
||||||
WIKI_TITLES_PATH = os.path.join(DATA_DIR, 'abstract-titles-sorted.txt.gz')
|
WIKI_TITLES_PATH = os.path.join(DATA_DIR, 'abstract-titles-sorted.txt.gz')
|
||||||
|
|
||||||
|
|
||||||
DOMAINS_QUEUE_NAME = 'domains-queue-fs'
|
DOMAINS_QUEUE_NAME = 'domains-queue-fs'
|
||||||
DOMAINS_TITLES_QUEUE_NAME = 'domains-title-queue-fs'
|
DOMAINS_TITLES_QUEUE_NAME = 'domains-title-queue-fs'
|
||||||
DOMAINS_PATH = os.path.join(DATA_DIR, 'top10milliondomains.csv.gz')
|
DOMAINS_PATH = os.path.join(DATA_DIR, 'top10milliondomains.csv.gz')
|
||||||
|
|
||||||
|
INDEX_PATH = Path(__file__).parent / 'data' / 'index.tinysearch'
|
||||||
|
|
|
@ -1,17 +1,17 @@
|
||||||
import logging
|
import logging
|
||||||
|
import sys
|
||||||
|
|
||||||
import uvicorn
|
import uvicorn
|
||||||
|
|
||||||
from tinysearchengine import create_app
|
from tinysearchengine import create_app
|
||||||
|
|
||||||
from tinysearchengine.indexer import TinyIndex, NUM_PAGES, PAGE_SIZE, Document
|
from tinysearchengine.indexer import TinyIndex, NUM_PAGES, PAGE_SIZE, Document
|
||||||
from paths import INDEX_PATH
|
|
||||||
|
|
||||||
tiny_index = TinyIndex(Document, INDEX_PATH, NUM_PAGES, PAGE_SIZE)
|
|
||||||
app = create_app.create(tiny_index)
|
|
||||||
|
|
||||||
logging.basicConfig()
|
logging.basicConfig()
|
||||||
|
|
||||||
|
|
||||||
|
index_path = sys.argv[1]
|
||||||
|
tiny_index = TinyIndex(Document, index_path, NUM_PAGES, PAGE_SIZE)
|
||||||
|
app = create_app.create(tiny_index)
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
uvicorn.run("tinysearchengine.app:app", host="127.0.0.1", port=8080, log_level="info", reload=True)
|
uvicorn.run("tinysearchengine.app:app", host="127.0.0.1", port=8080, log_level="info")
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
import re
|
import re
|
||||||
from logging import getLogger
|
from logging import getLogger
|
||||||
from operator import itemgetter
|
from operator import itemgetter
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
from fastapi import FastAPI
|
from fastapi import FastAPI
|
||||||
from starlette.responses import FileResponse
|
from starlette.responses import FileResponse
|
||||||
|
@ -11,6 +12,7 @@ from tinysearchengine.indexer import TinyIndex, Document
|
||||||
logger = getLogger(__name__)
|
logger = getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
STATIC_FILES_PATH = Path(__file__).parent / 'static'
|
||||||
SCORE_THRESHOLD = 0.25
|
SCORE_THRESHOLD = 0.25
|
||||||
|
|
||||||
|
|
||||||
|
@ -107,7 +109,7 @@ def create(tiny_index: TinyIndex):
|
||||||
|
|
||||||
@app.get('/')
|
@app.get('/')
|
||||||
def index():
|
def index():
|
||||||
return FileResponse('tinysearchengine/static/index.html')
|
return FileResponse(STATIC_FILES_PATH / 'index.html')
|
||||||
|
|
||||||
app.mount('/', StaticFiles(directory="tinysearchengine/static"), name="static")
|
app.mount('/', StaticFiles(directory=STATIC_FILES_PATH), name="static")
|
||||||
return app
|
return app
|
||||||
|
|
|
@ -2,11 +2,13 @@ import json
|
||||||
import os
|
import os
|
||||||
from dataclasses import astuple, dataclass
|
from dataclasses import astuple, dataclass
|
||||||
from mmap import mmap, PROT_READ
|
from mmap import mmap, PROT_READ
|
||||||
|
from pathlib import Path
|
||||||
from typing import TypeVar, Generic, Callable, List
|
from typing import TypeVar, Generic, Callable, List
|
||||||
|
|
||||||
import mmh3
|
import mmh3
|
||||||
from zstandard import ZstdDecompressor
|
from zstandard import ZstdDecompressor
|
||||||
|
|
||||||
|
|
||||||
NUM_PAGES = 25600
|
NUM_PAGES = 25600
|
||||||
PAGE_SIZE = 4096
|
PAGE_SIZE = 4096
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue