WIP: implement docker image. TODO: copy index and set the correct index path using env var

Daoud Clarke 2021-12-22 23:21:23 +00:00
parent f754b38f71
commit 9c65bf3c8f
25 changed files with 282 additions and 1241 deletions

Dockerfile (new file, +33)

@@ -0,0 +1,33 @@
FROM python:3.9-slim-bullseye as base

ENV PYTHONFAULTHANDLER=1 \
    PYTHONHASHSEED=random \
    PYTHONUNBUFFERED=1

WORKDIR /app

FROM base as builder

ENV PIP_DEFAULT_TIMEOUT=100 \
    PIP_DISABLE_PIP_VERSION_CHECK=1 \
    PIP_NO_CACHE_DIR=1 \
    POETRY_VERSION=1.1.12

# RUN apk add --no-cache gcc libffi-dev musl-dev postgresql-dev
RUN pip install "poetry==$POETRY_VERSION"
RUN python -m venv /venv

COPY pyproject.toml poetry.lock ./
RUN poetry export -f requirements.txt | /venv/bin/pip install -r /dev/stdin

COPY . .
RUN poetry build && /venv/bin/pip install dist/*.whl

FROM base as final

#RUN apk add --no-cache libffi libpq
COPY --from=builder /venv /venv

#COPY docker-entrypoint.sh wsgi.py ./
#CMD ["./docker-entrypoint.sh"]
CMD ["/venv/bin/python", "-m", "tinysearchengine.app"]
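The TODO in the commit title (copy the index into the image and point the app at it via an environment variable) is not implemented in this Dockerfile. A minimal sketch of the application side, assuming a hypothetical variable name and default path (neither appears in this commit):

import os

# Hypothetical: the variable name and default are illustrative only.
# The Dockerfile counterpart would be a COPY of the index file plus a
# matching ENV line in the final stage.
INDEX_PATH = os.environ.get('TINYSEARCH_INDEX_PATH', '/app/index.tinysearch')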

README.md (new file, +4)

@@ -0,0 +1,4 @@
Tiny Search Engine
==================
TBD

@@ -1,4 +1,4 @@
-from index import TinyIndex, Document, NUM_PAGES, PAGE_SIZE
+from tinysearchengine.indexer import TinyIndex, NUM_PAGES, PAGE_SIZE, Document
 from paths import INDEX_PATH

app.py (deleted, -17)

@@ -1,17 +0,0 @@
-import logging
-
-import uvicorn
-
-import create_app
-from index import TinyIndex, PAGE_SIZE, NUM_PAGES, Document
-from paths import INDEX_PATH
-
-tiny_index = TinyIndex(Document, INDEX_PATH, NUM_PAGES, PAGE_SIZE)
-app = create_app.create(tiny_index)
-
-logging.basicConfig()
-
-if __name__ == "__main__":
-    uvicorn.run("app:app", host="127.0.0.1", port=8000, log_level="info", reload=True)

index.py (141 changed lines)

@@ -1,26 +1,16 @@
 """
 Create a search index
 """
-import json
-import os
-from abc import ABC, abstractmethod
 from collections import Counter
-from dataclasses import dataclass, fields, asdict, astuple
 from itertools import islice
-from mmap import mmap, PROT_READ
-from typing import List, Iterator, TypeVar, Generic, Iterable, Callable
+from typing import Iterator, Iterable
 from urllib.parse import unquote
 
-import justext
-import mmh3
 import pandas as pd
-from zstandard import ZstdCompressor, ZstdDecompressor, ZstdError
 
 # NUM_PAGES = 8192
 # PAGE_SIZE = 512
-NUM_PAGES = 25600
-PAGE_SIZE = 4096
+from tinysearchengine.indexer import TinyIndexer, Document, TokenizedDocument
 
 NUM_INITIAL_TOKENS = 50
@@ -42,133 +32,6 @@ def tokenize(nlp, cleaned_text):
     return lowered
 
 
-def clean(content):
-    text = justext.justext(content, justext.get_stoplist("English"))
-    pars = [par.text for par in text if not par.is_boilerplate]
-    cleaned_text = ' '.join(pars)
-    return cleaned_text
-
-
-@dataclass
-class Document:
-    title: str
-    url: str
-    extract: str
-
-
-@dataclass
-class TokenizedDocument(Document):
-    tokens: List[str]
-
-
-T = TypeVar('T')
-
-
-class TinyIndexBase(Generic[T]):
-    def __init__(self, item_factory: Callable[..., T], num_pages: int, page_size: int):
-        self.item_factory = item_factory
-        self.num_pages = num_pages
-        self.page_size = page_size
-        self.decompressor = ZstdDecompressor()
-        self.mmap = None
-
-    def retrieve(self, key: str) -> List[T]:
-        index = self._get_key_page_index(key)
-        page = self.get_page(index)
-        if page is None:
-            return []
-        # print("Retrieve", self.index_path, page)
-        return self.convert_items(page)
-
-    def _get_key_page_index(self, key):
-        key_hash = mmh3.hash(key, signed=False)
-        return key_hash % self.num_pages
-
-    def get_page(self, i):
-        """
-        Get the page at index i, decompress and deserialise it using JSON
-        """
-        page_data = self.mmap[i * self.page_size:(i + 1) * self.page_size]
-        zeros = page_data.count(b'\x00\x00\x00\x00') * 4
-        try:
-            decompressed_data = self.decompressor.decompress(page_data)
-        except ZstdError:
-            return None
-        results = json.loads(decompressed_data.decode('utf8'))
-        # print(f"Num results: {len(results)}, num zeros: {zeros}")
-        return results
-
-    def convert_items(self, items) -> List[T]:
-        converted = [self.item_factory(*item) for item in items]
-        # print("Converted", items, converted)
-        return converted
-
-
-class TinyIndex(TinyIndexBase[T]):
-    def __init__(self, item_factory: Callable[..., T], index_path, num_pages, page_size):
-        super().__init__(item_factory, num_pages, page_size)
-        self.index_path = index_path
-        self.index_file = open(self.index_path, 'rb')
-        self.mmap = mmap(self.index_file.fileno(), 0, prot=PROT_READ)
-
-
-class TinyIndexer(TinyIndexBase[T]):
-    def __init__(self, item_factory: Callable[..., T], index_path: str, num_pages: int, page_size: int):
-        super().__init__(item_factory, num_pages, page_size)
-        self.index_path = index_path
-        self.compressor = ZstdCompressor()
-        self.decompressor = ZstdDecompressor()
-        self.index_file = None
-        self.mmap = None
-
-    def __enter__(self):
-        self.create_if_not_exists()
-        self.index_file = open(self.index_path, 'r+b')
-        self.mmap = mmap(self.index_file.fileno(), 0)
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        self.mmap.close()
-        self.index_file.close()
-
-    def index(self, key: str, value: T):
-        # print("Index", value)
-        assert type(value) == self.item_factory, f"Can only index the specified type" \
-                                                 f" ({self.item_factory.__name__})"
-        page_index = self._get_key_page_index(key)
-        current_page = self.get_page(page_index)
-        if current_page is None:
-            current_page = []
-        value_tuple = astuple(value)
-        # print("Value tuple", value_tuple)
-        current_page.append(value_tuple)
-        try:
-            # print("Page", current_page)
-            self._write_page(current_page, page_index)
-        except ValueError:
-            pass
-
-    def _write_page(self, data, i):
-        """
-        Serialise the data using JSON, compress it and store it at index i.
-        If the data is too big, it will raise a ValueError and not store anything
-        """
-        serialised_data = json.dumps(data)
-        compressed_data = self.compressor.compress(serialised_data.encode('utf8'))
-        page_length = len(compressed_data)
-        if page_length > self.page_size:
-            raise ValueError(f"Data is too big ({page_length}) for page size ({self.page_size})")
-        padding = b'\x00' * (self.page_size - page_length)
-        self.mmap[i * self.page_size:(i+1) * self.page_size] = compressed_data + padding
-
-    def create_if_not_exists(self):
-        if not os.path.isfile(self.index_path):
-            file_length = self.num_pages * self.page_size
-            with open(self.index_path, 'wb') as index_file:
-                index_file.write(b'\x00' * file_length)
-
-
 def prepare_url_for_tokenizing(url: str):
     if url.startswith(HTTP_START):
         url = url[len(HTTP_START):]
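For reference, the prepare_url_for_tokenizing helper that survives in index.py strips the scheme prefix before tokenizing. A self-contained sketch of the two visible lines, assuming HTTP_START is the 'http://' prefix (its definition sits outside this hunk):

HTTP_START = 'http://'  # assumed value; defined elsewhere in index.py

def strip_http_prefix(url: str) -> str:
    # Mirrors the visible prefix-stripping in prepare_url_for_tokenizing
    if url.startswith(HTTP_START):
        url = url[len(HTTP_START):]
    return url

assert strip_http_prefix('http://example.com/page') == 'example.com/page'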

@@ -4,7 +4,8 @@ from glob import glob
 
 import bs4
 from spacy.lang.en import English
 
-from index import TinyIndexer, NUM_PAGES, PAGE_SIZE, clean, tokenize
+from index import tokenize
+from tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE
 from paths import INDEX_PATH, CRAWL_GLOB
@@ -36,3 +37,10 @@ def run():
 
 if __name__ == '__main__':
     run()
+
+
+def clean(content):
+    text = justext.justext(content, justext.get_stoplist("English"))
+    pars = [par.text for par in text if not par.is_boilerplate]
+    cleaned_text = ' '.join(pars)
+    return cleaned_text

@@ -4,7 +4,8 @@ Index items in the file-system queue
 
 from spacy.lang.en import English
 
 from fsqueue import FSQueue, ZstdJsonSerializer
-from index import TinyIndexer, NUM_PAGES, PAGE_SIZE, index_titles_urls_and_extracts
+from index import index_titles_urls_and_extracts
+from tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE
 from paths import DATA_DIR, DOMAINS_TITLES_QUEUE_NAME, INDEX_PATH

@@ -8,7 +8,8 @@ from logging import getLogger
 
 import spacy
 
 from fsqueue import FSQueue, GzipJsonRowSerializer, FSQueueError
-from index import TinyIndexer, index_titles_urls_and_extracts, PAGE_SIZE, NUM_PAGES, Document
+from index import index_titles_urls_and_extracts
+from tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE, Document
 from paths import INDEX_PATH, DATA_DIR, COMMON_CRAWL_TERMS_PATH

@@ -8,9 +8,10 @@ import numpy as np
 from spacy.lang.en import English
 from starlette.testclient import TestClient
 
-import create_app
+from tinysearchengine import create_app
 from fsqueue import ZstdJsonSerializer
-from index import TinyIndexer, index_titles_urls_and_extracts, Document, TinyIndex
+from index import index_titles_urls_and_extracts
+from tinysearchengine.indexer import TinyIndex, TinyIndexer, Document
 from paths import TEST_INDEX_PATH, DATA_DIR, TEST_TERMS_PATH
 
 NUM_DOCUMENTS = 30000

poetry.lock (generated, 1096 changed lines)

File diff suppressed because it is too large.

pyproject.toml

@@ -6,26 +6,26 @@ authors = ["Daoud Clarke <daoud.clarke@gmail.com>"]
 
 [tool.poetry.dependencies]
 python = "^3.9"
-botocore = "^1.23.20"
+# botocore = "^1.23.20"
-boto3 = "^1.20.20"
+# boto3 = "^1.20.20"
-ujson = "^4.3.0"
+# ujson = "^4.3.0"
-warcio = "^1.7.4"
+# warcio = "^1.7.4"
-idna = "^3.3"
+# idna = "^3.3"
-beautifulsoup4 = "^4.10.0"
+# beautifulsoup4 = "^4.10.0"
-lxml = "^4.6.4"
+# lxml = "^4.6.4"
-jusText = "^3.0.0"
+# jusText = "^3.0.0"
 pandas = "^1.3.4"
-pyspark = "^3.2.0"
+# pyspark = "^3.2.0"
-langdetect = "^1.0.9"
+# langdetect = "^1.0.9"
 zstandard = "^0.16.0"
-spacy = "^3.2.1"
+# spacy = "^3.2.1"
 mmh3 = "^3.0.0"
 fastapi = "^0.70.1"
-Levenshtein = "^0.16.0"
+# Levenshtein = "^0.16.0"
 uvicorn = "^0.16.0"
 
-[tool.poetry.dependencies.en_core_web_sm]
-url= "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl"
+# [tool.poetry.dependencies.en_core_web_sm]
+# url= "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl"
 
 [tool.poetry.dev-dependencies]

setup.cfg (new file, +24)

@@ -0,0 +1,24 @@
[metadata]
name = tiny-search-engine-daoudc
version = 0.0.1
author = Daoud Clarke
author_email = daoud.clarke@gmail.com
description = Tiny Search Engine
long_description = file: README.md
long_description_content_type = text/markdown
# url = https://github.com/pypa/sampleproject
# project_urls =
#     Bug Tracker = https://github.com/pypa/sampleproject/issues
# classifiers =
#     Programming Language :: Python :: 3
#     License :: OSI Approved :: MIT License
#     Operating System :: OS Independent

[options]
package_dir =
    = src
packages = find:
python_requires = >=3.9

[options.packages.find]
where = src

tinysearchengine/app.py (new file, +17)

@@ -0,0 +1,17 @@
import logging

import uvicorn

from tinysearchengine import create_app
from tinysearchengine.indexer import TinyIndex, NUM_PAGES, PAGE_SIZE, Document
from paths import INDEX_PATH

tiny_index = TinyIndex(Document, INDEX_PATH, NUM_PAGES, PAGE_SIZE)
app = create_app.create(tiny_index)

logging.basicConfig()

if __name__ == "__main__":
    uvicorn.run("tinysearchengine.app:app", host="127.0.0.1", port=8080, log_level="info", reload=True)

@@ -6,7 +6,7 @@ from fastapi import FastAPI
 from starlette.responses import FileResponse
 from starlette.staticfiles import StaticFiles
 
-from index import TinyIndex, Document
+from tinysearchengine.indexer import TinyIndex, Document
 
 logger = getLogger(__name__)
@@ -107,7 +107,7 @@ def create(tiny_index: TinyIndex):
     @app.get('/')
     def index():
-        return FileResponse('static/index.html')
+        return FileResponse('tinysearchengine/static/index.html')
 
-    app.mount('/', StaticFiles(directory="static"), name="static")
+    app.mount('/', StaticFiles(directory="tinysearchengine/static"), name="static")
 
     return app

tinysearchengine/indexer.py (new file, +131)

@@ -0,0 +1,131 @@
import json
import os
from dataclasses import astuple, dataclass
from mmap import mmap, PROT_READ
from typing import TypeVar, Generic, Callable, List

import mmh3
# ZstdCompressor and ZstdError are used below and must be imported too
from zstandard import ZstdCompressor, ZstdDecompressor, ZstdError

NUM_PAGES = 25600
PAGE_SIZE = 4096


@dataclass
class Document:
    title: str
    url: str
    extract: str


@dataclass
class TokenizedDocument(Document):
    tokens: List[str]


T = TypeVar('T')


class TinyIndexBase(Generic[T]):
    def __init__(self, item_factory: Callable[..., T], num_pages: int, page_size: int):
        self.item_factory = item_factory
        self.num_pages = num_pages
        self.page_size = page_size
        self.decompressor = ZstdDecompressor()
        self.mmap = None

    def retrieve(self, key: str) -> List[T]:
        index = self._get_key_page_index(key)
        page = self.get_page(index)
        if page is None:
            return []
        # print("Retrieve", self.index_path, page)
        return self.convert_items(page)

    def _get_key_page_index(self, key):
        # Map the key onto one of the fixed number of pages via its hash
        key_hash = mmh3.hash(key, signed=False)
        return key_hash % self.num_pages

    def get_page(self, i):
        """
        Get the page at index i, decompress and deserialise it using JSON
        """
        page_data = self.mmap[i * self.page_size:(i + 1) * self.page_size]
        zeros = page_data.count(b'\x00\x00\x00\x00') * 4
        try:
            decompressed_data = self.decompressor.decompress(page_data)
        except ZstdError:
            return None
        results = json.loads(decompressed_data.decode('utf8'))
        # print(f"Num results: {len(results)}, num zeros: {zeros}")
        return results

    def convert_items(self, items) -> List[T]:
        converted = [self.item_factory(*item) for item in items]
        # print("Converted", items, converted)
        return converted


class TinyIndex(TinyIndexBase[T]):
    def __init__(self, item_factory: Callable[..., T], index_path, num_pages, page_size):
        super().__init__(item_factory, num_pages, page_size)
        self.index_path = index_path
        self.index_file = open(self.index_path, 'rb')
        self.mmap = mmap(self.index_file.fileno(), 0, prot=PROT_READ)


class TinyIndexer(TinyIndexBase[T]):
    def __init__(self, item_factory: Callable[..., T], index_path: str, num_pages: int, page_size: int):
        super().__init__(item_factory, num_pages, page_size)
        self.index_path = index_path
        self.compressor = ZstdCompressor()
        self.decompressor = ZstdDecompressor()
        self.index_file = None
        self.mmap = None

    def __enter__(self):
        self.create_if_not_exists()
        self.index_file = open(self.index_path, 'r+b')
        self.mmap = mmap(self.index_file.fileno(), 0)
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.mmap.close()
        self.index_file.close()

    def index(self, key: str, value: T):
        assert type(value) == self.item_factory, f"Can only index the specified type" \
                                                 f" ({self.item_factory.__name__})"
        page_index = self._get_key_page_index(key)
        current_page = self.get_page(page_index)
        if current_page is None:
            current_page = []
        value_tuple = astuple(value)
        current_page.append(value_tuple)
        try:
            self._write_page(current_page, page_index)
        except ValueError:
            # Page is full: drop the item rather than failing the whole run
            pass

    def _write_page(self, data, i):
        """
        Serialise the data using JSON, compress it and store it at index i.
        If the data is too big, it will raise a ValueError and not store anything
        """
        serialised_data = json.dumps(data)
        compressed_data = self.compressor.compress(serialised_data.encode('utf8'))
        page_length = len(compressed_data)
        if page_length > self.page_size:
            raise ValueError(f"Data is too big ({page_length}) for page size ({self.page_size})")
        padding = b'\x00' * (self.page_size - page_length)
        self.mmap[i * self.page_size:(i + 1) * self.page_size] = compressed_data + padding

    def create_if_not_exists(self):
        if not os.path.isfile(self.index_path):
            file_length = self.num_pages * self.page_size
            with open(self.index_path, 'wb') as index_file:
                index_file.write(b'\x00' * file_length)
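A short usage sketch of the new module (the index path and document below are made up for illustration; the API is as defined above):

from tinysearchengine.indexer import TinyIndex, TinyIndexer, Document, NUM_PAGES, PAGE_SIZE

index_path = 'data/test-index.tinysearch'  # hypothetical path

# TinyIndexer is a context manager: it creates the zero-filled page file if
# needed, then appends each document to the page its key hashes to.
with TinyIndexer(Document, index_path, NUM_PAGES, PAGE_SIZE) as indexer:
    indexer.index('search', Document('Tiny Search Engine',
                                     'https://example.com',
                                     'A very small search engine.'))

# TinyIndex memory-maps the same file read-only; retrieve() hashes the term
# to a page, decompresses it and rebuilds the Document tuples.
tiny_index = TinyIndex(Document, index_path, NUM_PAGES, PAGE_SIZE)
print(tiny_index.retrieve('search'))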

@@ -7,7 +7,8 @@ from urllib.parse import quote
 
 from spacy.lang.en import English
 
-from index import TinyIndexer, index_titles_urls_and_extracts, PAGE_SIZE, NUM_PAGES
+from index import index_titles_urls_and_extracts
+from tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE
 from paths import WIKI_TITLES_PATH, INDEX_PATH
 
 TEXT_TAGS = ['mediawiki', 'page', 'revision', 'text']