
WIP: implement docker image. TODO: copy index and set the correct index path using env var

Daoud Clarke 3 years ago
Commit
9c65bf3c8f

+ 33 - 0
Dockerfile

@@ -0,0 +1,33 @@
+FROM python:3.9-slim-bullseye as base
+
+ENV PYTHONFAULTHANDLER=1 \
+    PYTHONHASHSEED=random \
+    PYTHONUNBUFFERED=1
+
+WORKDIR /app
+
+FROM base as builder
+
+ENV PIP_DEFAULT_TIMEOUT=100 \
+    PIP_DISABLE_PIP_VERSION_CHECK=1 \
+    PIP_NO_CACHE_DIR=1 \
+    POETRY_VERSION=1.1.12
+
+# RUN apk add --no-cache gcc libffi-dev musl-dev postgresql-dev
+RUN pip install "poetry==$POETRY_VERSION"
+RUN python -m venv /venv
+
+COPY pyproject.toml poetry.lock ./
+RUN poetry export -f requirements.txt | /venv/bin/pip install -r /dev/stdin
+
+COPY . .
+RUN poetry build && /venv/bin/pip install dist/*.whl
+
+FROM base as final
+
+#RUN apk add --no-cache libffi libpq
+COPY --from=builder /venv /venv
+#COPY docker-entrypoint.sh wsgi.py ./
+#CMD ["./docker-entrypoint.sh"]
+
+CMD ["/venv/bin/python", "-m", "tinysearchengine.app"]

+ 4 - 0
README.md

@@ -0,0 +1,4 @@
+Tiny Search Engine
+==================
+
+TBD

+ 1 - 1
analyse/inspect_index.py

@@ -1,4 +1,4 @@
-from index import TinyIndex, Document, NUM_PAGES, PAGE_SIZE
+from tinysearchengine.indexer import TinyIndex, NUM_PAGES, PAGE_SIZE, Document
 from paths import INDEX_PATH
 
 

+ 0 - 17
app.py

@@ -1,17 +0,0 @@
-import logging
-
-import uvicorn
-
-import create_app
-
-from index import TinyIndex, PAGE_SIZE, NUM_PAGES, Document
-from paths import INDEX_PATH
-
-tiny_index = TinyIndex(Document, INDEX_PATH, NUM_PAGES, PAGE_SIZE)
-app = create_app.create(tiny_index)
-
-logging.basicConfig()
-
-
-if __name__ == "__main__":
-    uvicorn.run("app:app", host="127.0.0.1", port=8000, log_level="info", reload=True)

+ 2 - 139
index.py

@@ -1,26 +1,16 @@
 """
 Create a search index
 """
-import json
-import os
-from abc import ABC, abstractmethod
 from collections import Counter
-from dataclasses import dataclass, fields, asdict, astuple
 from itertools import islice
-from mmap import mmap, PROT_READ
-from typing import List, Iterator, TypeVar, Generic, Iterable, Callable
+from typing import Iterator, Iterable
 from urllib.parse import unquote
 
-import justext
-import mmh3
 import pandas as pd
-from zstandard import ZstdCompressor, ZstdDecompressor, ZstdError
 
 # NUM_PAGES = 8192
 # PAGE_SIZE = 512
-NUM_PAGES = 25600
-PAGE_SIZE = 4096
-
+from tinysearchengine.indexer import TinyIndexer, Document, TokenizedDocument
 
 NUM_INITIAL_TOKENS = 50
 
@@ -42,133 +32,6 @@ def tokenize(nlp, cleaned_text):
     return lowered
 
 
-def clean(content):
-    text = justext.justext(content, justext.get_stoplist("English"))
-    pars = [par.text for par in text if not par.is_boilerplate]
-    cleaned_text = ' '.join(pars)
-    return cleaned_text
-
-
-@dataclass
-class Document:
-    title: str
-    url: str
-    extract: str
-
-
-@dataclass
-class TokenizedDocument(Document):
-    tokens: List[str]
-
-
-T = TypeVar('T')
-
-
-class TinyIndexBase(Generic[T]):
-    def __init__(self, item_factory: Callable[..., T], num_pages: int, page_size: int):
-        self.item_factory = item_factory
-        self.num_pages = num_pages
-        self.page_size = page_size
-        self.decompressor = ZstdDecompressor()
-        self.mmap = None
-
-    def retrieve(self, key: str) -> List[T]:
-        index = self._get_key_page_index(key)
-        page = self.get_page(index)
-        if page is None:
-            return []
-        # print("REtrieve", self.index_path, page)
-        return self.convert_items(page)
-
-    def _get_key_page_index(self, key):
-        key_hash = mmh3.hash(key, signed=False)
-        return key_hash % self.num_pages
-
-    def get_page(self, i):
-        """
-        Get the page at index i, decompress and deserialise it using JSON
-        """
-        page_data = self.mmap[i * self.page_size:(i + 1) * self.page_size]
-        zeros = page_data.count(b'\x00\x00\x00\x00') * 4
-        try:
-            decompressed_data = self.decompressor.decompress(page_data)
-        except ZstdError:
-            return None
-        results = json.loads(decompressed_data.decode('utf8'))
-        # print(f"Num results: {len(results)}, num zeros: {zeros}")
-        return results
-
-    def convert_items(self, items) -> List[T]:
-        converted = [self.item_factory(*item) for item in items]
-        # print("Converted", items, converted)
-        return converted
-
-
-class TinyIndex(TinyIndexBase[T]):
-    def __init__(self, item_factory: Callable[..., T], index_path, num_pages, page_size):
-        super().__init__(item_factory, num_pages, page_size)
-        # print("REtrieve path", index_path)
-        self.index_path = index_path
-        self.index_file = open(self.index_path, 'rb')
-        self.mmap = mmap(self.index_file.fileno(), 0, prot=PROT_READ)
-
-
-class TinyIndexer(TinyIndexBase[T]):
-    def __init__(self, item_factory: Callable[..., T], index_path: str, num_pages: int, page_size: int):
-        super().__init__(item_factory, num_pages, page_size)
-        self.index_path = index_path
-        self.compressor = ZstdCompressor()
-        self.decompressor = ZstdDecompressor()
-        self.index_file = None
-        self.mmap = None
-
-    def __enter__(self):
-        self.create_if_not_exists()
-        self.index_file = open(self.index_path, 'r+b')
-        self.mmap = mmap(self.index_file.fileno(), 0)
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        self.mmap.close()
-        self.index_file.close()
-
-    def index(self, key: str, value: T):
-        # print("Index", value)
-        assert type(value) == self.item_factory, f"Can only index the specified type" \
-                                              f" ({self.item_factory.__name__})"
-        page_index = self._get_key_page_index(key)
-        current_page = self.get_page(page_index)
-        if current_page is None:
-            current_page = []
-        value_tuple = astuple(value)
-        # print("Value tuple", value_tuple)
-        current_page.append(value_tuple)
-        try:
-            # print("Page", current_page)
-            self._write_page(current_page, page_index)
-        except ValueError:
-            pass
-
-    def _write_page(self, data, i):
-        """
-        Serialise the data using JSON, compress it and store it at index i.
-        If the data is too big, it will raise a ValueError and not store anything
-        """
-        serialised_data = json.dumps(data)
-        compressed_data = self.compressor.compress(serialised_data.encode('utf8'))
-        page_length = len(compressed_data)
-        if page_length > self.page_size:
-            raise ValueError(f"Data is too big ({page_length}) for page size ({self.page_size})")
-        padding = b'\x00' * (self.page_size - page_length)
-        self.mmap[i * self.page_size:(i+1) * self.page_size] = compressed_data + padding
-
-    def create_if_not_exists(self):
-        if not os.path.isfile(self.index_path):
-            file_length = self.num_pages * self.page_size
-            with open(self.index_path, 'wb') as index_file:
-                index_file.write(b'\x00' * file_length)
-
-
 def prepare_url_for_tokenizing(url: str):
     if url.startswith(HTTP_START):
         url = url[len(HTTP_START):]

+ 9 - 1
index_glob.py

@@ -4,7 +4,8 @@ from glob import glob
 import bs4
 from spacy.lang.en import English
 
-from index import TinyIndexer, NUM_PAGES, PAGE_SIZE, clean, tokenize
+from index import tokenize
+from tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE
 from paths import INDEX_PATH, CRAWL_GLOB
 
 
@@ -36,3 +37,10 @@ def run():
 
 if __name__ == '__main__':
     run()
+
+
+def clean(content):
+    text = justext.justext(content, justext.get_stoplist("English"))
+    pars = [par.text for par in text if not par.is_boilerplate]
+    cleaned_text = ' '.join(pars)
+    return cleaned_text

+ 2 - 1
index_queue.py

@@ -4,7 +4,8 @@ Index items in the file-system queue
 from spacy.lang.en import English
 
 from fsqueue import FSQueue, ZstdJsonSerializer
-from index import TinyIndexer, NUM_PAGES, PAGE_SIZE, index_titles_urls_and_extracts
+from index import index_titles_urls_and_extracts
+from tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE
 from paths import DATA_DIR, DOMAINS_TITLES_QUEUE_NAME, INDEX_PATH
 
 

+ 2 - 1
indexcc.py

@@ -8,7 +8,8 @@ from logging import getLogger
 import spacy
 
 from fsqueue import FSQueue, GzipJsonRowSerializer, FSQueueError
-from index import TinyIndexer, index_titles_urls_and_extracts, PAGE_SIZE, NUM_PAGES, Document
+from index import index_titles_urls_and_extracts
+from tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE, Document
 from paths import INDEX_PATH, DATA_DIR, COMMON_CRAWL_TERMS_PATH
 
 

+ 3 - 2
performance.py

@@ -8,9 +8,10 @@ import numpy as np
 from spacy.lang.en import English
 from starlette.testclient import TestClient
 
-import create_app
+from tinysearchengine import create_app
 from fsqueue import ZstdJsonSerializer
-from index import TinyIndexer, index_titles_urls_and_extracts, Document, TinyIndex
+from index import index_titles_urls_and_extracts
+from tinysearchengine.indexer import TinyIndex, TinyIndexer, Document
 from paths import TEST_INDEX_PATH, DATA_DIR, TEST_TERMS_PATH
 
 NUM_DOCUMENTS = 30000

File diff suppressed because it is too large
+ 35 - 828
poetry.lock


+ 14 - 14
pyproject.toml

@@ -6,26 +6,26 @@ authors = ["Daoud Clarke <daoud.clarke@gmail.com>"]
 
 [tool.poetry.dependencies]
 python = "^3.9"
-botocore = "^1.23.20"
-boto3 = "^1.20.20"
-ujson = "^4.3.0"
-warcio = "^1.7.4"
-idna = "^3.3"
-beautifulsoup4 = "^4.10.0"
-lxml = "^4.6.4"
-jusText = "^3.0.0"
+# botocore = "^1.23.20"
+# boto3 = "^1.20.20"
+# ujson = "^4.3.0"
+# warcio = "^1.7.4"
+# idna = "^3.3"
+# beautifulsoup4 = "^4.10.0"
+# lxml = "^4.6.4"
+# jusText = "^3.0.0"
 pandas = "^1.3.4"
-pyspark = "^3.2.0"
-langdetect = "^1.0.9"
+# pyspark = "^3.2.0"
+# langdetect = "^1.0.9"
 zstandard = "^0.16.0"
-spacy = "^3.2.1"
+# spacy = "^3.2.1"
 mmh3 = "^3.0.0"
 fastapi = "^0.70.1"
-Levenshtein = "^0.16.0"
+# Levenshtein = "^0.16.0"
 uvicorn = "^0.16.0"
 
-[tool.poetry.dependencies.en_core_web_sm]
-url= "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl"
+# [tool.poetry.dependencies.en_core_web_sm]
+# url= "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl"
 
 [tool.poetry.dev-dependencies]
 

+ 24 - 0
setup.cfg

@@ -0,0 +1,24 @@
+[metadata]
+name = tiny-search-engine-daoudc
+version = 0.0.1
+author = Daoud Clarke
+author_email = daoud.clarke@gmail.com
+description = Tiny Search Engine
+long_description = file: README.md
+long_description_content_type = text/markdown
+# url = https://github.com/pypa/sampleproject
+# project_urls =
+#     Bug Tracker = https://github.com/pypa/sampleproject/issues
+# classifiers =
+#     Programming Language :: Python :: 3
+#     License :: OSI Approved :: MIT License
+#     Operating System :: OS Independent
+
+[options]
+package_dir =
+    = src
+packages = find:
+python_requires = >=3.9
+
+[options.packages.find]
+where = src

+ 0 - 0
tinysearchengine/__init__.py


+ 17 - 0
tinysearchengine/app.py

@@ -0,0 +1,17 @@
+import logging
+
+import uvicorn
+
+from tinysearchengine import create_app
+
+from tinysearchengine.indexer import TinyIndex, NUM_PAGES, PAGE_SIZE, Document
+from paths import INDEX_PATH
+
+tiny_index = TinyIndex(Document, INDEX_PATH, NUM_PAGES, PAGE_SIZE)
+app = create_app.create(tiny_index)
+
+logging.basicConfig()
+
+
+if __name__ == "__main__":
+    uvicorn.run("tinysearchengine.app:app", host="127.0.0.1", port=8080, log_level="info", reload=True)
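+
One container-related detail: the entry point binds uvicorn to 127.0.0.1, which is reachable only from inside the container started by the Dockerfile's CMD. If the image is meant to serve traffic directly, a common adjustment (a sketch, not something this commit makes) is to bind all interfaces:

```python
# Sketch only: bind 0.0.0.0 so a published container port is reachable from the host.
if __name__ == "__main__":
    uvicorn.run("tinysearchengine.app:app", host="0.0.0.0", port=8080, log_level="info")
```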

+ 3 - 3
create_app.py → tinysearchengine/create_app.py

@@ -6,7 +6,7 @@ from fastapi import FastAPI
 from starlette.responses import FileResponse
 from starlette.staticfiles import StaticFiles
 
-from index import TinyIndex, Document
+from tinysearchengine.indexer import TinyIndex, Document
 
 logger = getLogger(__name__)
 
@@ -107,7 +107,7 @@ def create(tiny_index: TinyIndex):
 
     @app.get('/')
     def index():
-        return FileResponse('static/index.html')
+        return FileResponse('tinysearchengine/static/index.html')
 
-    app.mount('/', StaticFiles(directory="static"), name="static")
+    app.mount('/', StaticFiles(directory="tinysearchengine/static"), name="static")
     return app

+ 131 - 0
tinysearchengine/indexer.py

@@ -0,0 +1,131 @@
+import json
+import os
+from dataclasses import astuple, dataclass
+from mmap import mmap, PROT_READ
+from typing import TypeVar, Generic, Callable, List
+
+import mmh3
+from zstandard import ZstdCompressor, ZstdDecompressor, ZstdError
+
+NUM_PAGES = 25600
+PAGE_SIZE = 4096
+
+
+@dataclass
+class Document:
+    title: str
+    url: str
+    extract: str
+
+
+@dataclass
+class TokenizedDocument(Document):
+    tokens: List[str]
+
+
+T = TypeVar('T')
+
+
+class TinyIndexBase(Generic[T]):
+    def __init__(self, item_factory: Callable[..., T], num_pages: int, page_size: int):
+        self.item_factory = item_factory
+        self.num_pages = num_pages
+        self.page_size = page_size
+        self.decompressor = ZstdDecompressor()
+        self.mmap = None
+
+    def retrieve(self, key: str) -> List[T]:
+        index = self._get_key_page_index(key)
+        page = self.get_page(index)
+        if page is None:
+            return []
+        # print("REtrieve", self.index_path, page)
+        return self.convert_items(page)
+
+    def _get_key_page_index(self, key):
+        key_hash = mmh3.hash(key, signed=False)
+        return key_hash % self.num_pages
+
+    def get_page(self, i):
+        """
+        Get the page at index i, decompress and deserialise it using JSON
+        """
+        page_data = self.mmap[i * self.page_size:(i + 1) * self.page_size]
+        zeros = page_data.count(b'\x00\x00\x00\x00') * 4
+        try:
+            decompressed_data = self.decompressor.decompress(page_data)
+        except ZstdError:
+            return None
+        results = json.loads(decompressed_data.decode('utf8'))
+        # print(f"Num results: {len(results)}, num zeros: {zeros}")
+        return results
+
+    def convert_items(self, items) -> List[T]:
+        converted = [self.item_factory(*item) for item in items]
+        # print("Converted", items, converted)
+        return converted
+
+
+class TinyIndex(TinyIndexBase[T]):
+    def __init__(self, item_factory: Callable[..., T], index_path, num_pages, page_size):
+        super().__init__(item_factory, num_pages, page_size)
+        # print("REtrieve path", index_path)
+        self.index_path = index_path
+        self.index_file = open(self.index_path, 'rb')
+        self.mmap = mmap(self.index_file.fileno(), 0, prot=PROT_READ)
+
+
+class TinyIndexer(TinyIndexBase[T]):
+    def __init__(self, item_factory: Callable[..., T], index_path: str, num_pages: int, page_size: int):
+        super().__init__(item_factory, num_pages, page_size)
+        self.index_path = index_path
+        self.compressor = ZstdCompressor()
+        self.decompressor = ZstdDecompressor()
+        self.index_file = None
+        self.mmap = None
+
+    def __enter__(self):
+        self.create_if_not_exists()
+        self.index_file = open(self.index_path, 'r+b')
+        self.mmap = mmap(self.index_file.fileno(), 0)
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.mmap.close()
+        self.index_file.close()
+
+    def index(self, key: str, value: T):
+        # print("Index", value)
+        assert type(value) == self.item_factory, f"Can only index the specified type" \
+                                              f" ({self.item_factory.__name__})"
+        page_index = self._get_key_page_index(key)
+        current_page = self.get_page(page_index)
+        if current_page is None:
+            current_page = []
+        value_tuple = astuple(value)
+        # print("Value tuple", value_tuple)
+        current_page.append(value_tuple)
+        try:
+            # print("Page", current_page)
+            self._write_page(current_page, page_index)
+        except ValueError:
+            pass
+
+    def _write_page(self, data, i):
+        """
+        Serialise the data using JSON, compress it and store it at index i.
+        If the data is too big, it will raise a ValueError and not store anything
+        """
+        serialised_data = json.dumps(data)
+        compressed_data = self.compressor.compress(serialised_data.encode('utf8'))
+        page_length = len(compressed_data)
+        if page_length > self.page_size:
+            raise ValueError(f"Data is too big ({page_length}) for page size ({self.page_size})")
+        padding = b'\x00' * (self.page_size - page_length)
+        self.mmap[i * self.page_size:(i+1) * self.page_size] = compressed_data + padding
+
+    def create_if_not_exists(self):
+        if not os.path.isfile(self.index_path):
+            file_length = self.num_pages * self.page_size
+            with open(self.index_path, 'wb') as index_file:
+                index_file.write(b'\x00' * file_length)

+ 0 - 0
static/index.css → tinysearchengine/static/index.css


+ 0 - 0
static/index.html → tinysearchengine/static/index.html


+ 0 - 0
static/index.js → tinysearchengine/static/index.js


+ 0 - 0
static/landing.html → tinysearchengine/static/landing.html


+ 0 - 0
static/plugin.xml → tinysearchengine/static/plugin.xml


+ 0 - 0
static/search.html → tinysearchengine/static/search.html


+ 0 - 0
static/typeahead.css → tinysearchengine/static/typeahead.css


+ 0 - 0
static/typeahead.js → tinysearchengine/static/typeahead.js


+ 2 - 1
wiki.py

@@ -7,7 +7,8 @@ from urllib.parse import quote
 
 from spacy.lang.en import English
 
-from index import TinyIndexer, index_titles_urls_and_extracts, PAGE_SIZE, NUM_PAGES
+from index import index_titles_urls_and_extracts
+from tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE
 from paths import WIKI_TITLES_PATH, INDEX_PATH
 
 TEXT_TAGS = ['mediawiki', 'page', 'revision', 'text']

Some files were not shown because too many files changed in this diff