WIP: implement docker image. TODO: copy index and set the correct index path using env var

Daoud Clarke 2021-12-22 23:21:23 +00:00
parent f754b38f71
commit 9c65bf3c8f
25 changed files with 282 additions and 1241 deletions

Dockerfile (new file, +33)

@@ -0,0 +1,33 @@
FROM python:3.9-slim-bullseye as base

ENV PYTHONFAULTHANDLER=1 \
    PYTHONHASHSEED=random \
    PYTHONUNBUFFERED=1

WORKDIR /app

FROM base as builder

ENV PIP_DEFAULT_TIMEOUT=100 \
    PIP_DISABLE_PIP_VERSION_CHECK=1 \
    PIP_NO_CACHE_DIR=1 \
    POETRY_VERSION=1.1.12

# RUN apk add --no-cache gcc libffi-dev musl-dev postgresql-dev
RUN pip install "poetry==$POETRY_VERSION"
RUN python -m venv /venv

COPY pyproject.toml poetry.lock ./
RUN poetry export -f requirements.txt | /venv/bin/pip install -r /dev/stdin

COPY . .
RUN poetry build && /venv/bin/pip install dist/*.whl

FROM base as final

#RUN apk add --no-cache libffi libpq
COPY --from=builder /venv /venv

#COPY docker-entrypoint.sh wsgi.py ./
#CMD ["./docker-entrypoint.sh"]
CMD ["/venv/bin/python", "-m", "tinysearchengine.app"]
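The TODO in the commit title (copy the index into the image and point the app at it via an environment variable) is not implemented in this Dockerfile. A minimal sketch of the application side, assuming a hypothetical variable name and default path (neither appears in this commit):

import os

# Hypothetical: the variable name and default are illustrative only.
# The Dockerfile counterpart would be a COPY of the index file plus a
# matching ENV line in the final stage.
INDEX_PATH = os.environ.get('TINYSEARCH_INDEX_PATH', '/app/index.tinysearch')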

README.md (new file, +4)

@@ -0,0 +1,4 @@
Tiny Search Engine
==================
TBD

@@ -1,4 +1,4 @@
-from index import TinyIndex, Document, NUM_PAGES, PAGE_SIZE
+from tinysearchengine.indexer import TinyIndex, NUM_PAGES, PAGE_SIZE, Document
 from paths import INDEX_PATH

app.py (deleted, -17)

@@ -1,17 +0,0 @@
-import logging
-
-import uvicorn
-
-import create_app
-from index import TinyIndex, PAGE_SIZE, NUM_PAGES, Document
-from paths import INDEX_PATH
-
-tiny_index = TinyIndex(Document, INDEX_PATH, NUM_PAGES, PAGE_SIZE)
-app = create_app.create(tiny_index)
-
-logging.basicConfig()
-
-if __name__ == "__main__":
-    uvicorn.run("app:app", host="127.0.0.1", port=8000, log_level="info", reload=True)

index.py (141 changed lines)

@@ -1,26 +1,16 @@
 """
 Create a search index
 """
-import json
-import os
-from abc import ABC, abstractmethod
 from collections import Counter
-from dataclasses import dataclass, fields, asdict, astuple
 from itertools import islice
-from mmap import mmap, PROT_READ
-from typing import List, Iterator, TypeVar, Generic, Iterable, Callable
+from typing import Iterator, Iterable
 from urllib.parse import unquote
 
-import justext
-import mmh3
 import pandas as pd
-from zstandard import ZstdCompressor, ZstdDecompressor, ZstdError
 
 # NUM_PAGES = 8192
 # PAGE_SIZE = 512
-NUM_PAGES = 25600
-PAGE_SIZE = 4096
+from tinysearchengine.indexer import TinyIndexer, Document, TokenizedDocument
 
 NUM_INITIAL_TOKENS = 50
@@ -42,133 +32,6 @@ def tokenize(nlp, cleaned_text):
     return lowered
 
 
-def clean(content):
-    text = justext.justext(content, justext.get_stoplist("English"))
-    pars = [par.text for par in text if not par.is_boilerplate]
-    cleaned_text = ' '.join(pars)
-    return cleaned_text
-
-
-@dataclass
-class Document:
-    title: str
-    url: str
-    extract: str
-
-
-@dataclass
-class TokenizedDocument(Document):
-    tokens: List[str]
-
-
-T = TypeVar('T')
-
-
-class TinyIndexBase(Generic[T]):
-    def __init__(self, item_factory: Callable[..., T], num_pages: int, page_size: int):
-        self.item_factory = item_factory
-        self.num_pages = num_pages
-        self.page_size = page_size
-        self.decompressor = ZstdDecompressor()
-        self.mmap = None
-
-    def retrieve(self, key: str) -> List[T]:
-        index = self._get_key_page_index(key)
-        page = self.get_page(index)
-        if page is None:
-            return []
-        # print("Retrieve", self.index_path, page)
-        return self.convert_items(page)
-
-    def _get_key_page_index(self, key):
-        key_hash = mmh3.hash(key, signed=False)
-        return key_hash % self.num_pages
-
-    def get_page(self, i):
-        """
-        Get the page at index i, decompress and deserialise it using JSON
-        """
-        page_data = self.mmap[i * self.page_size:(i + 1) * self.page_size]
-        zeros = page_data.count(b'\x00\x00\x00\x00') * 4
-        try:
-            decompressed_data = self.decompressor.decompress(page_data)
-        except ZstdError:
-            return None
-        results = json.loads(decompressed_data.decode('utf8'))
-        # print(f"Num results: {len(results)}, num zeros: {zeros}")
-        return results
-
-    def convert_items(self, items) -> List[T]:
-        converted = [self.item_factory(*item) for item in items]
-        # print("Converted", items, converted)
-        return converted
-
-
-class TinyIndex(TinyIndexBase[T]):
-    def __init__(self, item_factory: Callable[..., T], index_path, num_pages, page_size):
-        super().__init__(item_factory, num_pages, page_size)
-        self.index_path = index_path
-        self.index_file = open(self.index_path, 'rb')
-        self.mmap = mmap(self.index_file.fileno(), 0, prot=PROT_READ)
-
-
-class TinyIndexer(TinyIndexBase[T]):
-    def __init__(self, item_factory: Callable[..., T], index_path: str, num_pages: int, page_size: int):
-        super().__init__(item_factory, num_pages, page_size)
-        self.index_path = index_path
-        self.compressor = ZstdCompressor()
-        self.decompressor = ZstdDecompressor()
-        self.index_file = None
-        self.mmap = None
-
-    def __enter__(self):
-        self.create_if_not_exists()
-        self.index_file = open(self.index_path, 'r+b')
-        self.mmap = mmap(self.index_file.fileno(), 0)
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        self.mmap.close()
-        self.index_file.close()
-
-    def index(self, key: str, value: T):
-        # print("Index", value)
-        assert type(value) == self.item_factory, f"Can only index the specified type" \
-                                                 f" ({self.item_factory.__name__})"
-        page_index = self._get_key_page_index(key)
-        current_page = self.get_page(page_index)
-        if current_page is None:
-            current_page = []
-        value_tuple = astuple(value)
-        # print("Value tuple", value_tuple)
-        current_page.append(value_tuple)
-        try:
-            # print("Page", current_page)
-            self._write_page(current_page, page_index)
-        except ValueError:
-            pass
-
-    def _write_page(self, data, i):
-        """
-        Serialise the data using JSON, compress it and store it at index i.
-        If the data is too big, it will raise a ValueError and not store anything
-        """
-        serialised_data = json.dumps(data)
-        compressed_data = self.compressor.compress(serialised_data.encode('utf8'))
-        page_length = len(compressed_data)
-        if page_length > self.page_size:
-            raise ValueError(f"Data is too big ({page_length}) for page size ({self.page_size})")
-        padding = b'\x00' * (self.page_size - page_length)
-        self.mmap[i * self.page_size:(i+1) * self.page_size] = compressed_data + padding
-
-    def create_if_not_exists(self):
-        if not os.path.isfile(self.index_path):
-            file_length = self.num_pages * self.page_size
-            with open(self.index_path, 'wb') as index_file:
-                index_file.write(b'\x00' * file_length)
-
-
 def prepare_url_for_tokenizing(url: str):
     if url.startswith(HTTP_START):
         url = url[len(HTTP_START):]
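For reference, the prepare_url_for_tokenizing helper that survives in index.py strips the scheme prefix before tokenizing. A self-contained sketch of the two visible lines, assuming HTTP_START is the 'http://' prefix (its definition sits outside this hunk):

HTTP_START = 'http://'  # assumed value; defined elsewhere in index.py

def strip_http_prefix(url: str) -> str:
    # Mirrors the visible prefix-stripping in prepare_url_for_tokenizing
    if url.startswith(HTTP_START):
        url = url[len(HTTP_START):]
    return url

assert strip_http_prefix('http://example.com/page') == 'example.com/page'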

@@ -4,7 +4,8 @@ from glob import glob
 
 import bs4
 from spacy.lang.en import English
 
-from index import TinyIndexer, NUM_PAGES, PAGE_SIZE, clean, tokenize
+from index import tokenize
+from tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE
 from paths import INDEX_PATH, CRAWL_GLOB
@@ -36,3 +37,10 @@ def run():
 
 if __name__ == '__main__':
     run()
+
+
+def clean(content):
+    text = justext.justext(content, justext.get_stoplist("English"))
+    pars = [par.text for par in text if not par.is_boilerplate]
+    cleaned_text = ' '.join(pars)
+    return cleaned_text

@@ -4,7 +4,8 @@ Index items in the file-system queue
 
 from spacy.lang.en import English
 
 from fsqueue import FSQueue, ZstdJsonSerializer
-from index import TinyIndexer, NUM_PAGES, PAGE_SIZE, index_titles_urls_and_extracts
+from index import index_titles_urls_and_extracts
+from tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE
 from paths import DATA_DIR, DOMAINS_TITLES_QUEUE_NAME, INDEX_PATH

@@ -8,7 +8,8 @@ from logging import getLogger
 
 import spacy
 
 from fsqueue import FSQueue, GzipJsonRowSerializer, FSQueueError
-from index import TinyIndexer, index_titles_urls_and_extracts, PAGE_SIZE, NUM_PAGES, Document
+from index import index_titles_urls_and_extracts
+from tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE, Document
 from paths import INDEX_PATH, DATA_DIR, COMMON_CRAWL_TERMS_PATH

@@ -8,9 +8,10 @@ import numpy as np
 from spacy.lang.en import English
 from starlette.testclient import TestClient
 
-import create_app
+from tinysearchengine import create_app
 from fsqueue import ZstdJsonSerializer
-from index import TinyIndexer, index_titles_urls_and_extracts, Document, TinyIndex
+from index import index_titles_urls_and_extracts
+from tinysearchengine.indexer import TinyIndex, TinyIndexer, Document
 from paths import TEST_INDEX_PATH, DATA_DIR, TEST_TERMS_PATH
 
 NUM_DOCUMENTS = 30000

poetry.lock (generated, 1096 changed lines)

File diff suppressed because it is too large.

pyproject.toml

@@ -6,26 +6,26 @@ authors = ["Daoud Clarke <daoud.clarke@gmail.com>"]
 
 [tool.poetry.dependencies]
 python = "^3.9"
-botocore = "^1.23.20"
+# botocore = "^1.23.20"
-boto3 = "^1.20.20"
+# boto3 = "^1.20.20"
-ujson = "^4.3.0"
+# ujson = "^4.3.0"
-warcio = "^1.7.4"
+# warcio = "^1.7.4"
-idna = "^3.3"
+# idna = "^3.3"
-beautifulsoup4 = "^4.10.0"
+# beautifulsoup4 = "^4.10.0"
-lxml = "^4.6.4"
+# lxml = "^4.6.4"
-jusText = "^3.0.0"
+# jusText = "^3.0.0"
 pandas = "^1.3.4"
-pyspark = "^3.2.0"
+# pyspark = "^3.2.0"
-langdetect = "^1.0.9"
+# langdetect = "^1.0.9"
 zstandard = "^0.16.0"
-spacy = "^3.2.1"
+# spacy = "^3.2.1"
 mmh3 = "^3.0.0"
 fastapi = "^0.70.1"
-Levenshtein = "^0.16.0"
+# Levenshtein = "^0.16.0"
 uvicorn = "^0.16.0"
 
-[tool.poetry.dependencies.en_core_web_sm]
-url= "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl"
+# [tool.poetry.dependencies.en_core_web_sm]
+# url= "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl"
 
 [tool.poetry.dev-dependencies]

setup.cfg (new file, +24)

@@ -0,0 +1,24 @@
[metadata]
name = tiny-search-engine-daoudc
version = 0.0.1
author = Daoud Clarke
author_email = daoud.clarke@gmail.com
description = Tiny Search Engine
long_description = file: README.md
long_description_content_type = text/markdown
# url = https://github.com/pypa/sampleproject
# project_urls =
#     Bug Tracker = https://github.com/pypa/sampleproject/issues
# classifiers =
#     Programming Language :: Python :: 3
#     License :: OSI Approved :: MIT License
#     Operating System :: OS Independent

[options]
package_dir =
    = src
packages = find:
python_requires = >=3.9

[options.packages.find]
where = src

tinysearchengine/app.py (new file, +17)

@@ -0,0 +1,17 @@
import logging

import uvicorn

from tinysearchengine import create_app
from tinysearchengine.indexer import TinyIndex, NUM_PAGES, PAGE_SIZE, Document
from paths import INDEX_PATH

tiny_index = TinyIndex(Document, INDEX_PATH, NUM_PAGES, PAGE_SIZE)
app = create_app.create(tiny_index)

logging.basicConfig()

if __name__ == "__main__":
    uvicorn.run("tinysearchengine.app:app", host="127.0.0.1", port=8080, log_level="info", reload=True)

@@ -6,7 +6,7 @@ from fastapi import FastAPI
 from starlette.responses import FileResponse
 from starlette.staticfiles import StaticFiles
 
-from index import TinyIndex, Document
+from tinysearchengine.indexer import TinyIndex, Document
 
 logger = getLogger(__name__)
@@ -107,7 +107,7 @@ def create(tiny_index: TinyIndex):
     @app.get('/')
     def index():
-        return FileResponse('static/index.html')
+        return FileResponse('tinysearchengine/static/index.html')
 
-    app.mount('/', StaticFiles(directory="static"), name="static")
+    app.mount('/', StaticFiles(directory="tinysearchengine/static"), name="static")
 
     return app

tinysearchengine/indexer.py (new file, +131)

@@ -0,0 +1,131 @@
import json
import os
from dataclasses import astuple, dataclass
from mmap import mmap, PROT_READ
from typing import TypeVar, Generic, Callable, List

import mmh3
# ZstdCompressor and ZstdError are used below and must be imported too
from zstandard import ZstdCompressor, ZstdDecompressor, ZstdError

NUM_PAGES = 25600
PAGE_SIZE = 4096


@dataclass
class Document:
    title: str
    url: str
    extract: str


@dataclass
class TokenizedDocument(Document):
    tokens: List[str]


T = TypeVar('T')


class TinyIndexBase(Generic[T]):
    def __init__(self, item_factory: Callable[..., T], num_pages: int, page_size: int):
        self.item_factory = item_factory
        self.num_pages = num_pages
        self.page_size = page_size
        self.decompressor = ZstdDecompressor()
        self.mmap = None

    def retrieve(self, key: str) -> List[T]:
        index = self._get_key_page_index(key)
        page = self.get_page(index)
        if page is None:
            return []
        # print("Retrieve", self.index_path, page)
        return self.convert_items(page)

    def _get_key_page_index(self, key):
        # Map the key onto one of the fixed number of pages via its hash
        key_hash = mmh3.hash(key, signed=False)
        return key_hash % self.num_pages

    def get_page(self, i):
        """
        Get the page at index i, decompress and deserialise it using JSON
        """
        page_data = self.mmap[i * self.page_size:(i + 1) * self.page_size]
        zeros = page_data.count(b'\x00\x00\x00\x00') * 4
        try:
            decompressed_data = self.decompressor.decompress(page_data)
        except ZstdError:
            return None
        results = json.loads(decompressed_data.decode('utf8'))
        # print(f"Num results: {len(results)}, num zeros: {zeros}")
        return results

    def convert_items(self, items) -> List[T]:
        converted = [self.item_factory(*item) for item in items]
        # print("Converted", items, converted)
        return converted


class TinyIndex(TinyIndexBase[T]):
    def __init__(self, item_factory: Callable[..., T], index_path, num_pages, page_size):
        super().__init__(item_factory, num_pages, page_size)
        self.index_path = index_path
        self.index_file = open(self.index_path, 'rb')
        self.mmap = mmap(self.index_file.fileno(), 0, prot=PROT_READ)


class TinyIndexer(TinyIndexBase[T]):
    def __init__(self, item_factory: Callable[..., T], index_path: str, num_pages: int, page_size: int):
        super().__init__(item_factory, num_pages, page_size)
        self.index_path = index_path
        self.compressor = ZstdCompressor()
        self.decompressor = ZstdDecompressor()
        self.index_file = None
        self.mmap = None

    def __enter__(self):
        self.create_if_not_exists()
        self.index_file = open(self.index_path, 'r+b')
        self.mmap = mmap(self.index_file.fileno(), 0)
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.mmap.close()
        self.index_file.close()

    def index(self, key: str, value: T):
        assert type(value) == self.item_factory, f"Can only index the specified type" \
                                                 f" ({self.item_factory.__name__})"
        page_index = self._get_key_page_index(key)
        current_page = self.get_page(page_index)
        if current_page is None:
            current_page = []
        value_tuple = astuple(value)
        current_page.append(value_tuple)
        try:
            self._write_page(current_page, page_index)
        except ValueError:
            # Page is full: drop the item rather than failing the whole run
            pass

    def _write_page(self, data, i):
        """
        Serialise the data using JSON, compress it and store it at index i.
        If the data is too big, it will raise a ValueError and not store anything
        """
        serialised_data = json.dumps(data)
        compressed_data = self.compressor.compress(serialised_data.encode('utf8'))
        page_length = len(compressed_data)
        if page_length > self.page_size:
            raise ValueError(f"Data is too big ({page_length}) for page size ({self.page_size})")
        padding = b'\x00' * (self.page_size - page_length)
        self.mmap[i * self.page_size:(i + 1) * self.page_size] = compressed_data + padding

    def create_if_not_exists(self):
        if not os.path.isfile(self.index_path):
            file_length = self.num_pages * self.page_size
            with open(self.index_path, 'wb') as index_file:
                index_file.write(b'\x00' * file_length)
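A short usage sketch of the new module (the index path and document below are made up for illustration; the API is as defined above):

from tinysearchengine.indexer import TinyIndex, TinyIndexer, Document, NUM_PAGES, PAGE_SIZE

index_path = 'data/test-index.tinysearch'  # hypothetical path

# TinyIndexer is a context manager: it creates the zero-filled page file if
# needed, then appends each document to the page its key hashes to.
with TinyIndexer(Document, index_path, NUM_PAGES, PAGE_SIZE) as indexer:
    indexer.index('search', Document('Tiny Search Engine',
                                     'https://example.com',
                                     'A very small search engine.'))

# TinyIndex memory-maps the same file read-only; retrieve() hashes the term
# to a page, decompresses it and rebuilds the Document tuples.
tiny_index = TinyIndex(Document, index_path, NUM_PAGES, PAGE_SIZE)
print(tiny_index.retrieve('search'))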

@@ -7,7 +7,8 @@ from urllib.parse import quote
 
 from spacy.lang.en import English
 
-from index import TinyIndexer, index_titles_urls_and_extracts, PAGE_SIZE, NUM_PAGES
+from index import index_titles_urls_and_extracts
+from tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE
 from paths import WIKI_TITLES_PATH, INDEX_PATH
 
 TEXT_TAGS = ['mediawiki', 'page', 'revision', 'text']