WIP: implement docker image. TODO: copy index and set the correct index path using env var
parent f754b38f71
commit 9c65bf3c8f
25 changed files with 282 additions and 1241 deletions
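
Note on the TODO in the commit message: copying the index into the image and pointing the app at it via an environment variable is not part of this diff. A minimal sketch of what that might look like is below; the variable name, default path, index file name and the idea of putting this in paths.py are assumptions for illustration, not code from this commit.

# Hypothetical sketch only -- not part of this commit.
# paths.py could resolve the index location from an environment variable,
# with the Dockerfile adding something along the lines of:
#   COPY index.tinysearch /data/index.tinysearch
#   ENV INDEX_PATH=/data/index.tinysearch
# Variable name, default and file name here are assumed.
import os

INDEX_PATH = os.environ.get("INDEX_PATH", "data/index.tinysearch")
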
Dockerfile (new file, 33 lines)

@@ -0,0 +1,33 @@
FROM python:3.9-slim-bullseye as base

ENV PYTHONFAULTHANDLER=1 \
    PYTHONHASHSEED=random \
    PYTHONUNBUFFERED=1

WORKDIR /app

FROM base as builder

ENV PIP_DEFAULT_TIMEOUT=100 \
    PIP_DISABLE_PIP_VERSION_CHECK=1 \
    PIP_NO_CACHE_DIR=1 \
    POETRY_VERSION=1.1.12

# RUN apk add --no-cache gcc libffi-dev musl-dev postgresql-dev
RUN pip install "poetry==$POETRY_VERSION"
RUN python -m venv /venv

COPY pyproject.toml poetry.lock ./
RUN poetry export -f requirements.txt | /venv/bin/pip install -r /dev/stdin

COPY . .
RUN poetry build && /venv/bin/pip install dist/*.whl

FROM base as final

#RUN apk add --no-cache libffi libpq
COPY --from=builder /venv /venv
#COPY docker-entrypoint.sh wsgi.py ./
#CMD ["./docker-entrypoint.sh"]

CMD ["/venv/bin/python", "-m", "tinysearchengine.app"]

README.md (new file, 4 lines)

@@ -0,0 +1,4 @@
Tiny Search Engine
==================

TBD

@@ -1,4 +1,4 @@
from index import TinyIndex, Document, NUM_PAGES, PAGE_SIZE
from tinysearchengine.indexer import TinyIndex, NUM_PAGES, PAGE_SIZE, Document
from paths import INDEX_PATH

app.py (deleted, 17 lines)

@@ -1,17 +0,0 @@
import logging

import uvicorn

import create_app

from index import TinyIndex, PAGE_SIZE, NUM_PAGES, Document
from paths import INDEX_PATH

tiny_index = TinyIndex(Document, INDEX_PATH, NUM_PAGES, PAGE_SIZE)
app = create_app.create(tiny_index)

logging.basicConfig()


if __name__ == "__main__":
    uvicorn.run("app:app", host="127.0.0.1", port=8000, log_level="info", reload=True)

index.py (141 lines changed)

@@ -1,26 +1,16 @@
"""
Create a search index
"""
import json
import os
from abc import ABC, abstractmethod
from collections import Counter
from dataclasses import dataclass, fields, asdict, astuple
from itertools import islice
from mmap import mmap, PROT_READ
from typing import List, Iterator, TypeVar, Generic, Iterable, Callable
from typing import Iterator, Iterable
from urllib.parse import unquote

import justext
import mmh3
import pandas as pd
from zstandard import ZstdCompressor, ZstdDecompressor, ZstdError

# NUM_PAGES = 8192
# PAGE_SIZE = 512
NUM_PAGES = 25600
PAGE_SIZE = 4096

from tinysearchengine.indexer import TinyIndexer, Document, TokenizedDocument

NUM_INITIAL_TOKENS = 50

@@ -42,133 +32,6 @@ def tokenize(nlp, cleaned_text):
    return lowered


def clean(content):
    text = justext.justext(content, justext.get_stoplist("English"))
    pars = [par.text for par in text if not par.is_boilerplate]
    cleaned_text = ' '.join(pars)
    return cleaned_text


@dataclass
class Document:
    title: str
    url: str
    extract: str


@dataclass
class TokenizedDocument(Document):
    tokens: List[str]


T = TypeVar('T')


class TinyIndexBase(Generic[T]):
    def __init__(self, item_factory: Callable[..., T], num_pages: int, page_size: int):
        self.item_factory = item_factory
        self.num_pages = num_pages
        self.page_size = page_size
        self.decompressor = ZstdDecompressor()
        self.mmap = None

    def retrieve(self, key: str) -> List[T]:
        index = self._get_key_page_index(key)
        page = self.get_page(index)
        if page is None:
            return []
        # print("REtrieve", self.index_path, page)
        return self.convert_items(page)

    def _get_key_page_index(self, key):
        key_hash = mmh3.hash(key, signed=False)
        return key_hash % self.num_pages

    def get_page(self, i):
        """
        Get the page at index i, decompress and deserialise it using JSON
        """
        page_data = self.mmap[i * self.page_size:(i + 1) * self.page_size]
        zeros = page_data.count(b'\x00\x00\x00\x00') * 4
        try:
            decompressed_data = self.decompressor.decompress(page_data)
        except ZstdError:
            return None
        results = json.loads(decompressed_data.decode('utf8'))
        # print(f"Num results: {len(results)}, num zeros: {zeros}")
        return results

    def convert_items(self, items) -> List[T]:
        converted = [self.item_factory(*item) for item in items]
        # print("Converted", items, converted)
        return converted


class TinyIndex(TinyIndexBase[T]):
    def __init__(self, item_factory: Callable[..., T], index_path, num_pages, page_size):
        super().__init__(item_factory, num_pages, page_size)
        # print("REtrieve path", index_path)
        self.index_path = index_path
        self.index_file = open(self.index_path, 'rb')
        self.mmap = mmap(self.index_file.fileno(), 0, prot=PROT_READ)


class TinyIndexer(TinyIndexBase[T]):
    def __init__(self, item_factory: Callable[..., T], index_path: str, num_pages: int, page_size: int):
        super().__init__(item_factory, num_pages, page_size)
        self.index_path = index_path
        self.compressor = ZstdCompressor()
        self.decompressor = ZstdDecompressor()
        self.index_file = None
        self.mmap = None

    def __enter__(self):
        self.create_if_not_exists()
        self.index_file = open(self.index_path, 'r+b')
        self.mmap = mmap(self.index_file.fileno(), 0)
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.mmap.close()
        self.index_file.close()

    def index(self, key: str, value: T):
        # print("Index", value)
        assert type(value) == self.item_factory, f"Can only index the specified type" \
                                                 f" ({self.item_factory.__name__})"
        page_index = self._get_key_page_index(key)
        current_page = self.get_page(page_index)
        if current_page is None:
            current_page = []
        value_tuple = astuple(value)
        # print("Value tuple", value_tuple)
        current_page.append(value_tuple)
        try:
            # print("Page", current_page)
            self._write_page(current_page, page_index)
        except ValueError:
            pass

    def _write_page(self, data, i):
        """
        Serialise the data using JSON, compress it and store it at index i.
        If the data is too big, it will raise a ValueError and not store anything
        """
        serialised_data = json.dumps(data)
        compressed_data = self.compressor.compress(serialised_data.encode('utf8'))
        page_length = len(compressed_data)
        if page_length > self.page_size:
            raise ValueError(f"Data is too big ({page_length}) for page size ({self.page_size})")
        padding = b'\x00' * (self.page_size - page_length)
        self.mmap[i * self.page_size:(i+1) * self.page_size] = compressed_data + padding

    def create_if_not_exists(self):
        if not os.path.isfile(self.index_path):
            file_length = self.num_pages * self.page_size
            with open(self.index_path, 'wb') as index_file:
                index_file.write(b'\x00' * file_length)


def prepare_url_for_tokenizing(url: str):
    if url.startswith(HTTP_START):
        url = url[len(HTTP_START):]

@@ -4,7 +4,8 @@ from glob import glob
import bs4
from spacy.lang.en import English

from index import TinyIndexer, NUM_PAGES, PAGE_SIZE, clean, tokenize
from index import tokenize
from tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE
from paths import INDEX_PATH, CRAWL_GLOB

@@ -36,3 +37,10 @@ def run():

if __name__ == '__main__':
    run()


def clean(content):
    text = justext.justext(content, justext.get_stoplist("English"))
    pars = [par.text for par in text if not par.is_boilerplate]
    cleaned_text = ' '.join(pars)
    return cleaned_text

@@ -4,7 +4,8 @@ Index items in the file-system queue
from spacy.lang.en import English

from fsqueue import FSQueue, ZstdJsonSerializer
from index import TinyIndexer, NUM_PAGES, PAGE_SIZE, index_titles_urls_and_extracts
from index import index_titles_urls_and_extracts
from tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE
from paths import DATA_DIR, DOMAINS_TITLES_QUEUE_NAME, INDEX_PATH

@@ -8,7 +8,8 @@ from logging import getLogger
import spacy

from fsqueue import FSQueue, GzipJsonRowSerializer, FSQueueError
from index import TinyIndexer, index_titles_urls_and_extracts, PAGE_SIZE, NUM_PAGES, Document
from index import index_titles_urls_and_extracts
from tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE, Document
from paths import INDEX_PATH, DATA_DIR, COMMON_CRAWL_TERMS_PATH

@@ -8,9 +8,10 @@ import numpy as np
from spacy.lang.en import English
from starlette.testclient import TestClient

import create_app
from tinysearchengine import create_app
from fsqueue import ZstdJsonSerializer
from index import TinyIndexer, index_titles_urls_and_extracts, Document, TinyIndex
from index import index_titles_urls_and_extracts
from tinysearchengine.indexer import TinyIndex, TinyIndexer, Document
from paths import TEST_INDEX_PATH, DATA_DIR, TEST_TERMS_PATH

NUM_DOCUMENTS = 30000

poetry.lock (generated, 1096 lines changed)
File diff suppressed because it is too large

@@ -6,26 +6,26 @@ authors = ["Daoud Clarke <daoud.clarke@gmail.com>"]

[tool.poetry.dependencies]
python = "^3.9"
botocore = "^1.23.20"
boto3 = "^1.20.20"
ujson = "^4.3.0"
warcio = "^1.7.4"
idna = "^3.3"
beautifulsoup4 = "^4.10.0"
lxml = "^4.6.4"
jusText = "^3.0.0"
# botocore = "^1.23.20"
# boto3 = "^1.20.20"
# ujson = "^4.3.0"
# warcio = "^1.7.4"
# idna = "^3.3"
# beautifulsoup4 = "^4.10.0"
# lxml = "^4.6.4"
# jusText = "^3.0.0"
pandas = "^1.3.4"
pyspark = "^3.2.0"
langdetect = "^1.0.9"
# pyspark = "^3.2.0"
# langdetect = "^1.0.9"
zstandard = "^0.16.0"
spacy = "^3.2.1"
# spacy = "^3.2.1"
mmh3 = "^3.0.0"
fastapi = "^0.70.1"
Levenshtein = "^0.16.0"
# Levenshtein = "^0.16.0"
uvicorn = "^0.16.0"

[tool.poetry.dependencies.en_core_web_sm]
url= "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl"
# [tool.poetry.dependencies.en_core_web_sm]
# url= "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl"

[tool.poetry.dev-dependencies]

setup.cfg (new file, 24 lines)

@@ -0,0 +1,24 @@
[metadata]
name = tiny-search-engine-daoudc
version = 0.0.1
author = Daoud Clarke
author_email = daoud.clarke@gmail.com
description = Tiny Search Engine
long_description = file: README.md
long_description_content_type = text/markdown
# url = https://github.com/pypa/sampleproject
# project_urls =
#     Bug Tracker = https://github.com/pypa/sampleproject/issues
# classifiers =
#     Programming Language :: Python :: 3
#     License :: OSI Approved :: MIT License
#     Operating System :: OS Independent

[options]
package_dir =
    = src
packages = find:
python_requires = >=3.9

[options.packages.find]
where = src

tinysearchengine/__init__.py (new, empty file)

tinysearchengine/app.py (new file, 17 lines)

@@ -0,0 +1,17 @@
import logging

import uvicorn

from tinysearchengine import create_app

from tinysearchengine.indexer import TinyIndex, NUM_PAGES, PAGE_SIZE, Document
from paths import INDEX_PATH

tiny_index = TinyIndex(Document, INDEX_PATH, NUM_PAGES, PAGE_SIZE)
app = create_app.create(tiny_index)

logging.basicConfig()


if __name__ == "__main__":
    uvicorn.run("tinysearchengine.app:app", host="127.0.0.1", port=8080, log_level="info", reload=True)

@@ -6,7 +6,7 @@ from fastapi import FastAPI
from starlette.responses import FileResponse
from starlette.staticfiles import StaticFiles

from index import TinyIndex, Document
from tinysearchengine.indexer import TinyIndex, Document

logger = getLogger(__name__)

@@ -107,7 +107,7 @@ def create(tiny_index: TinyIndex):

    @app.get('/')
    def index():
        return FileResponse('static/index.html')
        return FileResponse('tinysearchengine/static/index.html')

    app.mount('/', StaticFiles(directory="static"), name="static")
    app.mount('/', StaticFiles(directory="tinysearchengine/static"), name="static")
    return app

tinysearchengine/indexer.py (new file, 131 lines)

@@ -0,0 +1,131 @@
import json
import os
from dataclasses import astuple, dataclass
from mmap import mmap, PROT_READ
from typing import TypeVar, Generic, Callable, List

import mmh3
from zstandard import ZstdCompressor, ZstdDecompressor, ZstdError

NUM_PAGES = 25600
PAGE_SIZE = 4096


@dataclass
class Document:
    title: str
    url: str
    extract: str


@dataclass
class TokenizedDocument(Document):
    tokens: List[str]


T = TypeVar('T')


class TinyIndexBase(Generic[T]):
    def __init__(self, item_factory: Callable[..., T], num_pages: int, page_size: int):
        self.item_factory = item_factory
        self.num_pages = num_pages
        self.page_size = page_size
        self.decompressor = ZstdDecompressor()
        self.mmap = None

    def retrieve(self, key: str) -> List[T]:
        index = self._get_key_page_index(key)
        page = self.get_page(index)
        if page is None:
            return []
        # print("REtrieve", self.index_path, page)
        return self.convert_items(page)

    def _get_key_page_index(self, key):
        key_hash = mmh3.hash(key, signed=False)
        return key_hash % self.num_pages

    def get_page(self, i):
        """
        Get the page at index i, decompress and deserialise it using JSON
        """
        page_data = self.mmap[i * self.page_size:(i + 1) * self.page_size]
        zeros = page_data.count(b'\x00\x00\x00\x00') * 4
        try:
            decompressed_data = self.decompressor.decompress(page_data)
        except ZstdError:
            return None
        results = json.loads(decompressed_data.decode('utf8'))
        # print(f"Num results: {len(results)}, num zeros: {zeros}")
        return results

    def convert_items(self, items) -> List[T]:
        converted = [self.item_factory(*item) for item in items]
        # print("Converted", items, converted)
        return converted


class TinyIndex(TinyIndexBase[T]):
    def __init__(self, item_factory: Callable[..., T], index_path, num_pages, page_size):
        super().__init__(item_factory, num_pages, page_size)
        # print("REtrieve path", index_path)
        self.index_path = index_path
        self.index_file = open(self.index_path, 'rb')
        self.mmap = mmap(self.index_file.fileno(), 0, prot=PROT_READ)


class TinyIndexer(TinyIndexBase[T]):
    def __init__(self, item_factory: Callable[..., T], index_path: str, num_pages: int, page_size: int):
        super().__init__(item_factory, num_pages, page_size)
        self.index_path = index_path
        self.compressor = ZstdCompressor()
        self.decompressor = ZstdDecompressor()
        self.index_file = None
        self.mmap = None

    def __enter__(self):
        self.create_if_not_exists()
        self.index_file = open(self.index_path, 'r+b')
        self.mmap = mmap(self.index_file.fileno(), 0)
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.mmap.close()
        self.index_file.close()

    def index(self, key: str, value: T):
        # print("Index", value)
        assert type(value) == self.item_factory, f"Can only index the specified type" \
                                                 f" ({self.item_factory.__name__})"
        page_index = self._get_key_page_index(key)
        current_page = self.get_page(page_index)
        if current_page is None:
            current_page = []
        value_tuple = astuple(value)
        # print("Value tuple", value_tuple)
        current_page.append(value_tuple)
        try:
            # print("Page", current_page)
            self._write_page(current_page, page_index)
        except ValueError:
            pass

    def _write_page(self, data, i):
        """
        Serialise the data using JSON, compress it and store it at index i.
        If the data is too big, it will raise a ValueError and not store anything
        """
        serialised_data = json.dumps(data)
        compressed_data = self.compressor.compress(serialised_data.encode('utf8'))
        page_length = len(compressed_data)
        if page_length > self.page_size:
            raise ValueError(f"Data is too big ({page_length}) for page size ({self.page_size})")
        padding = b'\x00' * (self.page_size - page_length)
        self.mmap[i * self.page_size:(i+1) * self.page_size] = compressed_data + padding

    def create_if_not_exists(self):
        if not os.path.isfile(self.index_path):
            file_length = self.num_pages * self.page_size
            with open(self.index_path, 'wb') as index_file:
                index_file.write(b'\x00' * file_length)

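A minimal usage sketch of the new tinysearchengine.indexer module, assuming a throwaway index path and an example document (neither appears in the diff): TinyIndexer writes each document into the fixed-size, zstd-compressed page selected by hashing the key, and TinyIndex mmaps the same file read-only to retrieve it.

# Illustrative only; the index path and document below are made up.
from tinysearchengine.indexer import TinyIndex, TinyIndexer, Document, NUM_PAGES, PAGE_SIZE

index_path = "/tmp/example-index.tinysearch"  # assumed location

# Write: the context manager creates the empty num_pages * page_size file on
# first use (~100 MB with the module defaults), then appends the document
# tuple to the page chosen by hashing the key.
with TinyIndexer(Document, index_path, NUM_PAGES, PAGE_SIZE) as indexer:
    indexer.index("search", Document("Tiny Search Engine",
                                     "https://example.com/tinysearch",
                                     "A tiny search engine."))

# Read: TinyIndex mmaps the file read-only and converts the stored tuples in
# the key's page back into Document objects.
tiny_index = TinyIndex(Document, index_path, NUM_PAGES, PAGE_SIZE)
print(tiny_index.retrieve("search"))
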
wiki.py (3 lines changed)

@@ -7,7 +7,8 @@ from urllib.parse import quote

from spacy.lang.en import English

from index import TinyIndexer, index_titles_urls_and_extracts, PAGE_SIZE, NUM_PAGES
from index import index_titles_urls_and_extracts
from tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE
from paths import WIKI_TITLES_PATH, INDEX_PATH

TEXT_TAGS = ['mediawiki', 'page', 'revision', 'text']