WIP: implement docker image. TODO: copy index and set the correct index path using env var
parent f754b38f71
commit 9c65bf3c8f
25 changed files with 282 additions and 1241 deletions
Dockerfile (new file, 33 lines)
@@ -0,0 +1,33 @@
+FROM python:3.9-slim-bullseye as base
+
+ENV PYTHONFAULTHANDLER=1 \
+    PYTHONHASHSEED=random \
+    PYTHONUNBUFFERED=1
+
+WORKDIR /app
+
+FROM base as builder
+
+ENV PIP_DEFAULT_TIMEOUT=100 \
+    PIP_DISABLE_PIP_VERSION_CHECK=1 \
+    PIP_NO_CACHE_DIR=1 \
+    POETRY_VERSION=1.1.12
+
+# RUN apk add --no-cache gcc libffi-dev musl-dev postgresql-dev
+RUN pip install "poetry==$POETRY_VERSION"
+RUN python -m venv /venv
+
+COPY pyproject.toml poetry.lock ./
+RUN poetry export -f requirements.txt | /venv/bin/pip install -r /dev/stdin
+
+COPY . .
+RUN poetry build && /venv/bin/pip install dist/*.whl
+
+FROM base as final
+
+#RUN apk add --no-cache libffi libpq
+COPY --from=builder /venv /venv
+#COPY docker-entrypoint.sh wsgi.py ./
+#CMD ["./docker-entrypoint.sh"]
+
+CMD ["/venv/bin/python", "-m", "tinysearchengine.app"]
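The commit message leaves a TODO: copy the index into the image and point the service at it via an environment variable. A minimal sketch of one way the Python side could resolve that variable; the INDEX_PATH variable name and the default path are assumptions for illustration, not part of this commit. The Dockerfile would then gain a COPY for the index file and a matching ENV line.

    # Sketch only, not part of this commit: resolve the index location from an
    # environment variable, falling back to a hypothetical path inside the image.
    import os

    INDEX_PATH = os.environ.get("INDEX_PATH", "/app/index.tinysearch")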
README.md (new file, 4 lines)
@@ -0,0 +1,4 @@
+Tiny Search Engine
+==================
+
+TBD
(modified file; name not shown in this rendering)
@@ -1,4 +1,4 @@
-from index import TinyIndex, Document, NUM_PAGES, PAGE_SIZE
+from tinysearchengine.indexer import TinyIndex, NUM_PAGES, PAGE_SIZE, Document
 from paths import INDEX_PATH
 
 
app.py (deleted, 17 lines)
@@ -1,17 +0,0 @@
-import logging
-
-import uvicorn
-
-import create_app
-
-from index import TinyIndex, PAGE_SIZE, NUM_PAGES, Document
-from paths import INDEX_PATH
-
-tiny_index = TinyIndex(Document, INDEX_PATH, NUM_PAGES, PAGE_SIZE)
-app = create_app.create(tiny_index)
-
-logging.basicConfig()
-
-
-if __name__ == "__main__":
-    uvicorn.run("app:app", host="127.0.0.1", port=8000, log_level="info", reload=True)
index.py (141 changed lines)
@@ -1,26 +1,16 @@
 """
 Create a search index
 """
-import json
-import os
-from abc import ABC, abstractmethod
 from collections import Counter
-from dataclasses import dataclass, fields, asdict, astuple
 from itertools import islice
-from mmap import mmap, PROT_READ
-from typing import List, Iterator, TypeVar, Generic, Iterable, Callable
+from typing import Iterator, Iterable
 from urllib.parse import unquote
 
-import justext
-import mmh3
 import pandas as pd
-from zstandard import ZstdCompressor, ZstdDecompressor, ZstdError
 
 # NUM_PAGES = 8192
 # PAGE_SIZE = 512
-NUM_PAGES = 25600
-PAGE_SIZE = 4096
+from tinysearchengine.indexer import TinyIndexer, Document, TokenizedDocument
 
 
 NUM_INITIAL_TOKENS = 50
@@ -42,133 +32,6 @@ def tokenize(nlp, cleaned_text):
     return lowered
 
 
-def clean(content):
-    text = justext.justext(content, justext.get_stoplist("English"))
-    pars = [par.text for par in text if not par.is_boilerplate]
-    cleaned_text = ' '.join(pars)
-    return cleaned_text
-
-
-@dataclass
-class Document:
-    title: str
-    url: str
-    extract: str
-
-
-@dataclass
-class TokenizedDocument(Document):
-    tokens: List[str]
-
-
-T = TypeVar('T')
-
-
-class TinyIndexBase(Generic[T]):
-    def __init__(self, item_factory: Callable[..., T], num_pages: int, page_size: int):
-        self.item_factory = item_factory
-        self.num_pages = num_pages
-        self.page_size = page_size
-        self.decompressor = ZstdDecompressor()
-        self.mmap = None
-
-    def retrieve(self, key: str) -> List[T]:
-        index = self._get_key_page_index(key)
-        page = self.get_page(index)
-        if page is None:
-            return []
-        # print("REtrieve", self.index_path, page)
-        return self.convert_items(page)
-
-    def _get_key_page_index(self, key):
-        key_hash = mmh3.hash(key, signed=False)
-        return key_hash % self.num_pages
-
-    def get_page(self, i):
-        """
-        Get the page at index i, decompress and deserialise it using JSON
-        """
-        page_data = self.mmap[i * self.page_size:(i + 1) * self.page_size]
-        zeros = page_data.count(b'\x00\x00\x00\x00') * 4
-        try:
-            decompressed_data = self.decompressor.decompress(page_data)
-        except ZstdError:
-            return None
-        results = json.loads(decompressed_data.decode('utf8'))
-        # print(f"Num results: {len(results)}, num zeros: {zeros}")
-        return results
-
-    def convert_items(self, items) -> List[T]:
-        converted = [self.item_factory(*item) for item in items]
-        # print("Converted", items, converted)
-        return converted
-
-
-class TinyIndex(TinyIndexBase[T]):
-    def __init__(self, item_factory: Callable[..., T], index_path, num_pages, page_size):
-        super().__init__(item_factory, num_pages, page_size)
-        # print("REtrieve path", index_path)
-        self.index_path = index_path
-        self.index_file = open(self.index_path, 'rb')
-        self.mmap = mmap(self.index_file.fileno(), 0, prot=PROT_READ)
-
-
-class TinyIndexer(TinyIndexBase[T]):
-    def __init__(self, item_factory: Callable[..., T], index_path: str, num_pages: int, page_size: int):
-        super().__init__(item_factory, num_pages, page_size)
-        self.index_path = index_path
-        self.compressor = ZstdCompressor()
-        self.decompressor = ZstdDecompressor()
-        self.index_file = None
-        self.mmap = None
-
-    def __enter__(self):
-        self.create_if_not_exists()
-        self.index_file = open(self.index_path, 'r+b')
-        self.mmap = mmap(self.index_file.fileno(), 0)
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        self.mmap.close()
-        self.index_file.close()
-
-    def index(self, key: str, value: T):
-        # print("Index", value)
-        assert type(value) == self.item_factory, f"Can only index the specified type" \
-                                                 f" ({self.item_factory.__name__})"
-        page_index = self._get_key_page_index(key)
-        current_page = self.get_page(page_index)
-        if current_page is None:
-            current_page = []
-        value_tuple = astuple(value)
-        # print("Value tuple", value_tuple)
-        current_page.append(value_tuple)
-        try:
-            # print("Page", current_page)
-            self._write_page(current_page, page_index)
-        except ValueError:
-            pass
-
-    def _write_page(self, data, i):
-        """
-        Serialise the data using JSON, compress it and store it at index i.
-        If the data is too big, it will raise a ValueError and not store anything
-        """
-        serialised_data = json.dumps(data)
-        compressed_data = self.compressor.compress(serialised_data.encode('utf8'))
-        page_length = len(compressed_data)
-        if page_length > self.page_size:
-            raise ValueError(f"Data is too big ({page_length}) for page size ({self.page_size})")
-        padding = b'\x00' * (self.page_size - page_length)
-        self.mmap[i * self.page_size:(i+1) * self.page_size] = compressed_data + padding
-
-    def create_if_not_exists(self):
-        if not os.path.isfile(self.index_path):
-            file_length = self.num_pages * self.page_size
-            with open(self.index_path, 'wb') as index_file:
-                index_file.write(b'\x00' * file_length)
-
-
 def prepare_url_for_tokenizing(url: str):
     if url.startswith(HTTP_START):
         url = url[len(HTTP_START):]
(modified file; name not shown in this rendering)
@@ -4,7 +4,8 @@ from glob import glob
 import bs4
 from spacy.lang.en import English
 
-from index import TinyIndexer, NUM_PAGES, PAGE_SIZE, clean, tokenize
+from index import tokenize
+from tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE
 from paths import INDEX_PATH, CRAWL_GLOB
 
 
@@ -36,3 +37,10 @@ def run():
 
 if __name__ == '__main__':
     run()
+
+
+def clean(content):
+    text = justext.justext(content, justext.get_stoplist("English"))
+    pars = [par.text for par in text if not par.is_boilerplate]
+    cleaned_text = ' '.join(pars)
+    return cleaned_text
(modified file; name not shown in this rendering)
@@ -4,7 +4,8 @@ Index items in the file-system queue
 from spacy.lang.en import English
 
 from fsqueue import FSQueue, ZstdJsonSerializer
-from index import TinyIndexer, NUM_PAGES, PAGE_SIZE, index_titles_urls_and_extracts
+from index import index_titles_urls_and_extracts
+from tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE
 from paths import DATA_DIR, DOMAINS_TITLES_QUEUE_NAME, INDEX_PATH
 
 
(modified file; name not shown in this rendering)
@@ -8,7 +8,8 @@ from logging import getLogger
 import spacy
 
 from fsqueue import FSQueue, GzipJsonRowSerializer, FSQueueError
-from index import TinyIndexer, index_titles_urls_and_extracts, PAGE_SIZE, NUM_PAGES, Document
+from index import index_titles_urls_and_extracts
+from tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE, Document
 from paths import INDEX_PATH, DATA_DIR, COMMON_CRAWL_TERMS_PATH
 
 
(modified file; name not shown in this rendering)
@@ -8,9 +8,10 @@ import numpy as np
 from spacy.lang.en import English
 from starlette.testclient import TestClient
 
-import create_app
+from tinysearchengine import create_app
 from fsqueue import ZstdJsonSerializer
-from index import TinyIndexer, index_titles_urls_and_extracts, Document, TinyIndex
+from index import index_titles_urls_and_extracts
+from tinysearchengine.indexer import TinyIndex, TinyIndexer, Document
 from paths import TEST_INDEX_PATH, DATA_DIR, TEST_TERMS_PATH
 
 NUM_DOCUMENTS = 30000
poetry.lock (generated, 1096 changed lines)
File diff suppressed because it is too large.
pyproject.toml
@@ -6,26 +6,26 @@ authors = ["Daoud Clarke <daoud.clarke@gmail.com>"]
 
 [tool.poetry.dependencies]
 python = "^3.9"
-botocore = "^1.23.20"
-boto3 = "^1.20.20"
-ujson = "^4.3.0"
-warcio = "^1.7.4"
-idna = "^3.3"
-beautifulsoup4 = "^4.10.0"
-lxml = "^4.6.4"
-jusText = "^3.0.0"
+# botocore = "^1.23.20"
+# boto3 = "^1.20.20"
+# ujson = "^4.3.0"
+# warcio = "^1.7.4"
+# idna = "^3.3"
+# beautifulsoup4 = "^4.10.0"
+# lxml = "^4.6.4"
+# jusText = "^3.0.0"
 pandas = "^1.3.4"
-pyspark = "^3.2.0"
-langdetect = "^1.0.9"
+# pyspark = "^3.2.0"
+# langdetect = "^1.0.9"
 zstandard = "^0.16.0"
-spacy = "^3.2.1"
+# spacy = "^3.2.1"
 mmh3 = "^3.0.0"
 fastapi = "^0.70.1"
-Levenshtein = "^0.16.0"
+# Levenshtein = "^0.16.0"
 uvicorn = "^0.16.0"
 
-[tool.poetry.dependencies.en_core_web_sm]
-url= "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl"
+# [tool.poetry.dependencies.en_core_web_sm]
+# url= "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl"
 
 [tool.poetry.dev-dependencies]
 
setup.cfg (new file, 24 lines)
@@ -0,0 +1,24 @@
+[metadata]
+name = tiny-search-engine-daoudc
+version = 0.0.1
+author = Daoud Clarke
+author_email = daoud.clarke@gmail.com
+description = Tiny Search Engine
+long_description = file: README.md
+long_description_content_type = text/markdown
+# url = https://github.com/pypa/sampleproject
+# project_urls =
+#     Bug Tracker = https://github.com/pypa/sampleproject/issues
+# classifiers =
+#     Programming Language :: Python :: 3
+#     License :: OSI Approved :: MIT License
+#     Operating System :: OS Independent
+
+[options]
+package_dir =
+    = src
+packages = find:
+python_requires = >=3.9
+
+[options.packages.find]
+where = src
tinysearchengine/__init__.py (new empty file)
tinysearchengine/app.py (new file, 17 lines)
@@ -0,0 +1,17 @@
+import logging
+
+import uvicorn
+
+from tinysearchengine import create_app
+
+from tinysearchengine.indexer import TinyIndex, NUM_PAGES, PAGE_SIZE, Document
+from paths import INDEX_PATH
+
+tiny_index = TinyIndex(Document, INDEX_PATH, NUM_PAGES, PAGE_SIZE)
+app = create_app.create(tiny_index)
+
+logging.basicConfig()
+
+
+if __name__ == "__main__":
+    uvicorn.run("tinysearchengine.app:app", host="127.0.0.1", port=8080, log_level="info", reload=True)
(modified file; name not shown in this rendering)
@@ -6,7 +6,7 @@ from fastapi import FastAPI
 from starlette.responses import FileResponse
 from starlette.staticfiles import StaticFiles
 
-from index import TinyIndex, Document
+from tinysearchengine.indexer import TinyIndex, Document
 
 logger = getLogger(__name__)
 
@@ -107,7 +107,7 @@ def create(tiny_index: TinyIndex):
 
     @app.get('/')
     def index():
-        return FileResponse('static/index.html')
+        return FileResponse('tinysearchengine/static/index.html')
 
-    app.mount('/', StaticFiles(directory="static"), name="static")
+    app.mount('/', StaticFiles(directory="tinysearchengine/static"), name="static")
     return app
tinysearchengine/indexer.py (new file, 131 lines; the zstandard import has been completed here so that ZstdCompressor and ZstdError, both used below, are available)
@@ -0,0 +1,131 @@
+import json
+import os
+from dataclasses import astuple, dataclass
+from mmap import mmap, PROT_READ
+from typing import TypeVar, Generic, Callable, List
+
+import mmh3
+from zstandard import ZstdCompressor, ZstdDecompressor, ZstdError
+
+NUM_PAGES = 25600
+PAGE_SIZE = 4096
+
+
+@dataclass
+class Document:
+    title: str
+    url: str
+    extract: str
+
+
+@dataclass
+class TokenizedDocument(Document):
+    tokens: List[str]
+
+
+T = TypeVar('T')
+
+
+class TinyIndexBase(Generic[T]):
+    def __init__(self, item_factory: Callable[..., T], num_pages: int, page_size: int):
+        self.item_factory = item_factory
+        self.num_pages = num_pages
+        self.page_size = page_size
+        self.decompressor = ZstdDecompressor()
+        self.mmap = None
+
+    def retrieve(self, key: str) -> List[T]:
+        index = self._get_key_page_index(key)
+        page = self.get_page(index)
+        if page is None:
+            return []
+        # print("REtrieve", self.index_path, page)
+        return self.convert_items(page)
+
+    def _get_key_page_index(self, key):
+        key_hash = mmh3.hash(key, signed=False)
+        return key_hash % self.num_pages
+
+    def get_page(self, i):
+        """
+        Get the page at index i, decompress and deserialise it using JSON
+        """
+        page_data = self.mmap[i * self.page_size:(i + 1) * self.page_size]
+        zeros = page_data.count(b'\x00\x00\x00\x00') * 4
+        try:
+            decompressed_data = self.decompressor.decompress(page_data)
+        except ZstdError:
+            return None
+        results = json.loads(decompressed_data.decode('utf8'))
+        # print(f"Num results: {len(results)}, num zeros: {zeros}")
+        return results
+
+    def convert_items(self, items) -> List[T]:
+        converted = [self.item_factory(*item) for item in items]
+        # print("Converted", items, converted)
+        return converted
+
+
+class TinyIndex(TinyIndexBase[T]):
+    def __init__(self, item_factory: Callable[..., T], index_path, num_pages, page_size):
+        super().__init__(item_factory, num_pages, page_size)
+        # print("REtrieve path", index_path)
+        self.index_path = index_path
+        self.index_file = open(self.index_path, 'rb')
+        self.mmap = mmap(self.index_file.fileno(), 0, prot=PROT_READ)
+
+
+class TinyIndexer(TinyIndexBase[T]):
+    def __init__(self, item_factory: Callable[..., T], index_path: str, num_pages: int, page_size: int):
+        super().__init__(item_factory, num_pages, page_size)
+        self.index_path = index_path
+        self.compressor = ZstdCompressor()
+        self.decompressor = ZstdDecompressor()
+        self.index_file = None
+        self.mmap = None
+
+    def __enter__(self):
+        self.create_if_not_exists()
+        self.index_file = open(self.index_path, 'r+b')
+        self.mmap = mmap(self.index_file.fileno(), 0)
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.mmap.close()
+        self.index_file.close()
+
+    def index(self, key: str, value: T):
+        # print("Index", value)
+        assert type(value) == self.item_factory, f"Can only index the specified type" \
+                                                 f" ({self.item_factory.__name__})"
+        page_index = self._get_key_page_index(key)
+        current_page = self.get_page(page_index)
+        if current_page is None:
+            current_page = []
+        value_tuple = astuple(value)
+        # print("Value tuple", value_tuple)
+        current_page.append(value_tuple)
+        try:
+            # print("Page", current_page)
+            self._write_page(current_page, page_index)
+        except ValueError:
+            pass
+
+    def _write_page(self, data, i):
+        """
+        Serialise the data using JSON, compress it and store it at index i.
+        If the data is too big, it will raise a ValueError and not store anything
+        """
+        serialised_data = json.dumps(data)
+        compressed_data = self.compressor.compress(serialised_data.encode('utf8'))
+        page_length = len(compressed_data)
+        if page_length > self.page_size:
+            raise ValueError(f"Data is too big ({page_length}) for page size ({self.page_size})")
+        padding = b'\x00' * (self.page_size - page_length)
+        self.mmap[i * self.page_size:(i+1) * self.page_size] = compressed_data + padding
+
+    def create_if_not_exists(self):
+        if not os.path.isfile(self.index_path):
+            file_length = self.num_pages * self.page_size
+            with open(self.index_path, 'wb') as index_file:
+                index_file.write(b'\x00' * file_length)
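For reference, a short usage sketch of the API this new module exposes: TinyIndexer writes pages inside a context manager and TinyIndex reads them back. The index file name below is a placeholder for illustration, not a path used by this commit.

    from tinysearchengine.indexer import TinyIndexer, TinyIndex, Document, NUM_PAGES, PAGE_SIZE

    # Write: keys are hashed with mmh3 onto pages; each page is stored as
    # zstd-compressed JSON inside a fixed-size, pre-allocated file.
    with TinyIndexer(Document, 'index.tinysearch', NUM_PAGES, PAGE_SIZE) as indexer:
        indexer.index('python', Document('Python', 'https://python.org', 'A programming language'))

    # Read: retrieve returns every Document stored on the key's page.
    tiny_index = TinyIndex(Document, 'index.tinysearch', NUM_PAGES, PAGE_SIZE)
    print(tiny_index.retrieve('python'))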
wiki.py (3 changed lines)
@@ -7,7 +7,8 @@ from urllib.parse import quote
 
 from spacy.lang.en import English
 
-from index import TinyIndexer, index_titles_urls_and_extracts, PAGE_SIZE, NUM_PAGES
+from index import index_titles_urls_and_extracts
+from tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE
 from paths import WIKI_TITLES_PATH, INDEX_PATH
 
 TEXT_TAGS = ['mediawiki', 'page', 'revision', 'text']