@@ -1,26 +1,16 @@
 """
 Create a search index
 """
-import json
-import os
-from abc import ABC, abstractmethod
 from collections import Counter
-from dataclasses import dataclass, fields, asdict, astuple
 from itertools import islice
-from mmap import mmap, PROT_READ
-from typing import List, Iterator, TypeVar, Generic, Iterable, Callable
+from typing import Iterator, Iterable
 from urllib.parse import unquote
 
-import justext
-import mmh3
 import pandas as pd
-from zstandard import ZstdCompressor, ZstdDecompressor, ZstdError
 
 # NUM_PAGES = 8192
 # PAGE_SIZE = 512
-NUM_PAGES = 25600
-PAGE_SIZE = 4096
-
+from tinysearchengine.indexer import TinyIndexer, Document, TokenizedDocument
 
 NUM_INITIAL_TOKENS = 50
 
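Note on the refactor: the paging-index classes removed in the next hunk now live in tinysearchengine.indexer. A minimal usage sketch, assuming the relocated TinyIndexer and Document keep the constructor and index() signatures shown in the removed code below (the index path and document values here are hypothetical):

    from tinysearchengine.indexer import TinyIndexer, Document

    # num_pages / page_size match the NUM_PAGES / PAGE_SIZE constants removed above
    with TinyIndexer(Document, 'index.tinysearch', num_pages=25600, page_size=4096) as indexer:
        # index() appends the document tuple to whichever page the key hashes to
        indexer.index('banana', Document(title='Banana', url='https://example.com/banana', extract='A fruit'))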
@@ -42,133 +32,6 @@ def tokenize(nlp, cleaned_text):
     return lowered
 
 
-def clean(content):
-    text = justext.justext(content, justext.get_stoplist("English"))
-    pars = [par.text for par in text if not par.is_boilerplate]
-    cleaned_text = ' '.join(pars)
-    return cleaned_text
-
-
-@dataclass
-class Document:
-    title: str
-    url: str
-    extract: str
-
-
-@dataclass
-class TokenizedDocument(Document):
-    tokens: List[str]
-
-
-T = TypeVar('T')
-
-
-class TinyIndexBase(Generic[T]):
-    def __init__(self, item_factory: Callable[..., T], num_pages: int, page_size: int):
-        self.item_factory = item_factory
-        self.num_pages = num_pages
-        self.page_size = page_size
-        self.decompressor = ZstdDecompressor()
-        self.mmap = None
-
-    def retrieve(self, key: str) -> List[T]:
-        index = self._get_key_page_index(key)
-        page = self.get_page(index)
-        if page is None:
-            return []
-        # print("REtrieve", self.index_path, page)
-        return self.convert_items(page)
-
-    def _get_key_page_index(self, key):
-        key_hash = mmh3.hash(key, signed=False)
-        return key_hash % self.num_pages
-
-    def get_page(self, i):
-        """
-        Get the page at index i, decompress and deserialise it using JSON
-        """
-        page_data = self.mmap[i * self.page_size:(i + 1) * self.page_size]
-        zeros = page_data.count(b'\x00\x00\x00\x00') * 4
-        try:
-            decompressed_data = self.decompressor.decompress(page_data)
-        except ZstdError:
-            return None
-        results = json.loads(decompressed_data.decode('utf8'))
-        # print(f"Num results: {len(results)}, num zeros: {zeros}")
-        return results
-
-    def convert_items(self, items) -> List[T]:
-        converted = [self.item_factory(*item) for item in items]
-        # print("Converted", items, converted)
-        return converted
-
-
-class TinyIndex(TinyIndexBase[T]):
-    def __init__(self, item_factory: Callable[..., T], index_path, num_pages, page_size):
-        super().__init__(item_factory, num_pages, page_size)
-        # print("REtrieve path", index_path)
-        self.index_path = index_path
-        self.index_file = open(self.index_path, 'rb')
-        self.mmap = mmap(self.index_file.fileno(), 0, prot=PROT_READ)
-
-
-class TinyIndexer(TinyIndexBase[T]):
-    def __init__(self, item_factory: Callable[..., T], index_path: str, num_pages: int, page_size: int):
-        super().__init__(item_factory, num_pages, page_size)
-        self.index_path = index_path
-        self.compressor = ZstdCompressor()
-        self.decompressor = ZstdDecompressor()
-        self.index_file = None
-        self.mmap = None
-
-    def __enter__(self):
-        self.create_if_not_exists()
-        self.index_file = open(self.index_path, 'r+b')
-        self.mmap = mmap(self.index_file.fileno(), 0)
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        self.mmap.close()
-        self.index_file.close()
-
-    def index(self, key: str, value: T):
-        # print("Index", value)
-        assert type(value) == self.item_factory, f"Can only index the specified type" \
-                                                 f" ({self.item_factory.__name__})"
-        page_index = self._get_key_page_index(key)
-        current_page = self.get_page(page_index)
-        if current_page is None:
-            current_page = []
-        value_tuple = astuple(value)
-        # print("Value tuple", value_tuple)
-        current_page.append(value_tuple)
-        try:
-            # print("Page", current_page)
-            self._write_page(current_page, page_index)
-        except ValueError:
-            pass
-
-    def _write_page(self, data, i):
-        """
-        Serialise the data using JSON, compress it and store it at index i.
-        If the data is too big, it will raise a ValueError and not store anything
-        """
-        serialised_data = json.dumps(data)
-        compressed_data = self.compressor.compress(serialised_data.encode('utf8'))
-        page_length = len(compressed_data)
-        if page_length > self.page_size:
-            raise ValueError(f"Data is too big ({page_length}) for page size ({self.page_size})")
-        padding = b'\x00' * (self.page_size - page_length)
-        self.mmap[i * self.page_size:(i+1) * self.page_size] = compressed_data + padding
-
-    def create_if_not_exists(self):
-        if not os.path.isfile(self.index_path):
-            file_length = self.num_pages * self.page_size
-            with open(self.index_path, 'wb') as index_file:
-                index_file.write(b'\x00' * file_length)
-
-
 def prepare_url_for_tokenizing(url: str):
     if url.startswith(HTTP_START):
         url = url[len(HTTP_START):]