Abstract index to allow storing anything

Daoud Clarke 2021-06-05 22:22:31 +01:00
parent fb5b6ffd45
commit c81fc83900
4 changed files with 107 additions and 110 deletions
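In effect, this commit turns the indexer into a typed key/value store: TinyIndexer is told which dataclass it holds, index() takes a string key plus one value of that type, and TinyIndex.retrieve() hands the values back as instances of that dataclass. A minimal sketch of the new usage, drawing only on imports already present in the diffs below (the key and the document values are made-up examples):

from index import TinyIndex, TinyIndexer, Document, NUM_PAGES, PAGE_SIZE
from paths import INDEX_PATH

# Write side: declare the stored type up front; index() asserts each value matches it.
with TinyIndexer(Document, INDEX_PATH, NUM_PAGES, PAGE_SIZE) as indexer:
    indexer.index('python', Document(title='Python (programming language)',
                                     url='https://en.wikipedia.org/wiki/Python_(programming_language)'))

# Read side: retrieve() rebuilds Document instances from the stored tuples.
tiny_index = TinyIndex(INDEX_PATH, NUM_PAGES, PAGE_SIZE)
for doc in tiny_index.retrieve('python'):
    print(doc.title, doc.url)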

app.py (72 changed lines)

@@ -1,75 +1,7 @@
import sqlite3
from functools import lru_cache
import Levenshtein
from fastapi import FastAPI
from fastapi.staticfiles import StaticFiles
from starlette.responses import FileResponse, RedirectResponse
import create_app
from index import TinyIndex, PAGE_SIZE, NUM_PAGES
from paths import INDEX_PATH
app = FastAPI()
tiny_index = TinyIndex(INDEX_PATH, NUM_PAGES, PAGE_SIZE)
@app.get("/search")
def search(s: str):
if '—' in s:
url = s.split('—')[1].strip()
else:
url = f'https://www.google.com/search?q={s}'
return RedirectResponse(url)
@lru_cache()
def complete_term(term):
con = sqlite3.connect(INDEX_PATH)
query = f"""
SELECT term
FROM terms
WHERE term >= ?
ORDER BY term
LIMIT 1
"""
result = con.execute(query, (term,))
completed = result.fetchone()
# print("Completed", completed)
if len(completed) > 0:
return completed[0]
return None
def order_results(query, results):
return sorted(results, key=lambda result: Levenshtein.distance(query, result[0]))
@app.get("/complete")
def complete(q: str):
terms = [x.lower() for x in q.replace('.', ' ').split()]
# completed = complete_term(terms[-1])
# terms = terms[:-1] + [completed]
pages = []
for term in terms:
page = tiny_index.retrieve(term)
if page is not None:
pages += [(title, url) for title, url in page if term in title.lower()]
ordered_results = order_results(q, pages)
results = [title.replace("\n", "") + '—' +
url.replace("\n", "") for title, url in ordered_results]
if len(results) == 0:
# print("No results")
return []
# print("Results", results)
return [q, results]
@app.get('/')
def index():
return FileResponse('static/index.html')
app.mount('/', StaticFiles(directory="static"), name="static")
app = create_app.create(tiny_index)

create_app.py (new file, 56 lines)

@@ -0,0 +1,56 @@
import sqlite3
from functools import lru_cache
from typing import List
import Levenshtein
from fastapi import FastAPI
from starlette.responses import RedirectResponse, FileResponse
from starlette.staticfiles import StaticFiles
from index import TinyIndex, Document
def create(tiny_index: TinyIndex):
app = FastAPI()
@app.get("/search")
def search(s: str):
if '—' in s:
url = s.split('—')[1].strip()
else:
url = f'https://www.google.com/search?q={s}'
return RedirectResponse(url)
def order_results(query, results: List[Document]):
ordered_results = sorted(results, key=lambda result: Levenshtein.distance(query, result.title))
print("Order results", query, ordered_results, sep='\n')
return ordered_results
@app.get("/complete")
def complete(q: str):
terms = [x.lower() for x in q.replace('.', ' ').split()]
# completed = complete_term(terms[-1])
# terms = terms[:-1] + [completed]
pages = []
for term in terms:
items = tiny_index.retrieve(term)
if items is not None:
pages += [item for item in items if term in item.title.lower()]
ordered_results = order_results(q, pages)
results = [item.title.replace("\n", "") + '—' +
item.url.replace("\n", "") for item in ordered_results]
if len(results) == 0:
# print("No results")
return []
# print("Results", results)
return [q, results]
@app.get('/')
def index():
return FileResponse('static/index.html')
app.mount('/', StaticFiles(directory="static"), name="static")
return app

index.py

@@ -3,11 +3,12 @@ Create a search index
"""
import json
import os
from abc import ABC, abstractmethod
from collections import Counter
from dataclasses import dataclass
from dataclasses import dataclass, fields, asdict, astuple
from itertools import islice
from mmap import mmap, PROT_READ
from typing import List, Iterator
from typing import List, Iterator, TypeVar, Generic, Iterable
from urllib.parse import unquote
import justext
@@ -47,8 +48,8 @@ def clean(content):
@dataclass
class Document:
url: str
title: str
url: str
@dataclass
@@ -57,19 +58,24 @@ class TokenizedDocument(Document):
class TinyIndexBase:
def __init__(self, num_pages, page_size):
def __init__(self, item_type: type, num_pages: int, page_size: int):
self.item_type = item_type
self.num_pages = num_pages
self.page_size = page_size
self.decompressor = ZstdDecompressor()
self.mmap = None
def retrieve(self, token):
index = self._get_token_page_index(token)
return self.get_page(index)
def retrieve(self, key: str):
index = self._get_key_page_index(key)
page = self.get_page(index)
if page is None:
return []
print("REtrieve", self.index_path, page)
return self.convert_items(page)
def _get_token_page_index(self, token):
token_hash = mmh3.hash(token, signed=False)
return token_hash % self.num_pages
def _get_key_page_index(self, key):
key_hash = mmh3.hash(key, signed=False)
return key_hash % self.num_pages
def get_page(self, i):
"""
@@ -82,18 +88,24 @@ class TinyIndexBase:
return None
return json.loads(decompressed_data.decode('utf8'))
def convert_items(self, items):
converted = [self.item_type(*item) for item in items]
# print("Converted", items, converted)
return converted
class TinyIndex(TinyIndexBase):
def __init__(self, index_path, num_pages, page_size):
super().__init__(num_pages, page_size)
super().__init__(Document, num_pages, page_size)
# print("REtrieve path", index_path)
self.index_path = index_path
self.index_file = open(self.index_path, 'rb')
self.mmap = mmap(self.index_file.fileno(), 0, prot=PROT_READ)
class TinyIndexer(TinyIndexBase):
def __init__(self, index_path, num_pages, page_size):
super().__init__(num_pages, page_size)
def __init__(self, item_type: type, index_path: str, num_pages: int, page_size: int):
super().__init__(item_type, num_pages, page_size)
self.index_path = index_path
self.compressor = ZstdCompressor()
self.decompressor = ZstdDecompressor()
@@ -110,18 +122,24 @@ class TinyIndexer(TinyIndexBase):
self.mmap.close()
self.index_file.close()
def index(self, documents: List[TokenizedDocument]):
for document in documents:
for token in document.tokens:
self._index_document(document, token)
# def index(self, documents: List[TokenizedDocument]):
# for document in documents:
# for token in document.tokens:
# self._index_document(document, token)
def _index_document(self, document: Document, token: str):
page_index = self._get_token_page_index(token)
def index(self, key: str, value):
print("Index", value)
assert type(value) == self.item_type, f"Can only index the specified type" \
f" ({self.item_type.__name__})"
page_index = self._get_key_page_index(key)
current_page = self.get_page(page_index)
if current_page is None:
current_page = []
current_page.append([document.title, document.url])
value_tuple = astuple(value)
print("Value tuple", value_tuple)
current_page.append(value_tuple)
try:
# print("Page", current_page)
self._write_page(current_page, page_index)
except ValueError:
pass
@@ -145,15 +163,6 @@ class TinyIndexer(TinyIndexBase):
with open(self.index_path, 'wb') as index_file:
index_file.write(b'\x00' * file_length)
def document_indexed(self, url):
raise NotImplementedError()
def get_num_tokens(self):
raise NotImplementedError()
def get_random_terms(self, n):
raise NotImplementedError()
def prepare_url_for_tokenizing(url: str):
if url.startswith(HTTP_START):
@@ -166,7 +175,7 @@ def prepare_url_for_tokenizing(url: str):
return url
def get_pages(nlp, titles_and_urls):
def get_pages(nlp, titles_and_urls) -> Iterable[TokenizedDocument]:
for i, (title_cleaned, url) in enumerate(titles_and_urls):
title_tokens = tokenize(nlp, title_cleaned)
prepared_url = prepare_url_for_tokenizing(unquote(url))
@@ -191,11 +200,10 @@ def index_titles_and_urls(indexer: TinyIndexer, nlp, titles_and_urls, terms_path
terms = Counter()
pages = get_pages(nlp, titles_and_urls)
for chunk in grouper(BATCH_SIZE, pages):
indexer.index(list(chunk))
for page in chunk:
terms.update([t.lower() for t in page.tokens])
for page in pages:
for token in page.tokens:
indexer.index(token, Document(url=page.url, title=page.title))
terms.update([t.lower() for t in page.tokens])
term_df = pd.DataFrame({
'term': terms.keys(),
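Under the hood, the index.py changes above keep the on-disk format simple: index() flattens each value with astuple() and appends the tuple to its page, and convert_items() rebuilds item_type(*item) when a page is read back via json.loads. A rough self-contained illustration of that round trip (plain JSON only; the real pages are additionally zstd-compressed and capped at PAGE_SIZE):

from dataclasses import dataclass, astuple
import json

@dataclass
class Document:
    title: str
    url: str

doc = Document(title='Example page', url='https://example.com')

# Indexing side: dataclass -> tuple -> JSON-encoded page.
page = [astuple(doc)]
encoded = json.dumps(page)

# Retrieval side: JSON -> list of rows -> dataclass instances,
# mirroring convert_items() in TinyIndexBase.
restored = [Document(*item) for item in json.loads(encoded)]
assert restored == [doc]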

performance.py

@@ -3,17 +3,15 @@ Test the performance of the search in terms of compression and speed.
"""
import os
from datetime import datetime
from itertools import islice
import numpy as np
from spacy.lang.en import English
from starlette.testclient import TestClient
from app import app
import create_app
from fsqueue import ZstdJsonSerializer
from index import TinyIndexer, index_titles_and_urls
from index import TinyIndexer, index_titles_and_urls, Document, TinyIndex
from paths import TEST_INDEX_PATH, DATA_DIR, TEST_TERMS_PATH
from wiki import get_wiki_titles_and_urls
NUM_DOCUMENTS = 30000
NUM_PAGES_FOR_STATS = 10
@@ -33,7 +31,9 @@ def get_test_pages():
def query_test():
titles_and_urls = get_test_pages()
print(f"Got {len(titles_and_urls)} titles and URLs")
tiny_index = TinyIndex(TEST_INDEX_PATH, TEST_NUM_PAGES, TEST_PAGE_SIZE)
app = create_app.create(tiny_index)
client = TestClient(app)
start = datetime.now()
@@ -80,7 +80,7 @@ def performance_test():
os.remove(TEST_INDEX_PATH)
except FileNotFoundError:
print("No test index found, creating")
with TinyIndexer(TEST_INDEX_PATH, TEST_NUM_PAGES, TEST_PAGE_SIZE) as indexer:
with TinyIndexer(Document, TEST_INDEX_PATH, TEST_NUM_PAGES, TEST_PAGE_SIZE) as indexer:
titles_and_urls = get_test_pages()
start_time = datetime.now()
@@ -106,6 +106,7 @@ def performance_test():
def print_pages(pages):
for page in pages:
print("Page", page)
for title, url in page:
print(title, url)
print()