Abstract index to allow storing anything
parent fb5b6ffd45
commit c81fc83900
4 changed files with 107 additions and 110 deletions
app.py (72 lines changed)

@@ -1,75 +1,7 @@
import sqlite3
from functools import lru_cache

import Levenshtein
from fastapi import FastAPI
from fastapi.staticfiles import StaticFiles
from starlette.responses import FileResponse, RedirectResponse
import create_app

from index import TinyIndex, PAGE_SIZE, NUM_PAGES
from paths import INDEX_PATH

app = FastAPI()
tiny_index = TinyIndex(INDEX_PATH, NUM_PAGES, PAGE_SIZE)


@app.get("/search")
def search(s: str):
    if '—' in s:
        url = s.split('—')[1].strip()
    else:
        url = f'https://www.google.com/search?q={s}'
    return RedirectResponse(url)


@lru_cache()
def complete_term(term):
    con = sqlite3.connect(INDEX_PATH)
    query = f"""
        SELECT term
        FROM terms
        WHERE term >= ?
        ORDER BY term
        LIMIT 1
    """
    result = con.execute(query, (term,))
    completed = result.fetchone()
    # print("Completed", completed)
    if len(completed) > 0:
        return completed[0]
    return None


def order_results(query, results):
    return sorted(results, key=lambda result: Levenshtein.distance(query, result[0]))


@app.get("/complete")
def complete(q: str):
    terms = [x.lower() for x in q.replace('.', ' ').split()]

    # completed = complete_term(terms[-1])
    # terms = terms[:-1] + [completed]

    pages = []
    for term in terms:
        page = tiny_index.retrieve(term)
        if page is not None:
            pages += [(title, url) for title, url in page if term in title.lower()]

    ordered_results = order_results(q, pages)
    results = [title.replace("\n", "") + ' — ' +
               url.replace("\n", "") for title, url in ordered_results]
    if len(results) == 0:
        # print("No results")
        return []
    # print("Results", results)
    return [q, results]


@app.get('/')
def index():
    return FileResponse('static/index.html')


app.mount('/', StaticFiles(directory="static"), name="static")
app = create_app.create(tiny_index)
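With this change app.py keeps only the module-level wiring; the endpoints move into the create_app factory added below. A rough reconstruction of the resulting seven-line module, assuming only the kept imports and the two added lines survive:

    import create_app
    from index import TinyIndex, PAGE_SIZE, NUM_PAGES
    from paths import INDEX_PATH

    tiny_index = TinyIndex(INDEX_PATH, NUM_PAGES, PAGE_SIZE)

    app = create_app.create(tiny_index)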
create_app.py (56 lines, new file)

@@ -0,0 +1,56 @@
import sqlite3
from functools import lru_cache
from typing import List

import Levenshtein
from fastapi import FastAPI
from starlette.responses import RedirectResponse, FileResponse
from starlette.staticfiles import StaticFiles

from index import TinyIndex, Document


def create(tiny_index: TinyIndex):
    app = FastAPI()

    @app.get("/search")
    def search(s: str):
        if '—' in s:
            url = s.split('—')[1].strip()
        else:
            url = f'https://www.google.com/search?q={s}'
        return RedirectResponse(url)

    def order_results(query, results: List[Document]):
        ordered_results = sorted(results, key=lambda result: Levenshtein.distance(query, result.title))
        print("Order results", query, ordered_results, sep='\n')
        return ordered_results

    @app.get("/complete")
    def complete(q: str):
        terms = [x.lower() for x in q.replace('.', ' ').split()]

        # completed = complete_term(terms[-1])
        # terms = terms[:-1] + [completed]

        pages = []
        for term in terms:
            items = tiny_index.retrieve(term)
            if items is not None:
                pages += [item for item in items if term in item.title.lower()]

        ordered_results = order_results(q, pages)
        results = [item.title.replace("\n", "") + ' — ' +
                   item.url.replace("\n", "") for item in ordered_results]
        if len(results) == 0:
            # print("No results")
            return []
        # print("Results", results)
        return [q, results]

    @app.get('/')
    def index():
        return FileResponse('static/index.html')

    app.mount('/', StaticFiles(directory="static"), name="static")
    return app
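Ranking in order_results is plain edit distance between the query and each result title, via the python-Levenshtein package the file imports. A minimal standalone sketch of that step (the Doc dataclass is only a stand-in for index.Document):

    from dataclasses import dataclass

    import Levenshtein


    @dataclass
    class Doc:  # stand-in for index.Document
        title: str
        url: str


    def order_results(query, results):
        # smallest edit distance between query and title sorts first
        return sorted(results, key=lambda r: Levenshtein.distance(query, r.title))


    docs = [Doc('Python (programming language)', 'https://en.wikipedia.org/wiki/Python_(programming_language)'),
            Doc('Monty Python', 'https://en.wikipedia.org/wiki/Monty_Python'),
            Doc('Python', 'https://en.wikipedia.org/wiki/Python')]
    print([d.title for d in order_results('python', docs)])
    # ['Python', 'Monty Python', 'Python (programming language)']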
index.py (78 lines changed)

@@ -3,11 +3,12 @@ Create a search index
"""
import json
import os
from abc import ABC, abstractmethod
from collections import Counter
from dataclasses import dataclass
from dataclasses import dataclass, fields, asdict, astuple
from itertools import islice
from mmap import mmap, PROT_READ
from typing import List, Iterator
from typing import List, Iterator, TypeVar, Generic, Iterable
from urllib.parse import unquote

import justext

@@ -47,8 +48,8 @@ def clean(content):

@dataclass
class Document:
    url: str
    title: str
    url: str


@dataclass

@@ -57,19 +58,24 @@ class TokenizedDocument(Document):

class TinyIndexBase:
    def __init__(self, num_pages, page_size):
    def __init__(self, item_type: type, num_pages: int, page_size: int):
        self.item_type = item_type
        self.num_pages = num_pages
        self.page_size = page_size
        self.decompressor = ZstdDecompressor()
        self.mmap = None

    def retrieve(self, token):
        index = self._get_token_page_index(token)
        return self.get_page(index)
    def retrieve(self, key: str):
        index = self._get_key_page_index(key)
        page = self.get_page(index)
        if page is None:
            return []
        print("REtrieve", self.index_path, page)
        return self.convert_items(page)

    def _get_token_page_index(self, token):
        token_hash = mmh3.hash(token, signed=False)
        return token_hash % self.num_pages
    def _get_key_page_index(self, key):
        key_hash = mmh3.hash(key, signed=False)
        return key_hash % self.num_pages

    def get_page(self, i):
        """

@@ -82,18 +88,24 @@ class TinyIndexBase:
            return None
        return json.loads(decompressed_data.decode('utf8'))

    def convert_items(self, items):
        converted = [self.item_type(*item) for item in items]
        # print("Converted", items, converted)
        return converted


class TinyIndex(TinyIndexBase):
    def __init__(self, index_path, num_pages, page_size):
        super().__init__(num_pages, page_size)
        super().__init__(Document, num_pages, page_size)
        # print("REtrieve path", index_path)
        self.index_path = index_path
        self.index_file = open(self.index_path, 'rb')
        self.mmap = mmap(self.index_file.fileno(), 0, prot=PROT_READ)


class TinyIndexer(TinyIndexBase):
    def __init__(self, index_path, num_pages, page_size):
        super().__init__(num_pages, page_size)
    def __init__(self, item_type: type, index_path: str, num_pages: int, page_size: int):
        super().__init__(item_type, num_pages, page_size)
        self.index_path = index_path
        self.compressor = ZstdCompressor()
        self.decompressor = ZstdDecompressor()

@@ -110,18 +122,24 @@ class TinyIndexer(TinyIndexBase):
        self.mmap.close()
        self.index_file.close()

    def index(self, documents: List[TokenizedDocument]):
        for document in documents:
            for token in document.tokens:
                self._index_document(document, token)
    # def index(self, documents: List[TokenizedDocument]):
    #     for document in documents:
    #         for token in document.tokens:
    #             self._index_document(document, token)

    def _index_document(self, document: Document, token: str):
        page_index = self._get_token_page_index(token)
    def index(self, key: str, value):
        print("Index", value)
        assert type(value) == self.item_type, f"Can only index the specified type" \
                                              f" ({self.item_type.__name__})"
        page_index = self._get_key_page_index(key)
        current_page = self.get_page(page_index)
        if current_page is None:
            current_page = []
        current_page.append([document.title, document.url])
        value_tuple = astuple(value)
        print("Value tuple", value_tuple)
        current_page.append(value_tuple)
        try:
            # print("Page", current_page)
            self._write_page(current_page, page_index)
        except ValueError:
            pass

@@ -145,15 +163,6 @@ class TinyIndexer(TinyIndexBase):
        with open(self.index_path, 'wb') as index_file:
            index_file.write(b'\x00' * file_length)

    def document_indexed(self, url):
        raise NotImplementedError()

    def get_num_tokens(self):
        raise NotImplementedError()

    def get_random_terms(self, n):
        raise NotImplementedError()


def prepare_url_for_tokenizing(url: str):
    if url.startswith(HTTP_START):

@@ -166,7 +175,7 @@ def prepare_url_for_tokenizing(url: str):
    return url


def get_pages(nlp, titles_and_urls):
def get_pages(nlp, titles_and_urls) -> Iterable[TokenizedDocument]:
    for i, (title_cleaned, url) in enumerate(titles_and_urls):
        title_tokens = tokenize(nlp, title_cleaned)
        prepared_url = prepare_url_for_tokenizing(unquote(url))

@@ -191,11 +200,10 @@ def index_titles_and_urls(indexer: TinyIndexer, nlp, titles_and_urls, terms_path
    terms = Counter()
    pages = get_pages(nlp, titles_and_urls)
    for chunk in grouper(BATCH_SIZE, pages):
        indexer.index(list(chunk))

        for page in chunk:
            terms.update([t.lower() for t in page.tokens])
    for page in pages:
        for token in page.tokens:
            indexer.index(token, Document(url=page.url, title=page.title))
        terms.update([t.lower() for t in page.tokens])

    term_df = pd.DataFrame({
        'term': terms.keys(),
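After this change any dataclass value can be stored: index() asserts the value matches item_type, flattens it with astuple() before writing, and retrieve() rebuilds instances through convert_items(). A hedged usage sketch against the classes in this diff (the path and page parameters are illustrative rather than the repo's real NUM_PAGES/PAGE_SIZE, and it assumes the indexer creates the index file on entry, as the performance script below suggests):

    from index import TinyIndexer, TinyIndex, Document

    INDEX_PATH = '/tmp/example.tinysearch'   # illustrative path
    NUM_PAGES = 4096                         # illustrative sizes; the real constants live in index.py
    PAGE_SIZE = 4096

    # Store Document values under arbitrary string keys (here, a token).
    with TinyIndexer(Document, INDEX_PATH, NUM_PAGES, PAGE_SIZE) as indexer:
        indexer.index('python', Document(title='Python (programming language)',
                                         url='https://en.wikipedia.org/wiki/Python_(programming_language)'))

    # Read back: the key is hashed to a page and stored tuples are converted back into Documents.
    tiny_index = TinyIndex(INDEX_PATH, NUM_PAGES, PAGE_SIZE)
    for doc in tiny_index.retrieve('python'):
        print(doc.title, doc.url)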
(fourth changed file)

@@ -3,17 +3,15 @@ Test the performance of the search in terms of compression and speed.
"""
import os
from datetime import datetime
from itertools import islice

import numpy as np
from spacy.lang.en import English
from starlette.testclient import TestClient

from app import app
import create_app
from fsqueue import ZstdJsonSerializer
from index import TinyIndexer, index_titles_and_urls
from index import TinyIndexer, index_titles_and_urls, Document, TinyIndex
from paths import TEST_INDEX_PATH, DATA_DIR, TEST_TERMS_PATH
from wiki import get_wiki_titles_and_urls

NUM_DOCUMENTS = 30000
NUM_PAGES_FOR_STATS = 10

@@ -33,7 +31,9 @@ def get_test_pages():
def query_test():
    titles_and_urls = get_test_pages()
    print(f"Got {len(titles_and_urls)} titles and URLs")
    tiny_index = TinyIndex(TEST_INDEX_PATH, TEST_NUM_PAGES, TEST_PAGE_SIZE)

    app = create_app.create(tiny_index)
    client = TestClient(app)

    start = datetime.now()

@@ -80,7 +80,7 @@ def performance_test():
        os.remove(TEST_INDEX_PATH)
    except FileNotFoundError:
        print("No test index found, creating")
    with TinyIndexer(TEST_INDEX_PATH, TEST_NUM_PAGES, TEST_PAGE_SIZE) as indexer:
    with TinyIndexer(Document, TEST_INDEX_PATH, TEST_NUM_PAGES, TEST_PAGE_SIZE) as indexer:
        titles_and_urls = get_test_pages()

        start_time = datetime.now()

@@ -106,6 +106,7 @@ def performance_test():
def print_pages(pages):
    for page in pages:
        print("Page", page)
        for title, url in page:
            print(title, url)
        print()
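query_test() now builds the app through the factory and drives it with Starlette's TestClient instead of importing a module-level app. A minimal sketch of that pattern, assuming a test index already exists at TEST_INDEX_PATH (the page constants here are placeholders for the ones the script defines):

    from starlette.testclient import TestClient

    import create_app
    from index import TinyIndex
    from paths import TEST_INDEX_PATH

    TEST_NUM_PAGES = 2500    # placeholder values; the script defines its own constants
    TEST_PAGE_SIZE = 4096

    tiny_index = TinyIndex(TEST_INDEX_PATH, TEST_NUM_PAGES, TEST_PAGE_SIZE)
    app = create_app.create(tiny_index)
    client = TestClient(app)

    response = client.get('/complete', params={'q': 'python'})
    print(response.json())   # [query, ["title — url", ...]], or [] when nothing matches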