From c81fc83900b0aff547e36bd49679c3c3808ab6cf Mon Sep 17 00:00:00 2001
From: Daoud Clarke
Date: Sat, 5 Jun 2021 22:22:31 +0100
Subject: [PATCH] Abstract index to allow storing anything

---
 app.py         | 72 ++--------------------------------------------
 create_app.py  | 56 ++++++++++++++++++++++++++++++++++++
 index.py       | 78 ++++++++++++++++++++++++++++----------------------
 performance.py | 11 +++----
 4 files changed, 107 insertions(+), 110 deletions(-)
 create mode 100644 create_app.py

diff --git a/app.py b/app.py
index c2a6f82..49752d3 100644
--- a/app.py
+++ b/app.py
@@ -1,75 +1,7 @@
-import sqlite3
-from functools import lru_cache
-
-import Levenshtein
-from fastapi import FastAPI
-from fastapi.staticfiles import StaticFiles
-from starlette.responses import FileResponse, RedirectResponse
+import create_app
 
 from index import TinyIndex, PAGE_SIZE, NUM_PAGES
 from paths import INDEX_PATH
 
-app = FastAPI()
 tiny_index = TinyIndex(INDEX_PATH, NUM_PAGES, PAGE_SIZE)
-
-
-@app.get("/search")
-def search(s: str):
-    if '—' in s:
-        url = s.split('—')[1].strip()
-    else:
-        url = f'https://www.google.com/search?q={s}'
-    return RedirectResponse(url)
-
-
-@lru_cache()
-def complete_term(term):
-    con = sqlite3.connect(INDEX_PATH)
-    query = f"""
-      SELECT term
-      FROM terms
-      WHERE term >= ?
-      ORDER BY term
-      LIMIT 1
-    """
-    result = con.execute(query, (term,))
-    completed = result.fetchone()
-    # print("Completed", completed)
-    if len(completed) > 0:
-        return completed[0]
-    return None
-
-
-def order_results(query, results):
-    return sorted(results, key=lambda result: Levenshtein.distance(query, result[0]))
-
-
-@app.get("/complete")
-def complete(q: str):
-    terms = [x.lower() for x in q.replace('.', ' ').split()]
-
-    # completed = complete_term(terms[-1])
-    # terms = terms[:-1] + [completed]
-
-    pages = []
-    for term in terms:
-        page = tiny_index.retrieve(term)
-        if page is not None:
-            pages += [(title, url) for title, url in page if term in title.lower()]
-
-    ordered_results = order_results(q, pages)
-    results = [title.replace("\n", "") + ' — ' +
-               url.replace("\n", "") for title, url in ordered_results]
-    if len(results) == 0:
-        # print("No results")
-        return []
-    # print("Results", results)
-    return [q, results]
-
-
-@app.get('/')
-def index():
-    return FileResponse('static/index.html')
-
-
-app.mount('/', StaticFiles(directory="static"), name="static")
+app = create_app.create(tiny_index)
diff --git a/create_app.py b/create_app.py
new file mode 100644
index 0000000..0e256ce
--- /dev/null
+++ b/create_app.py
@@ -0,0 +1,56 @@
+import sqlite3
+from functools import lru_cache
+from typing import List
+
+import Levenshtein
+from fastapi import FastAPI
+from starlette.responses import RedirectResponse, FileResponse
+from starlette.staticfiles import StaticFiles
+
+from index import TinyIndex, Document
+
+
+def create(tiny_index: TinyIndex):
+    app = FastAPI()
+
+    @app.get("/search")
+    def search(s: str):
+        if '—' in s:
+            url = s.split('—')[1].strip()
+        else:
+            url = f'https://www.google.com/search?q={s}'
+        return RedirectResponse(url)
+
+    def order_results(query, results: List[Document]):
+        ordered_results = sorted(results, key=lambda result: Levenshtein.distance(query, result.title))
+        print("Order results", query, ordered_results, sep='\n')
+        return ordered_results
+
+    @app.get("/complete")
+    def complete(q: str):
+        terms = [x.lower() for x in q.replace('.', ' ').split()]
+
+        # completed = complete_term(terms[-1])
+        # terms = terms[:-1] + [completed]
+
+        pages = []
+        for term in terms:
+            items = tiny_index.retrieve(term)
+            if items is not None:
+                pages += [item for item in items if term in item.title.lower()]
+
+        ordered_results = order_results(q, pages)
+        results = [item.title.replace("\n", "") + ' — ' +
+                   item.url.replace("\n", "") for item in ordered_results]
+        if len(results) == 0:
+            # print("No results")
+            return []
+        # print("Results", results)
+        return [q, results]
+
+    @app.get('/')
+    def index():
+        return FileResponse('static/index.html')
+
+    app.mount('/', StaticFiles(directory="static"), name="static")
+    return app
diff --git a/index.py b/index.py
index d8fda0a..b1ef6ef 100644
--- a/index.py
+++ b/index.py
@@ -3,11 +3,12 @@ Create a search index
 """
 import json
 import os
+from abc import ABC, abstractmethod
 from collections import Counter
-from dataclasses import dataclass
+from dataclasses import dataclass, fields, asdict, astuple
 from itertools import islice
 from mmap import mmap, PROT_READ
-from typing import List, Iterator
+from typing import List, Iterator, TypeVar, Generic, Iterable
 from urllib.parse import unquote
 
 import justext
@@ -47,8 +48,8 @@ def clean(content):
 
 @dataclass
 class Document:
-    url: str
     title: str
+    url: str
 
 
 @dataclass
@@ -57,19 +58,24 @@ class TokenizedDocument(Document):
 
 
 class TinyIndexBase:
-    def __init__(self, num_pages, page_size):
+    def __init__(self, item_type: type, num_pages: int, page_size: int):
+        self.item_type = item_type
         self.num_pages = num_pages
         self.page_size = page_size
         self.decompressor = ZstdDecompressor()
         self.mmap = None
 
-    def retrieve(self, token):
-        index = self._get_token_page_index(token)
-        return self.get_page(index)
+    def retrieve(self, key: str):
+        index = self._get_key_page_index(key)
+        page = self.get_page(index)
+        if page is None:
+            return []
+        print("Retrieve", self.index_path, page)
+        return self.convert_items(page)
 
-    def _get_token_page_index(self, token):
-        token_hash = mmh3.hash(token, signed=False)
-        return token_hash % self.num_pages
+    def _get_key_page_index(self, key):
+        key_hash = mmh3.hash(key, signed=False)
+        return key_hash % self.num_pages
 
     def get_page(self, i):
         """
@@ -82,18 +88,24 @@
             return None
         return json.loads(decompressed_data.decode('utf8'))
 
+    def convert_items(self, items):
+        converted = [self.item_type(*item) for item in items]
+        # print("Converted", items, converted)
+        return converted
+
 
 class TinyIndex(TinyIndexBase):
     def __init__(self, index_path, num_pages, page_size):
-        super().__init__(num_pages, page_size)
+        super().__init__(Document, num_pages, page_size)
+        # print("Retrieve path", index_path)
         self.index_path = index_path
        self.index_file = open(self.index_path, 'rb')
         self.mmap = mmap(self.index_file.fileno(), 0, prot=PROT_READ)
 
 
 class TinyIndexer(TinyIndexBase):
-    def __init__(self, index_path, num_pages, page_size):
-        super().__init__(num_pages, page_size)
+    def __init__(self, item_type: type, index_path: str, num_pages: int, page_size: int):
+        super().__init__(item_type, num_pages, page_size)
         self.index_path = index_path
         self.compressor = ZstdCompressor()
         self.decompressor = ZstdDecompressor()
@@ -110,18 +122,24 @@
         self.mmap.close()
         self.index_file.close()
 
-    def index(self, documents: List[TokenizedDocument]):
-        for document in documents:
-            for token in document.tokens:
-                self._index_document(document, token)
+    # def index(self, documents: List[TokenizedDocument]):
+    #     for document in documents:
+    #         for token in document.tokens:
+    #             self._index_document(document, token)
 
-    def _index_document(self, document: Document, token: str):
-        page_index = self._get_token_page_index(token)
+    def index(self, key: str, value):
+        print("Index", value)
+        assert type(value) == self.item_type, f"Can only index the specified type" \
+                                              f" ({self.item_type.__name__})"
+        page_index = self._get_key_page_index(key)
         current_page = self.get_page(page_index)
         if current_page is None:
             current_page = []
-        current_page.append([document.title, document.url])
+        value_tuple = astuple(value)
+        print("Value tuple", value_tuple)
+        current_page.append(value_tuple)
         try:
+            # print("Page", current_page)
             self._write_page(current_page, page_index)
         except ValueError:
             pass
@@ -145,15 +163,6 @@
         with open(self.index_path, 'wb') as index_file:
             index_file.write(b'\x00' * file_length)
 
-    def document_indexed(self, url):
-        raise NotImplementedError()
-
-    def get_num_tokens(self):
-        raise NotImplementedError()
-
-    def get_random_terms(self, n):
-        raise NotImplementedError()
-
 
 def prepare_url_for_tokenizing(url: str):
     if url.startswith(HTTP_START):
@@ -166,7 +175,7 @@
     return url
 
 
-def get_pages(nlp, titles_and_urls):
+def get_pages(nlp, titles_and_urls) -> Iterable[TokenizedDocument]:
     for i, (title_cleaned, url) in enumerate(titles_and_urls):
         title_tokens = tokenize(nlp, title_cleaned)
         prepared_url = prepare_url_for_tokenizing(unquote(url))
@@ -191,11 +200,10 @@
     terms = Counter()
 
     pages = get_pages(nlp, titles_and_urls)
-    for chunk in grouper(BATCH_SIZE, pages):
-        indexer.index(list(chunk))
-
-        for page in chunk:
-            terms.update([t.lower() for t in page.tokens])
+    for page in pages:
+        for token in page.tokens:
+            indexer.index(token, Document(url=page.url, title=page.title))
+        terms.update([t.lower() for t in page.tokens])
 
     term_df = pd.DataFrame({
         'term': terms.keys(),
diff --git a/performance.py b/performance.py
index 8f0cb10..9f03b88 100644
--- a/performance.py
+++ b/performance.py
@@ -3,17 +3,15 @@ Test the performance of the search in terms of compression and speed.
 """
 import os
 from datetime import datetime
-from itertools import islice
 
 import numpy as np
 from spacy.lang.en import English
 from starlette.testclient import TestClient
 
-from app import app
+import create_app
 from fsqueue import ZstdJsonSerializer
-from index import TinyIndexer, index_titles_and_urls
+from index import TinyIndexer, index_titles_and_urls, Document, TinyIndex
 from paths import TEST_INDEX_PATH, DATA_DIR, TEST_TERMS_PATH
-from wiki import get_wiki_titles_and_urls
 
 NUM_DOCUMENTS = 30000
 NUM_PAGES_FOR_STATS = 10
@@ -33,7 +31,9 @@ def get_test_pages():
 def query_test():
     titles_and_urls = get_test_pages()
     print(f"Got {len(titles_and_urls)} titles and URLs")
+
+    tiny_index = TinyIndex(TEST_INDEX_PATH, TEST_NUM_PAGES, TEST_PAGE_SIZE)
+    app = create_app.create(tiny_index)
     client = TestClient(app)
 
     start = datetime.now()
@@ -80,7 +80,7 @@ def performance_test():
         os.remove(TEST_INDEX_PATH)
     except FileNotFoundError:
         print("No test index found, creating")
-    with TinyIndexer(TEST_INDEX_PATH, TEST_NUM_PAGES, TEST_PAGE_SIZE) as indexer:
+    with TinyIndexer(Document, TEST_INDEX_PATH, TEST_NUM_PAGES, TEST_PAGE_SIZE) as indexer:
         titles_and_urls = get_test_pages()
 
         start_time = datetime.now()
@@ -106,6 +106,7 @@ def performance_test():
 
 def print_pages(pages):
     for page in pages:
+        print("Page", page)
        for title, url in page:
             print(title, url)
         print()
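Taken together, the patch decouples the on-disk format from the Document type: TinyIndexer now accepts an item_type dataclass at construction, index() stores each value as astuple(value) inside the zstd-compressed JSON page that the key hashes to, and convert_items() rebuilds typed items on retrieval. TinyIndex itself still pins the type to Document, so the web app's behaviour is unchanged.

Below is a minimal sketch of what the abstraction enables: indexing something other than documents. The UrlScore dataclass, the file path, and the sizing constants are illustrative assumptions, not part of the commit:

    from dataclasses import dataclass

    from index import TinyIndexer


    @dataclass
    class UrlScore:  # hypothetical item type, not in the repo
        url: str
        score: float


    NUM_PAGES = 25600  # assumed sizing, not taken from the commit
    PAGE_SIZE = 4096

    # TinyIndexer creates/opens the index file inside its context manager,
    # the same way performance.py uses it with Document.
    with TinyIndexer(UrlScore, 'data/url-scores.tinysearch', NUM_PAGES, PAGE_SIZE) as indexer:
        # index() asserts the value matches the declared item_type, then
        # appends astuple(value) to the JSON page selected by hashing the key.
        indexer.index('python', UrlScore(url='https://python.org', score=21.3))

        # retrieve() hashes the key back to the same page and rebuilds
        # typed items via convert_items().
        print(indexer.retrieve('python'))
        # [UrlScore(url='https://python.org', score=21.3)]

The one constraint this design imposes is that astuple(value) must survive a JSON round trip, so flat dataclasses of strings and numbers work as-is, while nested or binary fields would need their own encoding.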