Index using compression

Daoud Clarke 2021-04-12 18:37:33 +01:00
parent 634e490cff
commit acc2a9194e
4 changed files with 121 additions and 92 deletions

index.py

@@ -2,24 +2,32 @@
Create a search index
"""
import gzip
import json
import os
import sqlite3
from dataclasses import dataclass
from glob import glob
from itertools import chain, count, islice
from mmap import mmap
from typing import List, Iterator
from urllib.parse import unquote
import bs4
import justext
import mmh3
from spacy.lang.en import English
from zstandard import ZstdCompressor, ZstdDecompressor, ZstdError
from paths import CRAWL_GLOB, INDEX_PATH
NUM_PAGES = 8192
PAGE_SIZE = 512
NUM_INITIAL_TOKENS = 50
HTTP_START = 'http://'
HTTPS_START = 'https://'
BATCH_SIZE = 10000
BATCH_SIZE = 100
def is_content_token(nlp, token):
@@ -43,91 +51,112 @@ def clean(content):
@dataclass
class Page:
tokens: List[str]
class Document:
url: str
title: str
class Indexer:
def __init__(self, index_path):
@dataclass
class TokenizedDocument(Document):
tokens: List[str]
class TinyIndexBase:
def __init__(self, num_pages, page_size):
self.num_pages = num_pages
self.page_size = page_size
self.decompressor = ZstdDecompressor()
self.mmap = None
def _get_page(self, i):
"""
Get the page at index i, decompress and deserialise it using JSON
"""
page_data = self.mmap[i * self.page_size:(i + 1) * self.page_size]
try:
decompressed_data = self.decompressor.decompress(page_data)
except ZstdError:
return None
return json.loads(decompressed_data.decode('utf8'))
class TinyIndex(TinyIndexBase):
def __init__(self, index_path, num_pages, page_size):
super().__init__(num_pages, page_size)
self.index_path = index_path
self.index_file = None
self.mmap = None
def index(self, pages: List[Page]):
with sqlite3.connect(self.index_path) as con:
cursor = con.execute("""
SELECT max(id) FROM pages
""")
current_id = cursor.fetchone()[0]
if current_id is None:
first_page_id = 1
else:
first_page_id = current_id + 1
page_ids = range(first_page_id, first_page_id + len(pages))
urls_titles_ids = ((page.url, page.title, page_id)
for page, page_id in zip(pages, page_ids))
con.executemany("""
INSERT INTO pages (url, title, id)
VALUES (?, ?, ?)
""", urls_titles_ids)
class TinyIndexer(TinyIndexBase):
def __init__(self, index_path, num_pages, page_size):
super().__init__(num_pages, page_size)
self.index_path = index_path
self.compressor = ZstdCompressor()
self.decompressor = ZstdDecompressor()
self.index_file = None
tokens = chain(*([(term, page_id) for term in page.tokens]
for page, page_id in zip(pages, page_ids)))
con.executemany("""
INSERT INTO terms (term, page_id)
VALUES (?, ?)
""", tokens)
def __enter__(self):
self.create_if_not_exists()
self.index_file = open(self.index_path, 'r+b')
self.mmap = mmap(self.index_file.fileno(), 0)
return self
def __exit__(self, exc_type, exc_val, exc_tb):
self.mmap.close()
self.index_file.close()
def index(self, documents: List[TokenizedDocument]):
for document in documents:
for token in document.tokens:
self._index_document(document, token)
def _index_document(self, document: Document, token: str):
page_index = self._get_token_page_index(token)
current_page = self._get_page(page_index)
if current_page is None:
current_page = []
current_page.append([document.title, document.url])
try:
self._write_page(current_page, page_index)
except ValueError:
pass
def _get_token_page_index(self, token):
token_hash = mmh3.hash(token, signed=False)
return token_hash % self.num_pages
def _write_page(self, data, i):
"""
Serialise the data using JSON, compress it and store it at index i.
If the data is too big, it will raise a ValueError and not store anything
"""
serialised_data = json.dumps(data)
compressed_data = self.compressor.compress(serialised_data.encode('utf8'))
page_length = len(compressed_data)
if page_length > self.page_size:
raise ValueError(f"Data is too big ({page_length}) for page size ({self.page_size})")
padding = b'\x00' * (self.page_size - page_length)
self.mmap[i * self.page_size:(i+1) * self.page_size] = compressed_data + padding
def create_if_not_exists(self):
con = sqlite3.connect(self.index_path)
con.execute("""
CREATE TABLE IF NOT EXISTS pages (
id INTEGER PRIMARY KEY,
url TEXT UNIQUE,
title TEXT
)
""")
if not os.path.isfile(self.index_path):
file_length = self.num_pages * self.page_size
with open(self.index_path, 'wb') as index_file:
index_file.write(b'\x00' * file_length)
con.execute("""
CREATE TABLE IF NOT EXISTS terms (
term TEXT,
page_id INTEGER
)
""")
con.execute("""
CREATE INDEX IF NOT EXISTS term_index ON terms (term)
""")
def page_indexed(self, url):
con = sqlite3.connect(self.index_path)
result = con.execute("""
SELECT EXISTS(SELECT 1 FROM pages WHERE url=?)
""", (url,))
value = result.fetchone()[0]
return value == 1
def document_indexed(self, url):
raise NotImplementedError()
def get_num_tokens(self):
con = sqlite3.connect(self.index_path)
cursor = con.execute("""
SELECT count(*) from terms
""")
num_terms = cursor.fetchone()[0]
return num_terms
raise NotImplementedError()
def get_random_terms(self, n):
con = sqlite3.connect(self.index_path)
cursor = con.execute("""
SELECT DISTINCT term FROM terms
ORDER BY random() LIMIT ?
""")
terms = [t[0] for t in cursor.fetchall()]
return terms
raise NotImplementedError()
def run():
indexer = Indexer(INDEX_PATH)
indexer = TinyIndexer(INDEX_PATH, NUM_PAGES, PAGE_SIZE)
indexer.create_if_not_exists()
nlp = English()
for path in glob(CRAWL_GLOB):
@@ -136,7 +165,7 @@ def run():
url = html_file.readline().strip()
content = html_file.read()
if indexer.page_indexed(url):
if indexer.document_indexed(url):
print("Page exists, skipping", url)
continue
@@ -169,7 +198,7 @@ def get_pages(nlp, titles_and_urls):
prepared_url = prepare_url_for_tokenizing(unquote(url))
url_tokens = tokenize(nlp, prepared_url)
tokens = title_tokens | url_tokens
yield Page(list(tokens), url, title_cleaned)
yield TokenizedDocument(tokens=list(tokens), url=url, title=title_cleaned)
if i % 1000 == 0:
print("Processed", i)
@@ -183,7 +212,7 @@ def grouper(n: int, iterator: Iterator):
yield chunk
def index_titles_and_urls(indexer: Indexer, nlp, titles_and_urls):
def index_titles_and_urls(indexer: TinyIndexer, nlp, titles_and_urls):
indexer.create_if_not_exists()
pages = get_pages(nlp, titles_and_urls)
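
The heart of this change is the fixed-size page format used by TinyIndexBase._get_page and TinyIndexer._write_page: each term hashes to one of NUM_PAGES slots, and each slot holds a zstd-compressed JSON list padded to PAGE_SIZE bytes. The following is a minimal sketch of that round trip, mirroring the code above but using an in-memory bytearray in place of the mmap'd index file; the write_page/read_page names and the sample entry are illustrative only.

import json

import mmh3
from zstandard import ZstdCompressor, ZstdDecompressor, ZstdError

NUM_PAGES = 8192
PAGE_SIZE = 512

# Stand-in for the mmap'd index file: NUM_PAGES fixed-size slots, initially zeroed.
buffer = bytearray(NUM_PAGES * PAGE_SIZE)
compressor = ZstdCompressor()
decompressor = ZstdDecompressor()


def page_index_for(token: str) -> int:
    # Same hashing scheme as _get_token_page_index.
    return mmh3.hash(token, signed=False) % NUM_PAGES


def write_page(data: list, i: int):
    # JSON-serialise, compress, and pad to exactly PAGE_SIZE bytes, as in _write_page.
    compressed = compressor.compress(json.dumps(data).encode('utf8'))
    if len(compressed) > PAGE_SIZE:
        raise ValueError(f"Data is too big ({len(compressed)}) for page size ({PAGE_SIZE})")
    buffer[i * PAGE_SIZE:(i + 1) * PAGE_SIZE] = compressed + b'\x00' * (PAGE_SIZE - len(compressed))


def read_page(i: int):
    # As in _get_page: a never-written (all-zero) slot fails to decompress and yields None.
    page_data = buffer[i * PAGE_SIZE:(i + 1) * PAGE_SIZE]
    try:
        return json.loads(decompressor.decompress(page_data).decode('utf8'))
    except ZstdError:
        return None


i = page_index_for("search")
page = read_page(i) or []
page.append(["Example title", "https://example.com"])
write_page(page, i)
print(read_page(i))  # -> [['Example title', 'https://example.com']]

As in _index_document, appending to a page means read-modify-write of the whole slot, and a page that grows beyond PAGE_SIZE after compression raises ValueError rather than being stored.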

paths.py

@@ -5,7 +5,7 @@ DATA_DIR = os.path.join(HOME, 'data', 'tinysearch')
HN_TOP_PATH = os.path.join(DATA_DIR, 'hn-top.csv')
CRAWL_PREFIX = 'crawl_'
CRAWL_GLOB = os.path.join(DATA_DIR, f"{CRAWL_PREFIX}*")
INDEX_PATH = os.path.join(DATA_DIR, 'index.sqlite3')
TEST_INDEX_PATH = os.path.join(DATA_DIR, 'index-test.sqlite3')
INDEX_PATH = os.path.join(DATA_DIR, 'index.tinysearch')
TEST_INDEX_PATH = os.path.join(DATA_DIR, 'index-test.tinysearch')
WIKI_DATA_PATH = os.path.join(DATA_DIR, 'enwiki-20210301-pages-articles1.xml-p1p41242.bz2')
WIKI_TITLES_PATH = os.path.join(DATA_DIR, 'abstract-titles-sorted.txt.gz')

(performance test script)

@@ -10,12 +10,12 @@ from spacy.lang.en import English
from starlette.testclient import TestClient
from app import app, complete
from index import Indexer, index_titles_and_urls
from index import TinyIndexer, index_titles_and_urls, PAGE_SIZE, NUM_PAGES
from paths import TEST_INDEX_PATH
from wiki import get_wiki_titles_and_urls
NUM_PAGES = 500
NUM_DOCUMENTS = 10000
def query_test():
@@ -25,7 +25,7 @@ def query_test():
start = datetime.now()
hits = 0
for title, url in islice(titles_and_urls, NUM_PAGES):
for title, url in islice(titles_and_urls, NUM_DOCUMENTS):
result = client.get('/complete', params={'q': title})
assert result.status_code == 200
data = result.content.decode('utf8')
@@ -36,7 +36,7 @@ def query_test():
end = datetime.now()
print("Hits:", hits)
print("Query time:", (end - start).total_seconds()/NUM_PAGES)
print("Query time:", (end - start).total_seconds() / NUM_DOCUMENTS)
def performance_test():
@@ -45,23 +45,23 @@ def performance_test():
os.remove(TEST_INDEX_PATH)
except FileNotFoundError:
print("No test index found, creating")
indexer = Indexer(TEST_INDEX_PATH)
titles_and_urls = get_wiki_titles_and_urls()
titles_and_urls_slice = islice(titles_and_urls, NUM_PAGES)
with TinyIndexer(TEST_INDEX_PATH, NUM_PAGES, PAGE_SIZE) as indexer:
titles_and_urls = get_wiki_titles_and_urls()
titles_and_urls_slice = islice(titles_and_urls, NUM_DOCUMENTS)
start_time = datetime.now()
index_titles_and_urls(indexer, nlp, titles_and_urls_slice)
stop_time = datetime.now()
start_time = datetime.now()
index_titles_and_urls(indexer, nlp, titles_and_urls_slice)
stop_time = datetime.now()
index_time = (stop_time - start_time).total_seconds()
index_size = os.path.getsize(TEST_INDEX_PATH)
index_time = (stop_time - start_time).total_seconds()
index_size = os.path.getsize(TEST_INDEX_PATH)
print("Indexed pages:", NUM_PAGES)
print("Indexed pages:", NUM_DOCUMENTS)
print("Index time:", index_time)
print("Index size", index_size)
print("Num tokens", indexer.get_num_tokens())
# print("Num tokens", indexer.get_num_tokens())
query_test()
# query_test()
if __name__ == '__main__':

wiki.py

@@ -7,7 +7,7 @@ from urllib.parse import quote
from spacy.lang.en import English
from index import Indexer, index_titles_and_urls
from index import TinyIndexer, index_titles_and_urls, PAGE_SIZE, NUM_PAGES
from paths import WIKI_TITLES_PATH, INDEX_PATH
TEXT_TAGS = ['mediawiki', 'page', 'revision', 'text']
@@ -17,9 +17,9 @@ TITLE_END = '</title>\n'
def index_wiki():
nlp = English()
indexer = Indexer(INDEX_PATH)
titles_and_urls = get_wiki_titles_and_urls()
index_titles_and_urls(indexer, nlp, titles_and_urls)
with TinyIndexer(INDEX_PATH, NUM_PAGES, PAGE_SIZE) as indexer:
titles_and_urls = get_wiki_titles_and_urls()
index_titles_and_urls(indexer, nlp, titles_and_urls)
def get_wiki_titles_and_urls():
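
For reference, the new indexing flow can be exercised end to end as below. This is a minimal sketch assuming index.py as changed in this commit is importable; the throwaway index path, document, and hand-built token list are illustrative only (the real pipeline derives tokens with spaCy via get_pages), and the lookup pokes at the internal _get_token_page_index/_get_page helpers because this commit does not yet add a public query method.

import os

from index import TinyIndexer, TokenizedDocument, NUM_PAGES, PAGE_SIZE

DEMO_INDEX_PATH = 'index-demo.tinysearch'  # throwaway path, not the real paths.INDEX_PATH

doc = TokenizedDocument(
    url='https://en.wikipedia.org/wiki/Zstandard',   # illustrative document
    title='Zstandard',
    tokens=['zstandard', 'compression'],             # normally produced by tokenize()
)

# __enter__ creates the zero-filled, fixed-size index file if needed and mmaps it;
# __exit__ closes the mmap and the file handle.
with TinyIndexer(DEMO_INDEX_PATH, NUM_PAGES, PAGE_SIZE) as indexer:
    indexer.index([doc])

    # Look the document back up by one of its tokens: hash the token to a page
    # and decompress that page (internal helpers, shown here for illustration).
    page = indexer._get_page(indexer._get_token_page_index('zstandard'))
    print(page)  # -> [['Zstandard', 'https://en.wikipedia.org/wiki/Zstandard']]

os.remove(DEMO_INDEX_PATH)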