Index using compression
parent 634e490cff
commit acc2a9194e
4 changed files with 121 additions and 92 deletions
index.py (171)
@@ -2,24 +2,32 @@
 Create a search index
 """
 import gzip
+import json
+import os
 import sqlite3
 from dataclasses import dataclass
 from glob import glob
 from itertools import chain, count, islice
+from mmap import mmap
 from typing import List, Iterator
 from urllib.parse import unquote
 
 import bs4
 import justext
+import mmh3
 from spacy.lang.en import English
+from zstandard import ZstdCompressor, ZstdDecompressor, ZstdError
 
 from paths import CRAWL_GLOB, INDEX_PATH
 
+NUM_PAGES = 8192
+PAGE_SIZE = 512
+
 NUM_INITIAL_TOKENS = 50
 
 HTTP_START = 'http://'
 HTTPS_START = 'https://'
-BATCH_SIZE = 10000
+BATCH_SIZE = 100
 
 
 def is_content_token(nlp, token):
@@ -43,91 +51,112 @@ def clean(content):
 
 
 @dataclass
-class Page:
-    tokens: List[str]
+class Document:
     url: str
     title: str
 
 
-class Indexer:
-    def __init__(self, index_path):
+@dataclass
+class TokenizedDocument(Document):
+    tokens: List[str]
+
+
+class TinyIndexBase:
+    def __init__(self, num_pages, page_size):
+        self.num_pages = num_pages
+        self.page_size = page_size
+        self.decompressor = ZstdDecompressor()
+        self.mmap = None
+
+    def _get_page(self, i):
+        """
+        Get the page at index i, decompress and deserialise it using JSON
+        """
+        page_data = self.mmap[i * self.page_size:(i + 1) * self.page_size]
+        try:
+            decompressed_data = self.decompressor.decompress(page_data)
+        except ZstdError:
+            return None
+        return json.loads(decompressed_data.decode('utf8'))
+
+
+class TinyIndex(TinyIndexBase):
+    def __init__(self, index_path, num_pages, page_size):
+        super().__init__(num_pages, page_size)
         self.index_path = index_path
+        self.index_file = None
+        self.mmap = None
 
-    def index(self, pages: List[Page]):
-        with sqlite3.connect(self.index_path) as con:
-            cursor = con.execute("""
-                SELECT max(id) FROM pages
-            """)
-            current_id = cursor.fetchone()[0]
-            if current_id is None:
-                first_page_id = 1
-            else:
-                first_page_id = current_id + 1
-
-            page_ids = range(first_page_id, first_page_id + len(pages))
-            urls_titles_ids = ((page.url, page.title, page_id)
-                               for page, page_id in zip(pages, page_ids))
-            con.executemany("""
-                INSERT INTO pages (url, title, id)
-                VALUES (?, ?, ?)
-            """, urls_titles_ids)
+class TinyIndexer(TinyIndexBase):
+    def __init__(self, index_path, num_pages, page_size):
+        super().__init__(num_pages, page_size)
+        self.index_path = index_path
+        self.compressor = ZstdCompressor()
+        self.decompressor = ZstdDecompressor()
+        self.index_file = None
 
-            tokens = chain(*([(term, page_id) for term in page.tokens]
-                             for page, page_id in zip(pages, page_ids)))
-            con.executemany("""
-                INSERT INTO terms (term, page_id)
-                VALUES (?, ?)
-            """, tokens)
+    def __enter__(self):
+        self.create_if_not_exists()
+        self.index_file = open(self.index_path, 'r+b')
+        self.mmap = mmap(self.index_file.fileno(), 0)
+        return self
 
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.mmap.close()
+        self.index_file.close()
+
+    def index(self, documents: List[TokenizedDocument]):
+        for document in documents:
+            for token in document.tokens:
+                self._index_document(document, token)
+
+    def _index_document(self, document: Document, token: str):
+        page_index = self._get_token_page_index(token)
+        current_page = self._get_page(page_index)
+        if current_page is None:
+            current_page = []
+        current_page.append([document.title, document.url])
+        try:
+            self._write_page(current_page, page_index)
+        except ValueError:
+            pass
+
+    def _get_token_page_index(self, token):
+        token_hash = mmh3.hash(token, signed=False)
+        return token_hash % self.num_pages
+
+    def _write_page(self, data, i):
+        """
+        Serialise the data using JSON, compress it and store it at index i.
+        If the data is too big, it will raise a ValueError and not store anything
+        """
+        serialised_data = json.dumps(data)
+        compressed_data = self.compressor.compress(serialised_data.encode('utf8'))
+        page_length = len(compressed_data)
+        if page_length > self.page_size:
+            raise ValueError(f"Data is too big ({page_length}) for page size ({self.page_size})")
+        padding = b'\x00' * (self.page_size - page_length)
+        self.mmap[i * self.page_size:(i+1) * self.page_size] = compressed_data + padding
 
     def create_if_not_exists(self):
-        con = sqlite3.connect(self.index_path)
-        con.execute("""
-            CREATE TABLE IF NOT EXISTS pages (
-                id INTEGER PRIMARY KEY,
-                url TEXT UNIQUE,
-                title TEXT
-            )
-        """)
+        if not os.path.isfile(self.index_path):
+            file_length = self.num_pages * self.page_size
+            with open(self.index_path, 'wb') as index_file:
+                index_file.write(b'\x00' * file_length)
 
-        con.execute("""
-            CREATE TABLE IF NOT EXISTS terms (
-                term TEXT,
-                page_id INTEGER
-            )
-        """)
-
-        con.execute("""
-            CREATE INDEX IF NOT EXISTS term_index ON terms (term)
-        """)
-
-    def page_indexed(self, url):
-        con = sqlite3.connect(self.index_path)
-        result = con.execute("""
-            SELECT EXISTS(SELECT 1 FROM pages WHERE url=?)
-        """, (url,))
-        value = result.fetchone()[0]
-        return value == 1
+    def document_indexed(self, url):
+        raise NotImplementedError()
 
     def get_num_tokens(self):
-        con = sqlite3.connect(self.index_path)
-        cursor = con.execute("""
-            SELECT count(*) from terms
-        """)
-        num_terms = cursor.fetchone()[0]
-        return num_terms
+        raise NotImplementedError()
 
     def get_random_terms(self, n):
-        con = sqlite3.connect(self.index_path)
-        cursor = con.execute("""
-            SELECT DISTINCT term FROM terms
-            ORDER BY random() LIMIT ?
-        """)
-        terms = [t[0] for t in cursor.fetchall()]
-        return terms
+        raise NotImplementedError()
 
 
 def run():
-    indexer = Indexer(INDEX_PATH)
+    indexer = TinyIndexer(INDEX_PATH, NUM_PAGES, PAGE_SIZE)
     indexer.create_if_not_exists()
     nlp = English()
     for path in glob(CRAWL_GLOB):
@@ -136,7 +165,7 @@ def run():
             url = html_file.readline().strip()
             content = html_file.read()
 
-        if indexer.page_indexed(url):
+        if indexer.document_indexed(url):
             print("Page exists, skipping", url)
            continue
 
@@ -169,7 +198,7 @@ def get_pages(nlp, titles_and_urls):
         prepared_url = prepare_url_for_tokenizing(unquote(url))
         url_tokens = tokenize(nlp, prepared_url)
         tokens = title_tokens | url_tokens
-        yield Page(list(tokens), url, title_cleaned)
+        yield TokenizedDocument(tokens=list(tokens), url=url, title=title_cleaned)
 
         if i % 1000 == 0:
             print("Processed", i)
@@ -183,7 +212,7 @@ def grouper(n: int, iterator: Iterator):
        yield chunk
 
 
-def index_titles_and_urls(indexer: Indexer, nlp, titles_and_urls):
+def index_titles_and_urls(indexer: TinyIndexer, nlp, titles_and_urls):
     indexer.create_if_not_exists()
 
     pages = get_pages(nlp, titles_and_urls)
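The core change in index.py is the move from SQLite tables to a preallocated file of fixed-size, zstd-compressed pages addressed by a token hash. Below is a minimal standalone sketch (not code from this commit) that round-trips one page to make that scheme concrete; the 2-byte length prefix is an assumption added here so the reader knows how many bytes to decompress, whereas the committed _get_page decompresses the whole padded page and treats a ZstdError as an empty page.

import json

import mmh3
from zstandard import ZstdCompressor, ZstdDecompressor

# Constants mirroring the ones added in this commit.
NUM_PAGES = 8192
PAGE_SIZE = 512

compressor = ZstdCompressor()
decompressor = ZstdDecompressor()


def token_page_index(token: str) -> int:
    # Same idea as TinyIndexer._get_token_page_index: hash the token and
    # take it modulo the number of pages to choose a fixed slot.
    return mmh3.hash(token, signed=False) % NUM_PAGES


def pack_page(items: list) -> bytes:
    # JSON-serialise, compress, then pad to PAGE_SIZE, as _write_page does.
    # The 2-byte length prefix is an addition of this sketch, not of the commit.
    compressed = compressor.compress(json.dumps(items).encode('utf8'))
    if len(compressed) + 2 > PAGE_SIZE:
        raise ValueError(f"Data too big ({len(compressed)}) for page size ({PAGE_SIZE})")
    payload = len(compressed).to_bytes(2, 'little') + compressed
    return payload + b'\x00' * (PAGE_SIZE - len(payload))


def unpack_page(page: bytes) -> list:
    length = int.from_bytes(page[:2], 'little')
    if length == 0:
        return []  # page was never written
    return json.loads(decompressor.decompress(page[2:2 + length]).decode('utf8'))


# Round-trip a tiny posting list; on disk this page would live at
# offset token_page_index(token) * PAGE_SIZE in the index file.
page = pack_page([["Example title", "https://example.com/"]])
assert len(page) == PAGE_SIZE
print(token_page_index("example"), unpack_page(page))

Zero-filled pages (the initial state of the preallocated file) decode as empty lists here; the committed code gets the same effect by catching ZstdError in _get_page and returning None.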
paths.py (4)
@@ -5,7 +5,7 @@ DATA_DIR = os.path.join(HOME, 'data', 'tinysearch')
 HN_TOP_PATH = os.path.join(DATA_DIR, 'hn-top.csv')
 CRAWL_PREFIX = 'crawl_'
 CRAWL_GLOB = os.path.join(DATA_DIR, f"{CRAWL_PREFIX}*")
-INDEX_PATH = os.path.join(DATA_DIR, 'index.sqlite3')
-TEST_INDEX_PATH = os.path.join(DATA_DIR, 'index-test.sqlite3')
+INDEX_PATH = os.path.join(DATA_DIR, 'index.tinysearch')
+TEST_INDEX_PATH = os.path.join(DATA_DIR, 'index-test.tinysearch')
 WIKI_DATA_PATH = os.path.join(DATA_DIR, 'enwiki-20210301-pages-articles1.xml-p1p41242.bz2')
 WIKI_TITLES_PATH = os.path.join(DATA_DIR, 'abstract-titles-sorted.txt.gz')
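The switch from 'index.sqlite3' to 'index.tinysearch' in paths.py matches the change in index.py: the index is no longer a SQLite database but a preallocated flat file of NUM_PAGES fixed-size pages.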
(third changed file; name not shown in this view)
@@ -10,12 +10,12 @@ from spacy.lang.en import English
 from starlette.testclient import TestClient
 
 from app import app, complete
-from index import Indexer, index_titles_and_urls
+from index import TinyIndexer, index_titles_and_urls, PAGE_SIZE, NUM_PAGES
 from paths import TEST_INDEX_PATH
 from wiki import get_wiki_titles_and_urls
 
 
-NUM_PAGES = 500
+NUM_DOCUMENTS = 10000
 
 
 def query_test():
@@ -25,7 +25,7 @@ def query_test():
 
     start = datetime.now()
     hits = 0
-    for title, url in islice(titles_and_urls, NUM_PAGES):
+    for title, url in islice(titles_and_urls, NUM_DOCUMENTS):
         result = client.get('/complete', params={'q': title})
         assert result.status_code == 200
         data = result.content.decode('utf8')
@@ -36,7 +36,7 @@ def query_test():
 
     end = datetime.now()
     print("Hits:", hits)
-    print("Query time:", (end - start).total_seconds()/NUM_PAGES)
+    print("Query time:", (end - start).total_seconds() / NUM_DOCUMENTS)
 
 
 def performance_test():
@@ -45,23 +45,23 @@ def performance_test():
         os.remove(TEST_INDEX_PATH)
     except FileNotFoundError:
         print("No test index found, creating")
-    indexer = Indexer(TEST_INDEX_PATH)
-    titles_and_urls = get_wiki_titles_and_urls()
-    titles_and_urls_slice = islice(titles_and_urls, NUM_PAGES)
+    with TinyIndexer(TEST_INDEX_PATH, NUM_PAGES, PAGE_SIZE) as indexer:
+        titles_and_urls = get_wiki_titles_and_urls()
+        titles_and_urls_slice = islice(titles_and_urls, NUM_DOCUMENTS)
 
-    start_time = datetime.now()
-    index_titles_and_urls(indexer, nlp, titles_and_urls_slice)
-    stop_time = datetime.now()
+        start_time = datetime.now()
+        index_titles_and_urls(indexer, nlp, titles_and_urls_slice)
+        stop_time = datetime.now()
 
-    index_time = (stop_time - start_time).total_seconds()
-    index_size = os.path.getsize(TEST_INDEX_PATH)
+        index_time = (stop_time - start_time).total_seconds()
+        index_size = os.path.getsize(TEST_INDEX_PATH)
 
-    print("Indexed pages:", NUM_PAGES)
+    print("Indexed pages:", NUM_DOCUMENTS)
     print("Index time:", index_time)
     print("Index size", index_size)
-    print("Num tokens", indexer.get_num_tokens())
+    # print("Num tokens", indexer.get_num_tokens())
 
-    query_test()
+    # query_test()
 
 
 if __name__ == '__main__':
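One consequence of the new format, worth keeping in mind when reading the "Index size" figure printed above: TinyIndexer.create_if_not_exists preallocates the whole file, so the size no longer grows with the number of indexed documents. A quick check using the constants from index.py:

NUM_PAGES = 8192
PAGE_SIZE = 512
print(NUM_PAGES * PAGE_SIZE)  # 4194304 bytes, i.e. 4 MiB, however many documents are indexed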
wiki.py (8)
@@ -7,7 +7,7 @@ from urllib.parse import quote
 
 from spacy.lang.en import English
 
-from index import Indexer, index_titles_and_urls
+from index import TinyIndexer, index_titles_and_urls, PAGE_SIZE, NUM_PAGES
 from paths import WIKI_TITLES_PATH, INDEX_PATH
 
 TEXT_TAGS = ['mediawiki', 'page', 'revision', 'text']
@@ -17,9 +17,9 @@ TITLE_END = '</title>\n'
 
 def index_wiki():
     nlp = English()
-    indexer = Indexer(INDEX_PATH)
-    titles_and_urls = get_wiki_titles_and_urls()
-    index_titles_and_urls(indexer, nlp, titles_and_urls)
+    with TinyIndexer(INDEX_PATH, NUM_PAGES, PAGE_SIZE) as indexer:
+        titles_and_urls = get_wiki_titles_and_urls()
+        index_titles_and_urls(indexer, nlp, titles_and_urls)
 
 
 def get_wiki_titles_and_urls():
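The commit adds the TinyIndex reader class but no retrieval method for it yet. The sketch below is an assumption about how a lookup could work against the new file format, not code from this commit; it reuses the committed helpers by mapping the index file read-only, hashing the term the same way TinyIndexer._get_token_page_index does, and reading the page with TinyIndexBase._get_page.

from mmap import mmap, ACCESS_READ

import mmh3

from index import TinyIndex, NUM_PAGES, PAGE_SIZE
from paths import INDEX_PATH


def lookup(term: str):
    # Hypothetical helper: open the preallocated index read-only and map it,
    # mirroring what TinyIndexer.__enter__ does with a read-write mapping.
    tiny_index = TinyIndex(INDEX_PATH, NUM_PAGES, PAGE_SIZE)
    with open(INDEX_PATH, 'rb') as index_file:
        tiny_index.mmap = mmap(index_file.fileno(), 0, access=ACCESS_READ)
        # Same hash-to-page scheme as TinyIndexer._get_token_page_index.
        page_index = mmh3.hash(term, signed=False) % NUM_PAGES
        page = tiny_index._get_page(page_index)
        tiny_index.mmap.close()
    # Each entry on the page is a [title, url] pair appended by _index_document.
    return page or []


if __name__ == '__main__':
    print(lookup('wikipedia'))

Because different tokens can hash to the same page, a page can hold postings for several terms, and this commit does not store the term alongside each [title, url] entry; a later retrieval layer would need to account for that.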