@@ -2,24 +2,32 @@
 Create a search index
 """
 import gzip
+import json
+import os
 import sqlite3
 from dataclasses import dataclass
 from glob import glob
 from itertools import chain, count, islice
+from mmap import mmap
 from typing import List, Iterator
 from urllib.parse import unquote

 import bs4
 import justext
+import mmh3
 from spacy.lang.en import English
+from zstandard import ZstdCompressor, ZstdDecompressor, ZstdError

 from paths import CRAWL_GLOB, INDEX_PATH
 
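+# The index is one fixed-size, memory-mapped file: NUM_PAGES pages of
+# PAGE_SIZE bytes each (8192 * 512 bytes = 4 MiB)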
+NUM_PAGES = 8192
+PAGE_SIZE = 512
+
 NUM_INITIAL_TOKENS = 50

 HTTP_START = 'http://'
 HTTPS_START = 'https://'
-BATCH_SIZE = 10000
+BATCH_SIZE = 100


 def is_content_token(nlp, token):
@@ -43,91 +51,112 @@ def clean(content):


 @dataclass
-class Page:
-    tokens: List[str]
+class Document:
     url: str
     title: str


-class Indexer:
-    def __init__(self, index_path):
+@dataclass
+class TokenizedDocument(Document):
+    tokens: List[str]
+
+
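+# Hash-table-style index: a token's hash selects a page, and each page stores
+# a compressed JSON list of [title, url] entries for the tokens that map to it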
+class TinyIndexBase:
+    def __init__(self, num_pages, page_size):
+        self.num_pages = num_pages
+        self.page_size = page_size
+        self.decompressor = ZstdDecompressor()
+        self.mmap = None
+
+    def _get_page(self, i):
+        """
+        Get the page at index i, decompress and deserialise it using JSON
+        """
+        page_data = self.mmap[i * self.page_size:(i + 1) * self.page_size]
+        try:
+            decompressed_data = self.decompressor.decompress(page_data)
+        except ZstdError:
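+            # A page that has never been written is all zero bytes and fails
+            # to decompress, so treat it as empty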
+            return None
+        return json.loads(decompressed_data.decode('utf8'))
+
+
+class TinyIndex(TinyIndexBase):
+    def __init__(self, index_path, num_pages, page_size):
+        super().__init__(num_pages, page_size)
         self.index_path = index_path
+        self.index_file = None
+        self.mmap = None

-    def index(self, pages: List[Page]):
-        with sqlite3.connect(self.index_path) as con:
-            cursor = con.execute("""
-                SELECT max(id) FROM pages
-            """)
-            current_id = cursor.fetchone()[0]
-            if current_id is None:
-                first_page_id = 1
-            else:
-                first_page_id = current_id + 1
-
-            page_ids = range(first_page_id, first_page_id + len(pages))
-            urls_titles_ids = ((page.url, page.title, page_id)
-                               for page, page_id in zip(pages, page_ids))
-            con.executemany("""
-                INSERT INTO pages (url, title, id)
-                VALUES (?, ?, ?)
-            """, urls_titles_ids)
-
-            tokens = chain(*([(term, page_id) for term in page.tokens]
-                             for page, page_id in zip(pages, page_ids)))
-            con.executemany("""
-                INSERT INTO terms (term, page_id)
-                VALUES (?, ?)
-            """, tokens)
+
+class TinyIndexer(TinyIndexBase):
+    def __init__(self, index_path, num_pages, page_size):
+        super().__init__(num_pages, page_size)
+        self.index_path = index_path
+        self.compressor = ZstdCompressor()
+        self.decompressor = ZstdDecompressor()
+        self.index_file = None
+
+    def __enter__(self):
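+        # Make sure the index file exists, then memory-map it for random page access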
+        self.create_if_not_exists()
+        self.index_file = open(self.index_path, 'r+b')
+        self.mmap = mmap(self.index_file.fileno(), 0)
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.mmap.close()
+        self.index_file.close()
+
+    def index(self, documents: List[TokenizedDocument]):
+        for document in documents:
+            for token in document.tokens:
+                self._index_document(document, token)
+
+    def _index_document(self, document: Document, token: str):
+        page_index = self._get_token_page_index(token)
+        current_page = self._get_page(page_index)
+        if current_page is None:
+            current_page = []
+        current_page.append([document.title, document.url])
+        try:
+            self._write_page(current_page, page_index)
+        except ValueError:
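+            # The updated page no longer fits in page_size bytes, so the new
+            # entry is dropped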
+            pass
+
+    def _get_token_page_index(self, token):
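+        # An unsigned MurmurHash3 of the token, modulo the number of pages,
+        # selects the page for this token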
+        token_hash = mmh3.hash(token, signed=False)
+        return token_hash % self.num_pages
+
+    def _write_page(self, data, i):
+        """
+        Serialise the data using JSON, compress it and store it at index i.
+        If the data is too big, it will raise a ValueError and not store anything
+        """
+        serialised_data = json.dumps(data)
+        compressed_data = self.compressor.compress(serialised_data.encode('utf8'))
+        page_length = len(compressed_data)
+        if page_length > self.page_size:
+            raise ValueError(f"Data is too big ({page_length}) for page size ({self.page_size})")
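+        # Zero-pad so every page occupies exactly page_size bytes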
+        padding = b'\x00' * (self.page_size - page_length)
+        self.mmap[i * self.page_size:(i+1) * self.page_size] = compressed_data + padding

     def create_if_not_exists(self):
-        con = sqlite3.connect(self.index_path)
-        con.execute("""
-            CREATE TABLE IF NOT EXISTS pages (
-                id INTEGER PRIMARY KEY,
-                url TEXT UNIQUE,
-                title TEXT
-            )
-        """)
-
-        con.execute("""
-            CREATE TABLE IF NOT EXISTS terms (
-                term TEXT,
-                page_id INTEGER
-            )
-        """)
-
-        con.execute("""
-            CREATE INDEX IF NOT EXISTS term_index ON terms (term)
-        """)
-
-    def page_indexed(self, url):
-        con = sqlite3.connect(self.index_path)
-        result = con.execute("""
-            SELECT EXISTS(SELECT 1 FROM pages WHERE url=?)
-        """, (url,))
-        value = result.fetchone()[0]
-        return value == 1
+        if not os.path.isfile(self.index_path):
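+            # Pre-allocate the whole index as a zero-filled file of
+            # num_pages * page_size bytes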
+            file_length = self.num_pages * self.page_size
+            with open(self.index_path, 'wb') as index_file:
+                index_file.write(b'\x00' * file_length)
+
+    def document_indexed(self, url):
+        raise NotImplementedError()

     def get_num_tokens(self):
-        con = sqlite3.connect(self.index_path)
-        cursor = con.execute("""
-            SELECT count(*) from terms
-        """)
-        num_terms = cursor.fetchone()[0]
-        return num_terms
+        raise NotImplementedError()

     def get_random_terms(self, n):
-        con = sqlite3.connect(self.index_path)
-        cursor = con.execute("""
-            SELECT DISTINCT term FROM terms
-            ORDER BY random() LIMIT ?
-        """)
-        terms = [t[0] for t in cursor.fetchall()]
-        return terms
+        raise NotImplementedError()


 def run():
-    indexer = Indexer(INDEX_PATH)
+    indexer = TinyIndexer(INDEX_PATH, NUM_PAGES, PAGE_SIZE)
     indexer.create_if_not_exists()
     nlp = English()
     for path in glob(CRAWL_GLOB):
@@ -136,7 +165,7 @@ def run():
             url = html_file.readline().strip()
             content = html_file.read()

-        if indexer.page_indexed(url):
+        if indexer.document_indexed(url):
             print("Page exists, skipping", url)
             continue

@@ -169,7 +198,7 @@ def get_pages(nlp, titles_and_urls):
         prepared_url = prepare_url_for_tokenizing(unquote(url))
         url_tokens = tokenize(nlp, prepared_url)
         tokens = title_tokens | url_tokens
-        yield Page(list(tokens), url, title_cleaned)
+        yield TokenizedDocument(tokens=list(tokens), url=url, title=title_cleaned)

         if i % 1000 == 0:
             print("Processed", i)
@@ -183,7 +212,7 @@ def grouper(n: int, iterator: Iterator):
         yield chunk


-def index_titles_and_urls(indexer: Indexer, nlp, titles_and_urls):
+def index_titles_and_urls(indexer: TinyIndexer, nlp, titles_and_urls):
     indexer.create_if_not_exists()

     pages = get_pages(nlp, titles_and_urls)