Speed up inserts
This commit is contained in:
parent 14f820ff37
commit 3859b85fc8

2 changed files with 58 additions and 16 deletions

index.py (71 changed lines)
@@ -3,7 +3,10 @@ Create a search index
 """
 import gzip
 import sqlite3
+from dataclasses import dataclass
 from glob import glob
+from itertools import chain, count, islice
+from typing import List, Iterator
 from urllib.parse import unquote
 
 import bs4
@@ -16,6 +19,7 @@ NUM_INITIAL_TOKENS = 50
 
 HTTP_START = 'http://'
 HTTPS_START = 'https://'
+BATCH_SIZE = 10000
 
 
 def is_content_token(nlp, token):
@@ -38,26 +42,42 @@ def clean(content):
     return cleaned_text
 
 
+@dataclass
+class Page:
+    tokens: List[str]
+    url: str
+    title: str
+
+
 class Indexer:
     def __init__(self, index_path):
         self.index_path = index_path
 
-    def index(self, tokens, url, title):
+    def index(self, pages: List[Page]):
         with sqlite3.connect(self.index_path) as con:
-            con.execute("""
-                INSERT INTO pages (url, title)
-                VALUES (?, ?)
-            """, (url, title))
+            cursor = con.execute("""
+                SELECT max(id) FROM pages
+            """)
+            current_id = cursor.fetchone()[0]
+            if current_id is None:
+                first_page_id = 1
+            else:
+                first_page_id = current_id + 1
 
-            result = con.execute("""
-                SELECT last_insert_rowid()
-            """)
-            page_id = result.fetchone()[0]
+            page_ids = range(first_page_id, first_page_id + len(pages))
+            urls_titles_ids = ((page.url, page.title, page_id)
+                               for page, page_id in zip(pages, page_ids))
+            con.executemany("""
+                INSERT INTO pages (url, title, id)
+                VALUES (?, ?, ?)
+            """, urls_titles_ids)
+
+            tokens = chain(*([(term, page_id) for term in page.tokens]
+                             for page, page_id in zip(pages, page_ids)))
             con.executemany("""
                 INSERT INTO terms (term, page_id)
                 VALUES (?, ?)
-            """, [(term, page_id) for term in tokens])
+            """, tokens)
 
     def create_if_not_exists(self):
         con = sqlite3.connect(self.index_path)
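The heart of the speed-up is the new index(): instead of one INSERT per page followed by a SELECT last_insert_rowid() round trip, ids for the whole batch are allocated up front from max(id), and each table is filled with a single executemany() call. Below is a minimal standalone sketch of the same pattern; the CREATE TABLE statements are assumed for illustration, since the real schema is set up in create_if_not_exists(), which this diff does not show in full.

import sqlite3
from itertools import chain

con = sqlite3.connect(':memory:')
# Schema assumed for this sketch only.
con.execute('CREATE TABLE pages (id INTEGER PRIMARY KEY, url TEXT, title TEXT)')
con.execute('CREATE TABLE terms (term TEXT, page_id INTEGER)')

# A hypothetical batch of already-tokenized pages.
batch = [('https://en.wikipedia.org/wiki/Apple', 'Apple', ['apple', 'fruit']),
         ('https://en.wikipedia.org/wiki/Banana', 'Banana', ['banana', 'fruit'])]

# Allocate ids for the whole batch up front, replacing one
# INSERT + SELECT last_insert_rowid() round trip per page.
current_id = con.execute('SELECT max(id) FROM pages').fetchone()[0]
first_page_id = 1 if current_id is None else current_id + 1
page_ids = range(first_page_id, first_page_id + len(batch))

# One executemany() per table instead of one execute() per row.
con.executemany('INSERT INTO pages (url, title, id) VALUES (?, ?, ?)',
                ((url, title, page_id)
                 for (url, title, _), page_id in zip(batch, page_ids)))
con.executemany('INSERT INTO terms (term, page_id) VALUES (?, ?)',
                chain(*([(term, page_id) for term in tokens]
                        for (_, _, tokens), page_id in zip(batch, page_ids))))
con.commit()
print(con.execute('SELECT count(*) FROM terms').fetchone()[0])  # prints 4

Note that executemany() accepts any iterable of parameter tuples, so the generators above are never materialised as full lists.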
@@ -88,6 +108,14 @@ class Indexer:
         value = result.fetchone()[0]
         return value == 1
 
+    def get_num_tokens(self):
+        con = sqlite3.connect(self.index_path)
+        cursor = con.execute("""
+            SELECT count(*) from terms
+        """)
+        num_terms = cursor.fetchone()[0]
+        return num_terms
+
 
 def run():
     indexer = Indexer(INDEX_PATH)
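One detail worth noting: get_num_tokens() counts rows in terms, i.e. (term, page_id) postings, so a page contributing ten tokens adds ten to the count. A hypothetical variant for contrast (not part of this commit), counting distinct terms instead:

import sqlite3

con = sqlite3.connect('index.sqlite')  # path illustrative
postings = con.execute('SELECT count(*) FROM terms').fetchone()[0]
distinct_terms = con.execute('SELECT count(DISTINCT term) FROM terms').fetchone()[0]
print(postings, distinct_terms)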
@@ -126,20 +154,33 @@ def prepare_url_for_tokenizing(url: str):
     return url
 
 
-def index_titles_and_urls(indexer, nlp, titles_and_urls):
-    indexer.create_if_not_exists()
+def get_pages(nlp, titles_and_urls):
     for i, (title_cleaned, url) in enumerate(titles_and_urls):
         title_tokens = tokenize(nlp, title_cleaned)
         prepared_url = prepare_url_for_tokenizing(unquote(url))
         url_tokens = tokenize(nlp, prepared_url)
         tokens = title_tokens | url_tokens
 
         if len(title_tokens) > 0:
-            indexer.index(tokens, url, title_cleaned)
+            yield Page(list(tokens), url, title_cleaned)
 
         if i % 1000 == 0:
             print("Processed", i)
 
 
+def grouper(n: int, iterator: Iterator):
+    while True:
+        chunk = tuple(islice(iterator, n))
+        if not chunk:
+            return
+        yield chunk
+
+
+def index_titles_and_urls(indexer: Indexer, nlp, titles_and_urls):
+    indexer.create_if_not_exists()
+
+    pages = get_pages(nlp, titles_and_urls)
+    for chunk in grouper(BATCH_SIZE, pages):
+        indexer.index(list(chunk))
+
 
 if __name__ == '__main__':
     run()
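grouper() is a lazy chunker: it pulls at most n items at a time from the page stream, so the whole stream never has to sit in memory. A quick illustration of its behaviour, using the function exactly as defined above:

from itertools import islice
from typing import Iterator

def grouper(n: int, iterator: Iterator):
    while True:
        chunk = tuple(islice(iterator, n))
        if not chunk:
            return
        yield chunk

# Each islice() call resumes where the previous chunk stopped.
print(list(grouper(3, iter(range(8)))))
# [(0, 1, 2), (3, 4, 5), (6, 7)]

The iter() call matters: islice() consumes an iterator in place, so passing a reusable iterable such as a list or a bare range would yield the first chunk forever. In index_titles_and_urls() this is safe because get_pages() is a generator, which is its own iterator.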
@@ -20,7 +20,7 @@ def performance_test():
     print("No test index found, creating")
     indexer = Indexer(TEST_INDEX_PATH)
     titles_and_urls = get_wiki_titles_and_urls()
-    titles_and_urls_slice = islice(titles_and_urls, 1000)
+    titles_and_urls_slice = islice(titles_and_urls, 50000)
 
     start_time = datetime.now()
     index_titles_and_urls(indexer, nlp, titles_and_urls_slice)

@@ -31,6 +31,7 @@ def performance_test():
 
     print("Index time:", index_time)
     print("Index size", index_size)
+    print("Num tokens", indexer.get_num_tokens())
 
 
 if __name__ == '__main__':