Improve indexing; measure performance

Daoud Clarke 2021-03-23 22:03:48 +00:00
parent 0c5bc061ae
commit 14f820ff37
4 changed files with 139 additions and 65 deletions

126
index.py

@@ -4,6 +4,7 @@ Create a search index
 import gzip
 import sqlite3
 from glob import glob
+from urllib.parse import unquote
 
 import bs4
 import justext
@@ -13,10 +14,13 @@ from paths import CRAWL_GLOB, INDEX_PATH
 NUM_INITIAL_TOKENS = 50
 
+HTTP_START = 'http://'
+HTTPS_START = 'https://'
+
 
 def is_content_token(nlp, token):
     lexeme = nlp.vocab[token.orth]
-    return lexeme.is_alpha and not token.is_stop
+    return (lexeme.is_alpha or lexeme.is_digit) and not token.is_stop
 
 
 def tokenize(nlp, cleaned_text):
@@ -34,58 +38,60 @@ def clean(content):
     return cleaned_text
 
 
-def index(tokens, url, title):
-    with sqlite3.connect(INDEX_PATH) as con:
-        con.execute("""
-            INSERT INTO pages (url, title)
-            VALUES (?, ?)
-        """, (url, title))
-
-        result = con.execute("""
-            SELECT last_insert_rowid()
-        """)
-        page_id = result.fetchone()[0]
-        print("Created page with id", page_id)
-
-        con.executemany("""
-            INSERT INTO terms (term, page_id)
-            VALUES (?, ?)
-        """, [(term, page_id) for term in tokens])
-
-
-def create_if_not_exists():
-    con = sqlite3.connect(INDEX_PATH)
-    con.execute("""
-        CREATE TABLE IF NOT EXISTS pages (
-            id INTEGER PRIMARY KEY,
-            url TEXT UNIQUE,
-            title TEXT
-        )
-    """)
-
-    con.execute("""
-        CREATE TABLE IF NOT EXISTS terms (
-            term TEXT,
-            page_id INTEGER
-        )
-    """)
-
-    con.execute("""
-        CREATE INDEX IF NOT EXISTS term_index ON terms (term)
-    """)
-
-
-def page_indexed(url):
-    con = sqlite3.connect(INDEX_PATH)
-    result = con.execute("""
-        SELECT EXISTS(SELECT 1 FROM pages WHERE url=?)
-    """, (url,))
-    value = result.fetchone()[0]
-    return value == 1
+class Indexer:
+    def __init__(self, index_path):
+        self.index_path = index_path
+
+    def index(self, tokens, url, title):
+        with sqlite3.connect(self.index_path) as con:
+            con.execute("""
+                INSERT INTO pages (url, title)
+                VALUES (?, ?)
+            """, (url, title))
+
+            result = con.execute("""
+                SELECT last_insert_rowid()
+            """)
+            page_id = result.fetchone()[0]
+
+            con.executemany("""
+                INSERT INTO terms (term, page_id)
+                VALUES (?, ?)
+            """, [(term, page_id) for term in tokens])
+
+    def create_if_not_exists(self):
+        con = sqlite3.connect(self.index_path)
+        con.execute("""
+            CREATE TABLE IF NOT EXISTS pages (
+                id INTEGER PRIMARY KEY,
+                url TEXT UNIQUE,
+                title TEXT
+            )
+        """)
+
+        con.execute("""
+            CREATE TABLE IF NOT EXISTS terms (
+                term TEXT,
+                page_id INTEGER
+            )
+        """)
+
+        con.execute("""
+            CREATE INDEX IF NOT EXISTS term_index ON terms (term)
+        """)
+
+    def page_indexed(self, url):
+        con = sqlite3.connect(self.index_path)
+        result = con.execute("""
+            SELECT EXISTS(SELECT 1 FROM pages WHERE url=?)
+        """, (url,))
+        value = result.fetchone()[0]
+        return value == 1
 
 
 def run():
-    create_if_not_exists()
+    indexer = Indexer(INDEX_PATH)
+    indexer.create_if_not_exists()
     nlp = English()
     for path in glob(CRAWL_GLOB):
         print("Path", path)
@@ -93,7 +99,7 @@ def run():
             url = html_file.readline().strip()
             content = html_file.read()
 
-        if page_indexed(url):
+        if indexer.page_indexed(url):
             print("Page exists, skipping", url)
             continue
@@ -106,7 +112,33 @@ def run():
         print("URL", url)
         print("Tokens", tokens)
         print("Title", title)
-        index(tokens, url, title)
+        indexer.index(tokens, url, title)
+
+
+def prepare_url_for_tokenizing(url: str):
+    if url.startswith(HTTP_START):
+        url = url[len(HTTP_START):]
+    elif url.startswith(HTTPS_START):
+        url = url[len(HTTPS_START):]
+    for c in '/._':
+        if c in url:
+            url = url.replace(c, ' ')
+    return url
+
+
+def index_titles_and_urls(indexer, nlp, titles_and_urls):
+    indexer.create_if_not_exists()
+    for i, (title_cleaned, url) in enumerate(titles_and_urls):
+        title_tokens = tokenize(nlp, title_cleaned)
+        prepared_url = prepare_url_for_tokenizing(unquote(url))
+        url_tokens = tokenize(nlp, prepared_url)
+        tokens = title_tokens | url_tokens
+        if len(title_tokens) > 0:
+            indexer.index(tokens, url, title_cleaned)
+        if i % 1000 == 0:
+            print("Processed", i)
 
 
 if __name__ == '__main__':
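The change above folds the module-level index(), create_if_not_exists() and page_indexed() functions into an Indexer class constructed with an index path, so the same code can write either the real index or a temporary test index. A minimal sketch of how the refactored API is driven; the database path, URL and sample text below are hypothetical and not part of the commit:

from spacy.lang.en import English

from index import Indexer, tokenize

nlp = English()
indexer = Indexer('example-index.sqlite3')  # hypothetical path; run() uses INDEX_PATH
indexer.create_if_not_exists()              # creates the pages and terms tables plus the term index

url = 'https://example.com/'                # hypothetical page
if not indexer.page_indexed(url):
    tokens = tokenize(nlp, 'Example Domain, reserved for use in documentation')
    indexer.index(tokens, url, 'Example Domain')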

3
paths.py

@@ -6,5 +6,6 @@ HN_TOP_PATH = os.path.join(DATA_DIR, 'hn-top.csv')
 CRAWL_PREFIX = 'crawl_'
 CRAWL_GLOB = os.path.join(DATA_DIR, f"{CRAWL_PREFIX}*")
 INDEX_PATH = os.path.join(DATA_DIR, 'index.sqlite3')
+TEST_INDEX_PATH = os.path.join(DATA_DIR, 'index-test.sqlite3')
 WIKI_DATA_PATH = os.path.join(DATA_DIR, 'enwiki-20210301-pages-articles1.xml-p1p41242.bz2')
-WIKI_TITLES_PATH = os.path.join(DATA_DIR, 'enwiki-20210301-all-titles-in-ns0.gz')
+WIKI_TITLES_PATH = os.path.join(DATA_DIR, 'abstract-titles-sorted.txt.gz')

37
performance.py Normal file

@@ -0,0 +1,37 @@
+"""
+Test the performance of the search in terms of compression and speed.
+"""
+import os
+from datetime import datetime
+from itertools import islice
+
+from spacy.lang.en import English
+
+from index import Indexer, index_titles_and_urls
+from paths import TEST_INDEX_PATH
+from wiki import get_wiki_titles_and_urls
+
+
+def performance_test():
+    nlp = English()
+    try:
+        os.remove(TEST_INDEX_PATH)
+    except FileNotFoundError:
+        print("No test index found, creating")
+    indexer = Indexer(TEST_INDEX_PATH)
+    titles_and_urls = get_wiki_titles_and_urls()
+    titles_and_urls_slice = islice(titles_and_urls, 1000)
+
+    start_time = datetime.now()
+    index_titles_and_urls(indexer, nlp, titles_and_urls_slice)
+    stop_time = datetime.now()
+
+    index_time = (stop_time - start_time).total_seconds()
+    index_size = os.path.getsize(TEST_INDEX_PATH)
+
+    print("Index time:", index_time)
+    print("Index size", index_size)
+
+
+if __name__ == '__main__':
+    performance_test()
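performance.py times index construction over a 1000-title slice and reports the resulting file size, but does not yet measure lookup speed. A sketch of how a single-term lookup against the freshly built test index could be timed, reusing the pages/terms schema created by Indexer.create_if_not_exists(); the helper name and sample term are made up for illustration, and it only makes sense after performance_test() has populated TEST_INDEX_PATH:

import sqlite3
from datetime import datetime

from paths import TEST_INDEX_PATH


def time_term_lookup(term):
    # Hypothetical helper: time one term lookup against the test index.
    con = sqlite3.connect(TEST_INDEX_PATH)
    start = datetime.now()
    results = con.execute("""
        SELECT pages.url, pages.title
        FROM terms INNER JOIN pages ON terms.page_id = pages.id
        WHERE terms.term = ?
    """, (term,)).fetchall()
    elapsed = (datetime.now() - start).total_seconds()
    print("Found", len(results), "pages in", elapsed, "seconds")
    return elapsed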

38
wiki.py

@@ -1,35 +1,39 @@
 """
 Index Wikipedia
 """
-import bz2
 import gzip
-from xml.etree.ElementTree import XMLParser
+import html
+from urllib.parse import quote
 
-from mediawiki_parser import preprocessor, text
 from spacy.lang.en import English
 
-from index import tokenize, index
-from paths import WIKI_DATA_PATH, WIKI_TITLES_PATH
+from index import Indexer, index_titles_and_urls
+from paths import WIKI_TITLES_PATH, INDEX_PATH
 
-TEXT_TAGS = ['mediawiki', 'page', 'revision', 'text']
+TITLE_START = '<title>Wikipedia: '
+TITLE_END = '</title>\n'
 
 
 def index_wiki():
     nlp = English()
-    indexed = 0
+    indexer = Indexer(INDEX_PATH)
+    titles_and_urls = get_wiki_titles_and_urls()
+    index_titles_and_urls(indexer, nlp, titles_and_urls)
+
+
+def get_wiki_titles_and_urls():
+    start_len = len(TITLE_START)
+    end_len = len(TITLE_END)
     with gzip.open(WIKI_TITLES_PATH, 'rt') as wiki_titles_file:
         wiki_titles_file.readline()
-        for title in wiki_titles_file:
-            title_cleaned = title.replace('_', ' ')
-            tokens = tokenize(nlp, title_cleaned)
-            if len(tokens) > 0:
-                indexed += 1
-                url = 'https://en.wikipedia.org/wiki/' + title
-                index(tokens, url, title_cleaned)
-                if indexed % 1000 == 0:
-                    print("Indexed", indexed)
+        for raw_title in wiki_titles_file:
+            assert raw_title.startswith(TITLE_START)
+            assert raw_title.endswith(TITLE_END)
+            title = raw_title[start_len:-end_len]
+            unescaped_title = html.unescape(title)
+            url = 'https://en.wikipedia.org/wiki/' + quote(unescaped_title.replace(' ', '_'))
+            yield unescaped_title, url
 
 
 if __name__ == '__main__':
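The rewritten title reader pulls lines from the sorted abstract-titles file, strips the '<title>Wikipedia: ' and '</title>' wrappers, unescapes HTML entities and percent-encodes the result into a wiki URL. A hand-check of that transformation on a made-up input line, not taken from the real dump:

import html
from urllib.parse import quote

TITLE_START = '<title>Wikipedia: '
TITLE_END = '</title>\n'

raw_title = '<title>Wikipedia: Hello &amp; goodbye</title>\n'  # hypothetical line from the titles file
title = raw_title[len(TITLE_START):-len(TITLE_END)]            # 'Hello &amp; goodbye'
unescaped_title = html.unescape(title)                         # 'Hello & goodbye'
url = 'https://en.wikipedia.org/wiki/' + quote(unescaped_title.replace(' ', '_'))
print(unescaped_title, url)  # Hello & goodbye https://en.wikipedia.org/wiki/Hello_%26_goodbye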