Improve indexing; measure performance

parent 0c5bc061ae
commit 14f820ff37

4 changed files with 139 additions and 65 deletions

index.py (126 changed lines)
@@ -4,6 +4,7 @@ Create a search index
 import gzip
 import sqlite3
 from glob import glob
+from urllib.parse import unquote

 import bs4
 import justext
@@ -13,10 +14,13 @@ from paths import CRAWL_GLOB, INDEX_PATH

 NUM_INITIAL_TOKENS = 50

+HTTP_START = 'http://'
+HTTPS_START = 'https://'
+

 def is_content_token(nlp, token):
     lexeme = nlp.vocab[token.orth]
-    return lexeme.is_alpha and not token.is_stop
+    return (lexeme.is_alpha or lexeme.is_digit) and not token.is_stop


 def tokenize(nlp, cleaned_text):
@@ -34,58 +38,60 @@ def clean(content):
     return cleaned_text


-def index(tokens, url, title):
-    with sqlite3.connect(INDEX_PATH) as con:
+class Indexer:
+    def __init__(self, index_path):
+        self.index_path = index_path
+
+    def index(self, tokens, url, title):
+        with sqlite3.connect(self.index_path) as con:
+            con.execute("""
+                INSERT INTO pages (url, title)
+                VALUES (?, ?)
+            """, (url, title))
+
+            result = con.execute("""
+                SELECT last_insert_rowid()
+            """)
+            page_id = result.fetchone()[0]
+
+            con.executemany("""
+                INSERT INTO terms (term, page_id)
+                VALUES (?, ?)
+            """, [(term, page_id) for term in tokens])
+
+    def create_if_not_exists(self):
+        con = sqlite3.connect(self.index_path)
         con.execute("""
-            INSERT INTO pages (url, title)
-            VALUES (?, ?)
-        """, (url, title))
-
-        result = con.execute("""
-            SELECT last_insert_rowid()
+            CREATE TABLE IF NOT EXISTS pages (
+                id INTEGER PRIMARY KEY,
+                url TEXT UNIQUE,
+                title TEXT
+            )
         """)
-        page_id = result.fetchone()[0]
-        print("Created page with id", page_id)
-
-        con.executemany("""
-            INSERT INTO terms (term, page_id)
-            VALUES (?, ?)
-        """, [(term, page_id) for term in tokens])
+        con.execute("""
+            CREATE TABLE IF NOT EXISTS terms (
+                term TEXT,
+                page_id INTEGER
+            )
+        """)

+        con.execute("""
+            CREATE INDEX IF NOT EXISTS term_index ON terms (term)
+        """)

-def create_if_not_exists():
-    con = sqlite3.connect(INDEX_PATH)
-    con.execute("""
-        CREATE TABLE IF NOT EXISTS pages (
-            id INTEGER PRIMARY KEY,
-            url TEXT UNIQUE,
-            title TEXT
-        )
-    """)
-
-    con.execute("""
-        CREATE TABLE IF NOT EXISTS terms (
-            term TEXT,
-            page_id INTEGER
-        )
-    """)
-
-    con.execute("""
-        CREATE INDEX IF NOT EXISTS term_index ON terms (term)
-    """)
-
-
-def page_indexed(url):
-    con = sqlite3.connect(INDEX_PATH)
-    result = con.execute("""
-        SELECT EXISTS(SELECT 1 FROM pages WHERE url=?)
-    """, (url,))
-    value = result.fetchone()[0]
-    return value == 1
+    def page_indexed(self, url):
+        con = sqlite3.connect(self.index_path)
+        result = con.execute("""
+            SELECT EXISTS(SELECT 1 FROM pages WHERE url=?)
+        """, (url,))
+        value = result.fetchone()[0]
+        return value == 1


 def run():
-    create_if_not_exists()
+    indexer = Indexer(INDEX_PATH)
+    indexer.create_if_not_exists()
     nlp = English()
     for path in glob(CRAWL_GLOB):
         print("Path", path)
@@ -93,7 +99,7 @@ def run():
             url = html_file.readline().strip()
             content = html_file.read()

-            if page_indexed(url):
+            if indexer.page_indexed(url):
                 print("Page exists, skipping", url)
                 continue

@@ -106,7 +112,33 @@ def run():
             print("URL", url)
             print("Tokens", tokens)
             print("Title", title)
-            index(tokens, url, title)
+            indexer.index(tokens, url, title)


+def prepare_url_for_tokenizing(url: str):
+    if url.startswith(HTTP_START):
+        url = url[len(HTTP_START):]
+    elif url.startswith(HTTPS_START):
+        url = url[len(HTTPS_START):]
+    for c in '/._':
+        if c in url:
+            url = url.replace(c, ' ')
+    return url
+
+
+def index_titles_and_urls(indexer, nlp, titles_and_urls):
+    indexer.create_if_not_exists()
+    for i, (title_cleaned, url) in enumerate(titles_and_urls):
+        title_tokens = tokenize(nlp, title_cleaned)
+        prepared_url = prepare_url_for_tokenizing(unquote(url))
+        url_tokens = tokenize(nlp, prepared_url)
+        tokens = title_tokens | url_tokens
+
+        if len(title_tokens) > 0:
+            indexer.index(tokens, url, title_cleaned)
+
+        if i % 1000 == 0:
+            print("Processed", i)
+
+
 if __name__ == '__main__':
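For orientation, here is a minimal sketch of how the Indexer introduced above is meant to be used. The class, function, and path names come from this diff; the snippet itself is illustrative and not part of the commit, and the URL and title are made up.

from spacy.lang.en import English

from index import Indexer, tokenize
from paths import INDEX_PATH

# Illustrative only: exercise the Indexer API added in this commit.
nlp = English()
indexer = Indexer(INDEX_PATH)
indexer.create_if_not_exists()    # creates the pages and terms tables plus the term index

url = 'https://example.com/'      # hypothetical page
if not indexer.page_indexed(url):
    tokens = tokenize(nlp, 'Example Domain')
    indexer.index(tokens, url, 'Example Domain')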
paths.py (3 changed lines)
@@ -6,5 +6,6 @@ HN_TOP_PATH = os.path.join(DATA_DIR, 'hn-top.csv')
 CRAWL_PREFIX = 'crawl_'
 CRAWL_GLOB = os.path.join(DATA_DIR, f"{CRAWL_PREFIX}*")
 INDEX_PATH = os.path.join(DATA_DIR, 'index.sqlite3')
+TEST_INDEX_PATH = os.path.join(DATA_DIR, 'index-test.sqlite3')
 WIKI_DATA_PATH = os.path.join(DATA_DIR, 'enwiki-20210301-pages-articles1.xml-p1p41242.bz2')
-WIKI_TITLES_PATH = os.path.join(DATA_DIR, 'enwiki-20210301-all-titles-in-ns0.gz')
+WIKI_TITLES_PATH = os.path.join(DATA_DIR, 'abstract-titles-sorted.txt.gz')
performance.py (new file, 37 added lines)
@@ -0,0 +1,37 @@
+"""
+Test the performance of the search in terms of compression and speed.
+"""
+import os
+from datetime import datetime
+from itertools import islice
+
+from spacy.lang.en import English
+
+from index import Indexer, index_titles_and_urls
+from paths import TEST_INDEX_PATH
+from wiki import get_wiki_titles_and_urls
+
+
+def performance_test():
+    nlp = English()
+    try:
+        os.remove(TEST_INDEX_PATH)
+    except FileNotFoundError:
+        print("No test index found, creating")
+    indexer = Indexer(TEST_INDEX_PATH)
+    titles_and_urls = get_wiki_titles_and_urls()
+    titles_and_urls_slice = islice(titles_and_urls, 1000)
+
+    start_time = datetime.now()
+    index_titles_and_urls(indexer, nlp, titles_and_urls_slice)
+    stop_time = datetime.now()
+
+    index_time = (stop_time - start_time).total_seconds()
+    index_size = os.path.getsize(TEST_INDEX_PATH)
+
+    print("Index time:", index_time)
+    print("Index size", index_size)
+
+
+if __name__ == '__main__':
+    performance_test()
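The script runs standalone: it deletes any previous test index, indexes the first 1,000 Wikipedia titles into TEST_INDEX_PATH, and prints the elapsed seconds and the resulting file size. As a hypothetical post-processing step (not part of the commit, with made-up sample numbers), those two figures can be turned into throughput and per-title cost:

# Hypothetical derived metrics from the numbers performance_test() prints,
# e.g. 1,000 titles indexed in 2.5 s producing a 400,000-byte index file.
index_time = 2.5        # seconds, made-up value
index_size = 400_000    # bytes, made-up value
print("Titles per second:", 1000 / index_time)   # -> 400.0
print("Bytes per title:", index_size / 1000)     # -> 400.0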
wiki.py (38 changed lines)
@@ -1,35 +1,39 @@
 """
 Index Wikipedia
 """
 import bz2
 import gzip
-from xml.etree.ElementTree import XMLParser
+import html
+from urllib.parse import quote

-from mediawiki_parser import preprocessor, text
 from spacy.lang.en import English

-from index import tokenize, index
-from paths import WIKI_DATA_PATH, WIKI_TITLES_PATH
+from index import Indexer, index_titles_and_urls
+from paths import WIKI_TITLES_PATH, INDEX_PATH

 TEXT_TAGS = ['mediawiki', 'page', 'revision', 'text']
+TITLE_START = '<title>Wikipedia: '
+TITLE_END = '</title>\n'


 def index_wiki():
     nlp = English()
-    indexed = 0
+    indexer = Indexer(INDEX_PATH)
+    titles_and_urls = get_wiki_titles_and_urls()
+    index_titles_and_urls(indexer, nlp, titles_and_urls)
+
+
+def get_wiki_titles_and_urls():
+    start_len = len(TITLE_START)
+    end_len = len(TITLE_END)
     with gzip.open(WIKI_TITLES_PATH, 'rt') as wiki_titles_file:
         wiki_titles_file.readline()
-        for title in wiki_titles_file:
-            title_cleaned = title.replace('_', ' ')
-            tokens = tokenize(nlp, title_cleaned)
-
-            if len(tokens) > 0:
-                indexed += 1
-                url = 'https://en.wikipedia.org/wiki/' + title
-                index(tokens, url, title_cleaned)
-
-                if indexed % 1000 == 0:
-                    print("Indexed", indexed)
+        for raw_title in wiki_titles_file:
+            assert raw_title.startswith(TITLE_START)
+            assert raw_title.endswith(TITLE_END)
+            title = raw_title[start_len:-end_len]
+            unescaped_title = html.unescape(title)
+            url = 'https://en.wikipedia.org/wiki/' + quote(unescaped_title.replace(' ', '_'))
+            yield unescaped_title, url


 if __name__ == '__main__':
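A worked example of the new title parsing in get_wiki_titles_and_urls, using a made-up input line and the TITLE_START / TITLE_END constants from the diff above (illustrative only, not part of the commit):

import html
from urllib.parse import quote

TITLE_START = '<title>Wikipedia: '   # as defined in wiki.py above
TITLE_END = '</title>\n'

# A single line from the sorted abstract-titles file.
line = '<title>Wikipedia: Albert Einstein</title>\n'
title = html.unescape(line[len(TITLE_START):-len(TITLE_END)])   # 'Albert Einstein'
url = 'https://en.wikipedia.org/wiki/' + quote(title.replace(' ', '_'))
# url == 'https://en.wikipedia.org/wiki/Albert_Einstein'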