Index link counts
commit f5b20d0128 (parent b5b2005323)
6 changed files with 15 additions and 202 deletions
@@ -1,29 +0,0 @@
-"""
-Make a curl script for testing performance
-"""
-import os
-from itertools import islice
-from urllib.parse import quote
-
-from mwmbl.indexer.paths import TINYSEARCH_DATA_DIR
-from mwmbl.indexer.wiki import get_wiki_titles_and_urls
-
-
-URL_TEMPLATE = "http://localhost:8000/complete?q={}"
-CURL_FILE = os.path.join(TINYSEARCH_DATA_DIR, "urls.curl")
-
-
-def get_urls():
-    titles_and_urls = get_wiki_titles_and_urls()
-    for title, url in islice(titles_and_urls, 100):
-        query = quote(title.lower())
-        yield URL_TEMPLATE.format(query)
-
-
-def run():
-    with open(CURL_FILE, 'wt') as output_file:
-        for url in get_urls():
-            output_file.write(f'url="{url}"\n')
-
-
-if __name__ == '__main__':
-    run()
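Note: the deleted script writes one url="..." entry per line, which matches curl's config-file syntax, presumably for replaying with curl -K urls.curl. As a rough alternative sketch, the same completion URLs could be timed directly from Python without curl, assuming the local server from URL_TEMPLATE is running; the helper below is hypothetical and not part of the original script.

import time
from urllib.request import urlopen

def time_queries(urls):
    """Fetch each completion URL and return the mean response time in seconds."""
    timings = []
    for url in urls:
        start = time.monotonic()
        with urlopen(url) as response:
            response.read()
        timings.append(time.monotonic() - start)
    return sum(timings) / len(timings)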
@@ -1,120 +0,0 @@
-"""
-Test the performance of the search in terms of compression and speed.
-"""
-import os
-from datetime import datetime
-
-import numpy as np
-from spacy.lang.en import English
-from starlette.testclient import TestClient
-
-from mwmbl.tinysearchengine import create_app
-from mwmbl.indexer.fsqueue import ZstdJsonSerializer
-from mwmbl.indexer.index import index_titles_urls_and_extracts
-from mwmbl.tinysearchengine.indexer import TinyIndex, TinyIndexer, Document
-from mwmbl.indexer.paths import TEST_INDEX_PATH, TINYSEARCH_DATA_DIR, TEST_TERMS_PATH
-
-NUM_DOCUMENTS = 30000
-NUM_PAGES_FOR_STATS = 10
-TEST_PAGE_SIZE = 512
-TEST_NUM_PAGES = 1024
-TEST_DATA_PATH = os.path.join(TINYSEARCH_DATA_DIR, 'test-urls.zstd')
-RECALL_AT_K = 3
-
-NUM_QUERY_CHARS = 10
-
-
-def get_test_pages():
-    serializer = ZstdJsonSerializer()
-    with open(TEST_DATA_PATH, 'rb') as data_file:
-        data = serializer.deserialize(data_file.read())
-    return [(row['title'], row['url']) for row in data if row['title'] is not None]
-
-
-def query_test():
-    titles_and_urls = get_test_pages()
-    print(f"Got {len(titles_and_urls)} titles and URLs")
-    tiny_index = TinyIndex(Document, TEST_INDEX_PATH, TEST_NUM_PAGES, TEST_PAGE_SIZE)
-
-    app = create_app.create()
-    client = TestClient(app)
-
-    start = datetime.now()
-    hits = 0
-    count = 0
-    for title, url in titles_and_urls:
-        query = title[:NUM_QUERY_CHARS]
-        result = client.get('/complete', params={'q': query})
-        assert result.status_code == 200
-        data = result.json()
-
-        hit = False
-        if data:
-            for result in data[1][:RECALL_AT_K]:
-                if url in result:
-                    hit = True
-                    break
-
-        if hit:
-            hits += 1
-        else:
-            print("Miss", data, title, url, sep='\n')
-
-        count += 1
-
-    end = datetime.now()
-    print(f"Hits: {hits} out of {count}")
-    print(f"Recall at {RECALL_AT_K}: {hits/count}")
-    print("Query time:", (end - start).total_seconds() / NUM_DOCUMENTS)
-
-
-def page_stats(indexer: TinyIndexer):
-    pages_and_sizes = []
-    for i in range(TEST_NUM_PAGES):
-        page = indexer.get_page(i)
-        if page is not None:
-            pages_and_sizes.append((len(page), page))
-    big_page_sizes, big_pages = zip(*sorted(pages_and_sizes, reverse=True)[:NUM_PAGES_FOR_STATS])
-    return np.mean(big_page_sizes), np.std(big_page_sizes), big_pages
-
-
-def performance_test():
-    nlp = English()
-    try:
-        os.remove(TEST_INDEX_PATH)
-    except FileNotFoundError:
-        print("No test index found, creating")
-    with TinyIndexer(Document, TEST_INDEX_PATH, TEST_NUM_PAGES, TEST_PAGE_SIZE) as indexer:
-        titles_and_urls = get_test_pages()
-
-        start_time = datetime.now()
-        index_titles_urls_and_extracts(indexer, nlp, titles_and_urls, TEST_TERMS_PATH)
-        stop_time = datetime.now()
-
-        index_time = (stop_time - start_time).total_seconds()
-        index_size = os.path.getsize(TEST_INDEX_PATH)
-
-        page_size_mean, page_size_std, big_pages = page_stats(indexer)
-
-        print("Indexed pages:", NUM_DOCUMENTS)
-        print("Index time:", index_time)
-        print("Index size:", index_size)
-        print("Mean docs per page:", page_size_mean)
-        print("Std err of docs per page:", page_size_std)
-        print("Big pages")
-        print_pages(big_pages)
-        # print("Num tokens", indexer.get_num_tokens())
-
-    query_test()
-
-
-def print_pages(pages):
-    for page in pages:
-        print("Page", page)
-        for title, url in page:
-            print(title, url)
-        print()
-
-
-if __name__ == '__main__':
-    performance_test()
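Note: the deleted test's use of data[1][:RECALL_AT_K] implies the /complete endpoint returns an OpenSearch-suggestions-style payload of the form [query, [results]]; that shape is inferred from the code, not stated in the diff. A minimal sketch of the recall check under that assumption:

def recall_at_k(response_data, expected_url, k=3):
    """Return True if expected_url appears in any of the top-k suggestions.

    Assumes the completion response has the shape [query, [suggestion, ...]],
    as the deleted test's use of data[1] implies.
    """
    if not response_data:
        return False
    return any(expected_url in suggestion for suggestion in response_data[1][:k])


# Example with a made-up response:
assert recall_at_k(["alan tur", ["Alan Turing (https://en.wikipedia.org/wiki/Alan_Turing)"]],
                   "https://en.wikipedia.org/wiki/Alan_Turing")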
@@ -8,10 +8,10 @@ from urllib.parse import unquote

 import pandas as pd

 # NUM_PAGES = 8192
 # PAGE_SIZE = 512
 from mwmbl.tinysearchengine.indexer import Document, TokenizedDocument, TinyIndex

+DEFAULT_SCORE = 0

 HTTP_START = 'http://'
 HTTPS_START = 'https://'
 BATCH_SIZE = 100

@@ -44,7 +44,7 @@ def prepare_url_for_tokenizing(url: str):
     return url


-def get_pages(nlp, titles_urls_and_extracts) -> Iterable[TokenizedDocument]:
+def get_pages(nlp, titles_urls_and_extracts, link_counts) -> Iterable[TokenizedDocument]:
     for i, (title_cleaned, url, extract) in enumerate(titles_urls_and_extracts):
         title_tokens = tokenize(nlp, title_cleaned)
         prepared_url = prepare_url_for_tokenizing(unquote(url))

@@ -52,7 +52,8 @@ def get_pages(nlp, titles_urls_and_extracts) -> Iterable[TokenizedDocument]:
         extract_tokens = tokenize(nlp, extract)
         print("Extract tokens", extract_tokens)
         tokens = title_tokens | url_tokens | extract_tokens
-        yield TokenizedDocument(tokens=list(tokens), url=url, title=title_cleaned, extract=extract)
+        score = link_counts.get(url, DEFAULT_SCORE)
+        yield TokenizedDocument(tokens=list(tokens), url=url, title=title_cleaned, extract=extract, score=score)

         if i % 1000 == 0:
             print("Processed", i)

@@ -66,12 +67,12 @@ def grouper(n: int, iterator: Iterator):
         yield chunk


-def index_titles_urls_and_extracts(indexer: TinyIndex, nlp, titles_urls_and_extracts, terms_path):
+def index_titles_urls_and_extracts(indexer: TinyIndex, nlp, titles_urls_and_extracts, link_counts, terms_path):
     terms = Counter()
-    pages = get_pages(nlp, titles_urls_and_extracts)
+    pages = get_pages(nlp, titles_urls_and_extracts, link_counts)
     for page in pages:
         for token in page.tokens:
-            indexer.index(token, Document(url=page.url, title=page.title, extract=page.extract))
+            indexer.index(token, Document(url=page.url, title=page.title, extract=page.extract, score=page.score))
         terms.update([t.lower() for t in page.tokens])

     term_df = pd.DataFrame({
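The core of the change: get_pages now looks each URL up in a link_counts mapping and attaches the count as the document's score, falling back to DEFAULT_SCORE when the URL has no known inbound links. A minimal self-contained sketch of that flow; the namedtuple and the trivial tokenizer are stand-ins, not the real TokenizedDocument or spaCy pipeline.

from typing import NamedTuple


class ScoredDoc(NamedTuple):
    # Stand-in for TokenizedDocument: just enough fields to show the scoring.
    tokens: list
    url: str
    title: str
    score: float


DEFAULT_SCORE = 0


def get_scored_pages(titles_and_urls, link_counts):
    """Yield one scored document per page, scored by its inbound-link count."""
    for title, url in titles_and_urls:
        tokens = title.lower().split()  # trivial tokenizer stand-in
        score = link_counts.get(url, DEFAULT_SCORE)
        yield ScoredDoc(tokens=tokens, url=url, title=title, score=score)


pages = [("Alan Turing", "https://en.wikipedia.org/wiki/Alan_Turing")]
counts = {"https://en.wikipedia.org/wiki/Alan_Turing": 1234}
print(list(get_scored_pages(pages, counts)))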
@@ -1,13 +1,14 @@
 """
 Index data crawled through the Mwmbl crawler.
 """
+import json
 from logging import getLogger

 import spacy

 from mwmbl.indexer.fsqueue import FSQueue, GzipJsonBlobSerializer, FSQueueError
 from mwmbl.indexer.index import index_titles_urls_and_extracts
-from mwmbl.indexer.paths import INDEX_PATH, MWMBL_CRAWL_TERMS_PATH, TINYSEARCH_DATA_DIR
+from mwmbl.indexer.paths import INDEX_PATH, MWMBL_CRAWL_TERMS_PATH, TINYSEARCH_DATA_DIR, LINK_COUNT_PATH
 from mwmbl.tinysearchengine.indexer import TinyIndex, Document, NUM_PAGES, PAGE_SIZE

@@ -16,12 +17,12 @@ logger = getLogger(__name__)

 def index_mwmbl_crawl_data():
     nlp = spacy.load("en_core_web_sm")
+    titles_urls_and_extracts = get_mwmbl_crawl_titles_urls_and_extracts()
+    link_counts = json.load(open(LINK_COUNT_PATH))

     TinyIndex.create(Document, INDEX_PATH, NUM_PAGES, PAGE_SIZE)

     with TinyIndex(Document, INDEX_PATH, 'w') as indexer:
-        titles_urls_and_extracts = get_mwmbl_crawl_titles_urls_and_extracts()
-        index_titles_urls_and_extracts(indexer, nlp, titles_urls_and_extracts, MWMBL_CRAWL_TERMS_PATH)
+        index_titles_urls_and_extracts(indexer, nlp, titles_urls_and_extracts, link_counts, MWMBL_CRAWL_TERMS_PATH)


 def get_mwmbl_crawl_titles_urls_and_extracts():
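LINK_COUNT_PATH is read with json.load(open(LINK_COUNT_PATH)), so the link counts are presumably stored as a JSON object mapping URL to inbound-link count; the exact file layout is not shown in the diff. A hedged sketch of loading such a file with an explicit context manager, a small tidy-up over the one-liner above, which never explicitly closes the file handle:

import json


def load_link_counts(path):
    """Read a JSON mapping of URL -> inbound-link count (assumed format)."""
    with open(path) as link_count_file:
        return json.load(link_count_file)


# Hypothetical contents of the link-count file:
# {"https://en.wikipedia.org/wiki/Alan_Turing": 1234, ...}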
@@ -1,41 +0,0 @@
-"""
-Index Wikipedia
-"""
-import gzip
-import html
-from urllib.parse import quote
-
-from spacy.lang.en import English
-
-from .index import index_titles_urls_and_extracts
-from mwmbl.tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE
-from .paths import WIKI_TITLES_PATH, INDEX_PATH
-
-TEXT_TAGS = ['mediawiki', 'page', 'revision', 'text']
-TITLE_START = '<title>Wikipedia: '
-TITLE_END = '</title>\n'
-
-
-def index_wiki():
-    nlp = English()
-    with TinyIndexer(INDEX_PATH, NUM_PAGES, PAGE_SIZE) as indexer:
-        titles_and_urls = get_wiki_titles_and_urls()
-        index_titles_urls_and_extracts(indexer, nlp, titles_and_urls)
-
-
-def get_wiki_titles_and_urls():
-    start_len = len(TITLE_START)
-    end_len = len(TITLE_END)
-    with gzip.open(WIKI_TITLES_PATH, 'rt') as wiki_titles_file:
-        wiki_titles_file.readline()
-        for raw_title in wiki_titles_file:
-            assert raw_title.startswith(TITLE_START)
-            assert raw_title.endswith(TITLE_END)
-            title = raw_title[start_len:-end_len]
-            unescaped_title = html.unescape(title)
-            url = 'https://en.wikipedia.org/wiki/' + quote(unescaped_title.replace(' ', '_'))
-            yield unescaped_title, url
-
-
-if __name__ == '__main__':
-    index_wiki()
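The removed get_wiki_titles_and_urls strips the '<title>Wikipedia: ' and '</title>' wrapper from each line of the gzipped titles dump, unescapes HTML entities, and percent-encodes the title into a Wikipedia URL. A small sketch of that transformation on a single line, assuming the same line format; the sample line below is made up.

import html
from urllib.parse import quote

TITLE_START = '<title>Wikipedia: '
TITLE_END = '</title>\n'


def title_line_to_url(raw_title):
    """Turn one '<title>Wikipedia: ...</title>' line into (title, URL)."""
    title = raw_title[len(TITLE_START):-len(TITLE_END)]
    unescaped_title = html.unescape(title)
    url = 'https://en.wikipedia.org/wiki/' + quote(unescaped_title.replace(' ', '_'))
    return unescaped_title, url


print(title_line_to_url('<title>Wikipedia: Alan Turing</title>\n'))
# ('Alan Turing', 'https://en.wikipedia.org/wiki/Alan_Turing')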
@@ -12,7 +12,7 @@ VERSION = 1
 METADATA_CONSTANT = b'mwmbl-tiny-search'
 METADATA_SIZE = 4096

-NUM_PAGES = 76800
+NUM_PAGES = 128000
 PAGE_SIZE = 4096


@@ -21,6 +21,7 @@ class Document:
     title: str
     url: str
     extract: str
+    score: float


 @dataclass
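With fixed-size pages, the on-disk index size is roughly NUM_PAGES x PAGE_SIZE plus the metadata block, so raising NUM_PAGES from 76,800 to 128,000 with PAGE_SIZE = 4096 grows the index from about 300 MiB to about 500 MiB. A sketch of the Document dataclass as it stands after this change, with the arithmetic spelled out; the field list is taken from the hunk above, anything beyond that is assumed.

from dataclasses import dataclass

NUM_PAGES = 128000
PAGE_SIZE = 4096
METADATA_SIZE = 4096


@dataclass
class Document:
    title: str
    url: str
    extract: str
    score: float  # new field: inbound-link count used as a ranking score


# Approximate index file size: one fixed-size page per slot, plus metadata.
index_bytes = NUM_PAGES * PAGE_SIZE + METADATA_SIZE
print(f"{index_bytes / 2**20:.0f} MiB")  # ~500 MiB (was ~300 MiB at 76,800 pages)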