Index link counts

Daoud Clarke 2022-02-24 20:47:36 +00:00
parent b5b2005323
commit f5b20d0128
6 changed files with 15 additions and 202 deletions

View file

@@ -1,29 +0,0 @@
"""
Make a curl script for testing performance
"""
import os
from itertools import islice
from urllib.parse import quote

from mwmbl.indexer.paths import TINYSEARCH_DATA_DIR
from mwmbl.indexer.wiki import get_wiki_titles_and_urls

URL_TEMPLATE = "http://localhost:8000/complete?q={}"
CURL_FILE = os.path.join(TINYSEARCH_DATA_DIR, "urls.curl")


def get_urls():
    titles_and_urls = get_wiki_titles_and_urls()
    for title, url in islice(titles_and_urls, 100):
        query = quote(title.lower())
        yield URL_TEMPLATE.format(query)


def run():
    with open(CURL_FILE, 'wt') as output_file:
        for url in get_urls():
            output_file.write(f'url="{url}"\n')


if __name__ == '__main__':
    run()
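
Note: the `url="…"` lines this script emits follow curl's config-file syntax, so the output can be replayed with `curl -K urls.curl`. A minimal pure-Python replay sketch for the same purpose (hypothetical, not part of the commit; it assumes the tinysearchengine server is listening on localhost:8000 and reads the file written by run() above):

    # Replay the generated URL list and time the queries end to end.
    import re
    import time
    import urllib.request

    with open("urls.curl") as curl_file:          # stand-in path for CURL_FILE
        urls = re.findall(r'url="([^"]+)"', curl_file.read())

    start = time.time()
    for url in urls:
        urllib.request.urlopen(url).read()        # one GET per completion query
    print(f"{len(urls)} queries in {time.time() - start:.2f}s")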

View file

@@ -1,120 +0,0 @@
"""
Test the performance of the search in terms of compression and speed.
"""
import os
from datetime import datetime

import numpy as np
from spacy.lang.en import English
from starlette.testclient import TestClient

from mwmbl.tinysearchengine import create_app
from mwmbl.indexer.fsqueue import ZstdJsonSerializer
from mwmbl.indexer.index import index_titles_urls_and_extracts
from mwmbl.tinysearchengine.indexer import TinyIndex, TinyIndexer, Document
from mwmbl.indexer.paths import TEST_INDEX_PATH, TINYSEARCH_DATA_DIR, TEST_TERMS_PATH

NUM_DOCUMENTS = 30000
NUM_PAGES_FOR_STATS = 10
TEST_PAGE_SIZE = 512
TEST_NUM_PAGES = 1024
TEST_DATA_PATH = os.path.join(TINYSEARCH_DATA_DIR, 'test-urls.zstd')
RECALL_AT_K = 3

NUM_QUERY_CHARS = 10


def get_test_pages():
    serializer = ZstdJsonSerializer()
    with open(TEST_DATA_PATH, 'rb') as data_file:
        data = serializer.deserialize(data_file.read())
    return [(row['title'], row['url']) for row in data if row['title'] is not None]


def query_test():
    titles_and_urls = get_test_pages()
    print(f"Got {len(titles_and_urls)} titles and URLs")

    tiny_index = TinyIndex(Document, TEST_INDEX_PATH, TEST_NUM_PAGES, TEST_PAGE_SIZE)
    app = create_app.create()
    client = TestClient(app)

    start = datetime.now()
    hits = 0
    count = 0
    for title, url in titles_and_urls:
        query = title[:NUM_QUERY_CHARS]
        result = client.get('/complete', params={'q': query})
        assert result.status_code == 200
        data = result.json()

        hit = False
        if data:
            for result in data[1][:RECALL_AT_K]:
                if url in result:
                    hit = True
                    break

        if hit:
            hits += 1
        else:
            print("Miss", data, title, url, sep='\n')
        count += 1
    end = datetime.now()

    print(f"Hits: {hits} out of {count}")
    print(f"Recall at {RECALL_AT_K}: {hits/count}")
    print("Query time:", (end - start).total_seconds() / NUM_DOCUMENTS)


def page_stats(indexer: TinyIndexer):
    pages_and_sizes = []
    for i in range(TEST_NUM_PAGES):
        page = indexer.get_page(i)
        if page is not None:
            pages_and_sizes.append((len(page), page))
    big_page_sizes, big_pages = zip(*sorted(pages_and_sizes, reverse=True)[:NUM_PAGES_FOR_STATS])
    return np.mean(big_page_sizes), np.std(big_page_sizes), big_pages


def performance_test():
    nlp = English()
    try:
        os.remove(TEST_INDEX_PATH)
    except FileNotFoundError:
        print("No test index found, creating")
    with TinyIndexer(Document, TEST_INDEX_PATH, TEST_NUM_PAGES, TEST_PAGE_SIZE) as indexer:
        titles_and_urls = get_test_pages()

        start_time = datetime.now()
        index_titles_urls_and_extracts(indexer, nlp, titles_and_urls, TEST_TERMS_PATH)
        stop_time = datetime.now()

        index_time = (stop_time - start_time).total_seconds()
        index_size = os.path.getsize(TEST_INDEX_PATH)

        page_size_mean, page_size_std, big_pages = page_stats(indexer)

    print("Indexed pages:", NUM_DOCUMENTS)
    print("Index time:", index_time)
    print("Index size:", index_size)
    print("Mean docs per page:", page_size_mean)
    print("Std err of docs per page:", page_size_std)
    print("Big pages")
    print_pages(big_pages)
    # print("Num tokens", indexer.get_num_tokens())

    query_test()


def print_pages(pages):
    for page in pages:
        print("Page", page)
        for title, url in page:
            print(title, url)
        print()


if __name__ == '__main__':
    performance_test()
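
Note: the fixture at TEST_DATA_PATH is read with ZstdJsonSerializer.deserialize above, so one can presumably write a fixture with a matching serialize call. The sketch below assumes serialize() is the mirror image of the deserialize() used in get_test_pages; check mwmbl.indexer.fsqueue before relying on it:

    # Hypothetical fixture builder for the test data read by get_test_pages.
    from mwmbl.indexer.fsqueue import ZstdJsonSerializer

    def write_test_pages(rows, path):
        # rows: list of dicts with at least 'title' and 'url' keys
        serializer = ZstdJsonSerializer()
        with open(path, 'wb') as data_file:
            data_file.write(serializer.serialize(rows))   # assumed mirror of deserialize()

    write_test_pages([{'title': 'Python', 'url': 'https://en.wikipedia.org/wiki/Python'}],
                     'test-urls.zstd')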

View file

@@ -8,10 +8,10 @@ from urllib.parse import unquote
 import pandas as pd

 # NUM_PAGES = 8192
 # PAGE_SIZE = 512
 from mwmbl.tinysearchengine.indexer import Document, TokenizedDocument, TinyIndex

+DEFAULT_SCORE = 0

 HTTP_START = 'http://'
 HTTPS_START = 'https://'
 BATCH_SIZE = 100
@@ -44,7 +44,7 @@ def prepare_url_for_tokenizing(url: str):
     return url


-def get_pages(nlp, titles_urls_and_extracts) -> Iterable[TokenizedDocument]:
+def get_pages(nlp, titles_urls_and_extracts, link_counts) -> Iterable[TokenizedDocument]:
     for i, (title_cleaned, url, extract) in enumerate(titles_urls_and_extracts):
         title_tokens = tokenize(nlp, title_cleaned)
         prepared_url = prepare_url_for_tokenizing(unquote(url))
@@ -52,7 +52,8 @@ def get_pages(nlp, titles_urls_and_extracts) -> Iterable[TokenizedDocument]:
         extract_tokens = tokenize(nlp, extract)
         print("Extract tokens", extract_tokens)
         tokens = title_tokens | url_tokens | extract_tokens
-        yield TokenizedDocument(tokens=list(tokens), url=url, title=title_cleaned, extract=extract)
+        score = link_counts.get(url, DEFAULT_SCORE)
+        yield TokenizedDocument(tokens=list(tokens), url=url, title=title_cleaned, extract=extract, score=score)

         if i % 1000 == 0:
             print("Processed", i)
@@ -66,12 +67,12 @@ def grouper(n: int, iterator: Iterator):
         yield chunk


-def index_titles_urls_and_extracts(indexer: TinyIndex, nlp, titles_urls_and_extracts, terms_path):
+def index_titles_urls_and_extracts(indexer: TinyIndex, nlp, titles_urls_and_extracts, link_counts, terms_path):
     terms = Counter()
-    pages = get_pages(nlp, titles_urls_and_extracts)
+    pages = get_pages(nlp, titles_urls_and_extracts, link_counts)
     for page in pages:
         for token in page.tokens:
-            indexer.index(token, Document(url=page.url, title=page.title, extract=page.extract))
+            indexer.index(token, Document(url=page.url, title=page.title, extract=page.extract, score=page.score))
         terms.update([t.lower() for t in page.tokens])

     term_df = pd.DataFrame({
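
Note: the scoring rule introduced in get_pages is a plain dictionary lookup. A URL missing from link_counts falls back to DEFAULT_SCORE, and matching is exact, so scheme or trailing-slash variants of the same page score 0. A standalone illustration (the URL and count are invented):

    DEFAULT_SCORE = 0
    link_counts = {"https://en.wikipedia.org/wiki/Python": 128}

    assert link_counts.get("https://en.wikipedia.org/wiki/Python", DEFAULT_SCORE) == 128
    assert link_counts.get("https://en.wikipedia.org/wiki/Python/", DEFAULT_SCORE) == 0  # exact match only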

View file

@@ -1,13 +1,14 @@
 """
 Index data crawled through the Mwmbl crawler.
 """
+import json
 from logging import getLogger

 import spacy

 from mwmbl.indexer.fsqueue import FSQueue, GzipJsonBlobSerializer, FSQueueError
 from mwmbl.indexer.index import index_titles_urls_and_extracts
-from mwmbl.indexer.paths import INDEX_PATH, MWMBL_CRAWL_TERMS_PATH, TINYSEARCH_DATA_DIR
+from mwmbl.indexer.paths import INDEX_PATH, MWMBL_CRAWL_TERMS_PATH, TINYSEARCH_DATA_DIR, LINK_COUNT_PATH
 from mwmbl.tinysearchengine.indexer import TinyIndex, Document, NUM_PAGES, PAGE_SIZE
@@ -16,12 +17,12 @@ logger = getLogger(__name__)

 def index_mwmbl_crawl_data():
     nlp = spacy.load("en_core_web_sm")
-    titles_urls_and_extracts = get_mwmbl_crawl_titles_urls_and_extracts()
+    link_counts = json.load(open(LINK_COUNT_PATH))

     TinyIndex.create(Document, INDEX_PATH, NUM_PAGES, PAGE_SIZE)
     with TinyIndex(Document, INDEX_PATH, 'w') as indexer:
-        index_titles_urls_and_extracts(indexer, nlp, titles_urls_and_extracts, MWMBL_CRAWL_TERMS_PATH)
+        titles_urls_and_extracts = get_mwmbl_crawl_titles_urls_and_extracts()
+        index_titles_urls_and_extracts(indexer, nlp, titles_urls_and_extracts, link_counts, MWMBL_CRAWL_TERMS_PATH)


 def get_mwmbl_crawl_titles_urls_and_extracts():
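
Note: judging by the link_counts.get(url, DEFAULT_SCORE) lookup in the previous file, the file at LINK_COUNT_PATH is a single JSON object mapping URL to inbound-link count; the contents sketched below are hypothetical. Also, json.load(open(...)) never closes the file handle; a with block is tidier:

    # Hypothetical link-count file contents (the real file is produced by a
    # separate link-counting step, not shown in this commit):
    #   {"https://en.wikipedia.org/wiki/Python": 128, "https://example.com/": 3}
    import json

    def load_link_counts(path):
        with open(path) as count_file:   # closes the handle, unlike json.load(open(path))
            return json.load(count_file)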

View file

@@ -1,41 +0,0 @@
"""
Index Wikipedia
"""
import gzip
import html
from urllib.parse import quote

from spacy.lang.en import English

from .index import index_titles_urls_and_extracts
from mwmbl.tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE
from .paths import WIKI_TITLES_PATH, INDEX_PATH

TEXT_TAGS = ['mediawiki', 'page', 'revision', 'text']
TITLE_START = '<title>Wikipedia: '
TITLE_END = '</title>\n'


def index_wiki():
    nlp = English()
    with TinyIndexer(INDEX_PATH, NUM_PAGES, PAGE_SIZE) as indexer:
        titles_and_urls = get_wiki_titles_and_urls()
        index_titles_urls_and_extracts(indexer, nlp, titles_and_urls)


def get_wiki_titles_and_urls():
    start_len = len(TITLE_START)
    end_len = len(TITLE_END)
    with gzip.open(WIKI_TITLES_PATH, 'rt') as wiki_titles_file:
        wiki_titles_file.readline()
        for raw_title in wiki_titles_file:
            assert raw_title.startswith(TITLE_START)
            assert raw_title.endswith(TITLE_END)
            title = raw_title[start_len:-end_len]
            unescaped_title = html.unescape(title)
            url = 'https://en.wikipedia.org/wiki/' + quote(unescaped_title.replace(' ', '_'))
            yield unescaped_title, url


if __name__ == '__main__':
    index_wiki()
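
Note: for clarity, a worked example of the title-line parsing in get_wiki_titles_and_urls. Each line of the gzipped titles dump looks like `<title>Wikipedia: Foo &amp; Bar</title>`, and the slicing, html.unescape, and quote pipeline turns it into a live URL (the example title is made up):

    import html
    from urllib.parse import quote

    TITLE_START = '<title>Wikipedia: '
    TITLE_END = '</title>\n'

    raw_title = '<title>Wikipedia: Foo &amp; Bar</title>\n'
    title = raw_title[len(TITLE_START):-len(TITLE_END)]   # 'Foo &amp; Bar'
    unescaped_title = html.unescape(title)                # 'Foo & Bar'
    url = 'https://en.wikipedia.org/wiki/' + quote(unescaped_title.replace(' ', '_'))
    print(url)  # https://en.wikipedia.org/wiki/Foo_%26_Bar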

View file

@@ -12,7 +12,7 @@ VERSION = 1
 METADATA_CONSTANT = b'mwmbl-tiny-search'
 METADATA_SIZE = 4096

-NUM_PAGES = 76800
+NUM_PAGES = 128000
 PAGE_SIZE = 4096
@@ -21,6 +21,7 @@ class Document:
     title: str
     url: str
     extract: str
+    score: float


 @dataclass
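
Note: two effects of this last file are worth spelling out. The fixed-size index grows from 76800 × 4096 B ≈ 300 MiB to 128000 × 4096 B ≈ 500 MiB (plus the 4 KiB metadata block), and Document gains a score field, so indexes serialized before this commit presumably cannot be read back without rebuilding. The size arithmetic:

    # Back-of-envelope index file size before and after this commit.
    METADATA_SIZE = 4096
    PAGE_SIZE = 4096

    for num_pages in (76800, 128000):            # old NUM_PAGES, new NUM_PAGES
        total_bytes = METADATA_SIZE + num_pages * PAGE_SIZE
        print(num_pages, "pages ->", round(total_bytes / 2**20, 1), "MiB")
    # 76800 pages -> 300.0 MiB
    # 128000 pages -> 500.0 MiB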