Daoud Clarke 3 years ago
parent
revision
f5b20d0128
6 changed files with 15 additions and 202 deletions
  1. analyse/make_curl.py (+0, -29)
  2. analyse/performance.py (+0, -120)
  3. mwmbl/indexer/index.py (+8, -7)
  4. mwmbl/indexer/index_crawl.py (+5, -4)
  5. mwmbl/indexer/wiki.py (+0, -41)
  6. mwmbl/tinysearchengine/indexer.py (+2, -1)

+ 0 - 29
analyse/make_curl.py

@@ -1,29 +0,0 @@
-"""
-Make a curl script for testing performance
-"""
-import os
-from itertools import islice
-from urllib.parse import quote
-
-from mwmbl.indexer.paths import TINYSEARCH_DATA_DIR
-from mwmbl.indexer.wiki import get_wiki_titles_and_urls
-
-URL_TEMPLATE = "http://localhost:8000/complete?q={}"
-CURL_FILE = os.path.join(TINYSEARCH_DATA_DIR, "urls.curl")
-
-
-def get_urls():
-    titles_and_urls = get_wiki_titles_and_urls()
-    for title, url in islice(titles_and_urls, 100):
-        query = quote(title.lower())
-        yield URL_TEMPLATE.format(query)
-
-
-def run():
-    with open(CURL_FILE, 'wt') as output_file:
-        for url in get_urls():
-            output_file.write(f'url="{url}"\n')
-
-
-if __name__ == '__main__':
-    run()

+ 0 - 120
analyse/performance.py

@@ -1,120 +0,0 @@
-"""
-Test the performance of the search in terms of compression and speed.
-"""
-import os
-from datetime import datetime
-
-import numpy as np
-from spacy.lang.en import English
-from starlette.testclient import TestClient
-
-from mwmbl.tinysearchengine import create_app
-from mwmbl.indexer.fsqueue import ZstdJsonSerializer
-from mwmbl.indexer.index import index_titles_urls_and_extracts
-from mwmbl.tinysearchengine.indexer import TinyIndex, TinyIndexer, Document
-from mwmbl.indexer.paths import TEST_INDEX_PATH, TINYSEARCH_DATA_DIR, TEST_TERMS_PATH
-
-NUM_DOCUMENTS = 30000
-NUM_PAGES_FOR_STATS = 10
-TEST_PAGE_SIZE = 512
-TEST_NUM_PAGES = 1024
-TEST_DATA_PATH = os.path.join(TINYSEARCH_DATA_DIR, 'test-urls.zstd')
-RECALL_AT_K = 3
-
-NUM_QUERY_CHARS = 10
-
-
-def get_test_pages():
-    serializer = ZstdJsonSerializer()
-    with open(TEST_DATA_PATH, 'rb') as data_file:
-        data = serializer.deserialize(data_file.read())
-        return [(row['title'], row['url']) for row in data if row['title'] is not None]
-
-
-def query_test():
-    titles_and_urls = get_test_pages()
-    print(f"Got {len(titles_and_urls)} titles and URLs")
-    tiny_index = TinyIndex(Document, TEST_INDEX_PATH, TEST_NUM_PAGES, TEST_PAGE_SIZE)
-
-    app = create_app.create()
-    client = TestClient(app)
-
-    start = datetime.now()
-    hits = 0
-    count = 0
-    for title, url in titles_and_urls:
-        query = title[:NUM_QUERY_CHARS]
-        result = client.get('/complete', params={'q': query})
-        assert result.status_code == 200
-        data = result.json()
-
-        hit = False
-        if data:
-            for result in data[1][:RECALL_AT_K]:
-                if url in result:
-                    hit = True
-                    break
-
-        if hit:
-            hits += 1
-        else:
-            print("Miss", data, title, url, sep='\n')
-
-        count += 1
-
-    end = datetime.now()
-    print(f"Hits: {hits} out of {count}")
-    print(f"Recall at {RECALL_AT_K}: {hits/count}")
-    print("Query time:", (end - start).total_seconds() / NUM_DOCUMENTS)
-
-
-def page_stats(indexer: TinyIndexer):
-    pages_and_sizes = []
-    for i in range(TEST_NUM_PAGES):
-        page = indexer.get_page(i)
-        if page is not None:
-            pages_and_sizes.append((len(page), page))
-    big_page_sizes, big_pages = zip(*sorted(pages_and_sizes, reverse=True)[:NUM_PAGES_FOR_STATS])
-    return np.mean(big_page_sizes), np.std(big_page_sizes), big_pages
-
-
-def performance_test():
-    nlp = English()
-    try:
-        os.remove(TEST_INDEX_PATH)
-    except FileNotFoundError:
-        print("No test index found, creating")
-    with TinyIndexer(Document, TEST_INDEX_PATH, TEST_NUM_PAGES, TEST_PAGE_SIZE) as indexer:
-        titles_and_urls = get_test_pages()
-
-        start_time = datetime.now()
-        index_titles_urls_and_extracts(indexer, nlp, titles_and_urls, TEST_TERMS_PATH)
-        stop_time = datetime.now()
-
-        index_time = (stop_time - start_time).total_seconds()
-        index_size = os.path.getsize(TEST_INDEX_PATH)
-
-        page_size_mean, page_size_std, big_pages = page_stats(indexer)
-
-    print("Indexed pages:", NUM_DOCUMENTS)
-    print("Index time:", index_time)
-    print("Index size:", index_size)
-    print("Mean docs per page:", page_size_mean)
-    print("Std err of docs per page:", page_size_std)
-    print("Big pages")
-    print_pages(big_pages)
-    # print("Num tokens", indexer.get_num_tokens())
-
-    query_test()
-
-
-def print_pages(pages):
-    for page in pages:
-        print("Page", page)
-        for title, url in page:
-            print(title, url)
-        print()
-
-
-if __name__ == '__main__':
-    performance_test()

+ 8 - 7
mwmbl/indexer/index.py

@@ -8,10 +8,10 @@ from urllib.parse import unquote
 
 import pandas as pd
 
-# NUM_PAGES = 8192
-# PAGE_SIZE = 512
 from mwmbl.tinysearchengine.indexer import Document, TokenizedDocument, TinyIndex
 
+DEFAULT_SCORE = 0
+
 HTTP_START = 'http://'
 HTTPS_START = 'https://'
 BATCH_SIZE = 100
@@ -44,7 +44,7 @@ def prepare_url_for_tokenizing(url: str):
     return url
 
 
-def get_pages(nlp, titles_urls_and_extracts) -> Iterable[TokenizedDocument]:
+def get_pages(nlp, titles_urls_and_extracts, link_counts) -> Iterable[TokenizedDocument]:
     for i, (title_cleaned, url, extract) in enumerate(titles_urls_and_extracts):
         title_tokens = tokenize(nlp, title_cleaned)
         prepared_url = prepare_url_for_tokenizing(unquote(url))
@@ -52,7 +52,8 @@ def get_pages(nlp, titles_urls_and_extracts) -> Iterable[TokenizedDocument]:
         extract_tokens = tokenize(nlp, extract)
         print("Extract tokens", extract_tokens)
         tokens = title_tokens | url_tokens | extract_tokens
-        yield TokenizedDocument(tokens=list(tokens), url=url, title=title_cleaned, extract=extract)
+        score = link_counts.get(url, DEFAULT_SCORE)
+        yield TokenizedDocument(tokens=list(tokens), url=url, title=title_cleaned, extract=extract, score=score)
 
         if i % 1000 == 0:
             print("Processed", i)
@@ -66,12 +67,12 @@ def grouper(n: int, iterator: Iterator):
         yield chunk
 
 
-def index_titles_urls_and_extracts(indexer: TinyIndex, nlp, titles_urls_and_extracts, terms_path):
+def index_titles_urls_and_extracts(indexer: TinyIndex, nlp, titles_urls_and_extracts, link_counts, terms_path):
     terms = Counter()
-    pages = get_pages(nlp, titles_urls_and_extracts)
+    pages = get_pages(nlp, titles_urls_and_extracts, link_counts)
     for page in pages:
         for token in page.tokens:
-            indexer.index(token, Document(url=page.url, title=page.title, extract=page.extract))
+            indexer.index(token, Document(url=page.url, title=page.title, extract=page.extract, score=page.score))
         terms.update([t.lower() for t in page.tokens])
 
     term_df = pd.DataFrame({
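
Note: the change above threads a new link_counts argument through get_pages and
index_titles_urls_and_extracts, so each document is scored by its inbound link
count, falling back to DEFAULT_SCORE (0) for unknown URLs. A minimal sketch of
calling the new get_pages signature; the sample title, URL and count are made
up, and the blank English pipeline is an assumption (the codebase uses spaCy
pipelines elsewhere):

    # Sketch only: exercises the link_counts scoring path added in this commit.
    from spacy.lang.en import English

    from mwmbl.indexer.index import get_pages

    nlp = English()

    # (title, url, extract) triples as consumed by get_pages.
    titles_urls_and_extracts = [
        ("Example Title", "https://example.com/page", "An example extract."),
    ]
    # link_counts maps a URL to its inbound link count.
    link_counts = {"https://example.com/page": 42}

    for doc in get_pages(nlp, titles_urls_and_extracts, link_counts):
        # URLs missing from link_counts get DEFAULT_SCORE (0).
        print(doc.url, doc.score)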

+ 5 - 4
mwmbl/indexer/index_crawl.py

@@ -1,13 +1,14 @@
 """
 Index data crawled through the Mwmbl crawler.
 """
+import json
 from logging import getLogger
 
 import spacy
 
 from mwmbl.indexer.fsqueue import FSQueue, GzipJsonBlobSerializer, FSQueueError
 from mwmbl.indexer.index import index_titles_urls_and_extracts
-from mwmbl.indexer.paths import INDEX_PATH, MWMBL_CRAWL_TERMS_PATH, TINYSEARCH_DATA_DIR
+from mwmbl.indexer.paths import INDEX_PATH, MWMBL_CRAWL_TERMS_PATH, TINYSEARCH_DATA_DIR, LINK_COUNT_PATH
 from mwmbl.tinysearchengine.indexer import TinyIndex, Document, NUM_PAGES, PAGE_SIZE
 
 
@@ -16,12 +17,12 @@ logger = getLogger(__name__)
 
 def index_mwmbl_crawl_data():
     nlp = spacy.load("en_core_web_sm")
+    titles_urls_and_extracts = get_mwmbl_crawl_titles_urls_and_extracts()
+    link_counts = json.load(open(LINK_COUNT_PATH))
 
     TinyIndex.create(Document, INDEX_PATH, NUM_PAGES, PAGE_SIZE)
-
     with TinyIndex(Document, INDEX_PATH, 'w') as indexer:
-        titles_urls_and_extracts = get_mwmbl_crawl_titles_urls_and_extracts()
-        index_titles_urls_and_extracts(indexer, nlp, titles_urls_and_extracts, MWMBL_CRAWL_TERMS_PATH)
+        index_titles_urls_and_extracts(indexer, nlp, titles_urls_and_extracts, link_counts, MWMBL_CRAWL_TERMS_PATH)
 
 
 def get_mwmbl_crawl_titles_urls_and_extracts():
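
Note: index_mwmbl_crawl_data now reads link counts with
json.load(open(LINK_COUNT_PATH)) and looks URLs up via link_counts.get(url, DEFAULT_SCORE),
which suggests a flat JSON object mapping URL strings to numeric counts. The
producer of that file is not part of this commit; the sketch below only
illustrates the assumed shape, with a stand-in filename for LINK_COUNT_PATH:

    # Hedged sketch of the link-count file the indexer is assumed to expect.
    import json

    link_counts = {
        "https://en.wikipedia.org/wiki/Python_(programming_language)": 1203,
        "https://example.com/rarely-linked-page": 1,
    }

    # "link-counts.json" stands in for LINK_COUNT_PATH, whose value is not shown here.
    with open("link-counts.json", "w") as f:
        json.dump(link_counts, f)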

+ 0 - 41
mwmbl/indexer/wiki.py

@@ -1,41 +0,0 @@
-"""
-Index Wikipedia
-"""
-import gzip
-import html
-from urllib.parse import quote
-
-from spacy.lang.en import English
-
-from .index import index_titles_urls_and_extracts
-from mwmbl.tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE
-from .paths import WIKI_TITLES_PATH, INDEX_PATH
-
-TEXT_TAGS = ['mediawiki', 'page', 'revision', 'text']
-TITLE_START = '<title>Wikipedia: '
-TITLE_END = '</title>\n'
-
-
-def index_wiki():
-    nlp = English()
-    with TinyIndexer(INDEX_PATH, NUM_PAGES, PAGE_SIZE) as indexer:
-        titles_and_urls = get_wiki_titles_and_urls()
-        index_titles_urls_and_extracts(indexer, nlp, titles_and_urls)
-
-
-def get_wiki_titles_and_urls():
-    start_len = len(TITLE_START)
-    end_len = len(TITLE_END)
-    with gzip.open(WIKI_TITLES_PATH, 'rt') as wiki_titles_file:
-        wiki_titles_file.readline()
-        for raw_title in wiki_titles_file:
-            assert raw_title.startswith(TITLE_START)
-            assert raw_title.endswith(TITLE_END)
-            title = raw_title[start_len:-end_len]
-            unescaped_title = html.unescape(title)
-            url = 'https://en.wikipedia.org/wiki/' + quote(unescaped_title.replace(' ', '_'))
-            yield unescaped_title, url
-
-
-if __name__ == '__main__':
-    index_wiki()

+ 2 - 1
mwmbl/tinysearchengine/indexer.py

@@ -12,7 +12,7 @@ VERSION = 1
 METADATA_CONSTANT = b'mwmbl-tiny-search'
 METADATA_SIZE = 4096
 
-NUM_PAGES = 76800
+NUM_PAGES = 128000
 PAGE_SIZE = 4096
 
 
@@ -21,6 +21,7 @@ class Document:
     title: str
     url: str
     extract: str
+    score: float
 
 
 @dataclass
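
Note: Document gains a score field (and NUM_PAGES grows from 76800 to 128000).
A minimal sketch of constructing the extended dataclass, with a made-up score;
in this commit the score comes from the crawler's link counts:

    from mwmbl.tinysearchengine.indexer import Document

    doc = Document(
        title="Example page",
        url="https://example.com/",
        extract="A short extract of the page text.",
        score=3.0,  # illustrative value only
    )
    print(doc)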