Optimise queries

Daoud Clarke, 2021-03-25 08:38:09 +00:00
parent 3859b85fc8, commit d6809fc6f4
3 changed files with 50 additions and 12 deletions

app.py

@@ -33,7 +33,7 @@ def complete_term(term):
     """
     result = con.execute(query, (term,))
     completed = result.fetchone()
-    print("Completed", completed)
+    # print("Completed", completed)
     if len(completed) > 0:
         return completed[0]
     return None
@@ -49,24 +49,23 @@ def complete(q: str):
    con = sqlite3.connect(INDEX_PATH)
    in_part = ','.join('?'*len(terms))
    query = f"""
-   SELECT title, url, count(*), length(title)
+   SELECT title, url
    FROM terms INNER JOIN pages
    ON terms.page_id = pages.id
    WHERE term IN ({in_part})
    GROUP BY title, url
-   ORDER BY 3 DESC, 4
-   LIMIT 20
+   ORDER BY count(*) DESC, length(title)
+   LIMIT 5
    """
-   data = pd.read_sql(query, con, params=terms)
-   results = data.apply(lambda row: row.title.replace("\n", "") + '' +
-                        row.url.replace("\n", ""), axis=1)
+   data = con.execute(query, terms).fetchall()
+   results = [title.replace("\n", "") + '' +
+              url.replace("\n", "") for title, url in data]
    if len(results) == 0:
        return []
-   results_list = results.to_list()[:5]
-   results_list = [q, results_list]
-   print("Results", results_list)
-   return results_list
+   # print("Results", results_list)
+   return [q, results]

 @app.get('/')
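
The change above pushes the ranking (ORDER BY count(*) DESC, length(title)) and the LIMIT down into SQLite and drops the pandas round trip, so no DataFrame is built per request. A minimal self-contained sketch of the same pattern, with the schema taken from the diff and a couple of invented rows for illustration:

    import sqlite3

    # Stand-in for the real index; the schema matches the query in the diff.
    con = sqlite3.connect(":memory:")
    con.executescript("""
        CREATE TABLE pages (id INTEGER PRIMARY KEY, title TEXT, url TEXT);
        CREATE TABLE terms (term TEXT, page_id INTEGER);
        INSERT INTO pages VALUES (1, 'Python', 'https://example.com/python');
        INSERT INTO terms VALUES ('python', 1), ('language', 1);
    """)

    terms = ['python', 'language']
    in_part = ','.join('?' * len(terms))
    query = f"""
        SELECT title, url
        FROM terms INNER JOIN pages ON terms.page_id = pages.id
        WHERE term IN ({in_part})
        GROUP BY title, url
        ORDER BY count(*) DESC, length(title)
        LIMIT 5
    """
    # fetchall() on the raw connection returns plain tuples; no DataFrame.
    print(con.execute(query, terms).fetchall())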

index.py

@@ -116,6 +116,15 @@ class Indexer:
         num_terms = cursor.fetchone()[0]
         return num_terms

+    def get_random_terms(self, n):
+        con = sqlite3.connect(self.index_path)
+        cursor = con.execute("""
+            SELECT DISTINCT term FROM terms
+            ORDER BY random() LIMIT ?
+        """, (n,))
+        terms = [t[0] for t in cursor.fetchall()]
+        return terms
+

 def run():
     indexer = Indexer(INDEX_PATH)
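
get_random_terms samples distinct terms from the index, presumably for generating benchmark queries; note that sqlite3 needs the (n,) parameter tuple to bind the LIMIT ? placeholder. A hypothetical usage sketch, assuming INDEX_PATH is exported from paths alongside TEST_INDEX_PATH and the index has already been built:

    from index import Indexer
    from paths import INDEX_PATH  # assumed export, as with TEST_INDEX_PATH

    indexer = Indexer(INDEX_PATH)
    print(indexer.get_random_terms(10))  # ten distinct terms, randomly sampled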

performance test module

@@ -1,17 +1,44 @@
 """
 Test the performance of the search in terms of compression and speed.
 """
+import json
 import os
 from datetime import datetime
 from itertools import islice

 from spacy.lang.en import English
+from starlette.testclient import TestClient

+from app import app, complete
 from index import Indexer, index_titles_and_urls
 from paths import TEST_INDEX_PATH
 from wiki import get_wiki_titles_and_urls

+NUM_PAGES = 500
+
+
+def query_test():
+    titles_and_urls = get_wiki_titles_and_urls()
+    # client = TestClient(app)
+
+    start = datetime.now()
+    hits = 0
+    for title, url in islice(titles_and_urls, NUM_PAGES):
+        # result = client.get('/complete', params={'q': title})
+        # assert result.status_code == 200
+        # data = result.content.decode('utf8')
+        data = json.dumps(complete(title))
+        if url in data:
+            hits += 1
+
+    end = datetime.now()
+    print("Hits:", hits)
+    print("Query time:", (end - start).total_seconds()/NUM_PAGES)
+
+
 def performance_test():
     nlp = English()
     try:
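
Calling complete() in-process, as query_test now does, measures the query itself without HTTP overhead; the commented-out TestClient lines exercise the full request path instead. That variant would look like this, using the /complete route the comments point at and an example Wikipedia title:

    from starlette.testclient import TestClient
    from app import app

    client = TestClient(app)
    result = client.get('/complete', params={'q': 'Python (programming language)'})
    assert result.status_code == 200
    print(result.json())  # [query, [title + url strings]], per complete()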
@@ -20,7 +47,7 @@ def performance_test():
         print("No test index found, creating")
         indexer = Indexer(TEST_INDEX_PATH)
         titles_and_urls = get_wiki_titles_and_urls()
-        titles_and_urls_slice = islice(titles_and_urls, 50000)
+        titles_and_urls_slice = islice(titles_and_urls, NUM_PAGES)

         start_time = datetime.now()
         index_titles_and_urls(indexer, nlp, titles_and_urls_slice)
@@ -29,10 +56,13 @@ def performance_test():
     index_time = (stop_time - start_time).total_seconds()
     index_size = os.path.getsize(TEST_INDEX_PATH)
+    print("Indexed pages:", NUM_PAGES)
     print("Index time:", index_time)
     print("Index size", index_size)
     print("Num tokens", indexer.get_num_tokens())

+    query_test()
+

 if __name__ == '__main__':
     performance_test()
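
The reported query time is total wall-clock time divided by NUM_PAGES, the same one-timer-around-the-loop pattern used for the indexing timer. A generic, self-contained sketch of that measurement (the squaring function is a stand-in for the real per-query work):

    from datetime import datetime

    def mean_seconds_per_call(fn, inputs):
        # Wall-clock mean, as in query_test: one timer around the whole loop.
        start = datetime.now()
        for x in inputs:
            fn(x)
        return (datetime.now() - start).total_seconds() / len(inputs)

    print(mean_seconds_per_call(lambda x: x * x, range(1000)))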