Optimise queries
This commit is contained in:
parent
3859b85fc8
commit
d6809fc6f4
3 changed files with 50 additions and 12 deletions
21
app.py
21
app.py
|
@ -33,7 +33,7 @@ def complete_term(term):
|
|||
"""
|
||||
result = con.execute(query, (term,))
|
||||
completed = result.fetchone()
|
||||
print("Completed", completed)
|
||||
# print("Completed", completed)
|
||||
if len(completed) > 0:
|
||||
return completed[0]
|
||||
return None
|
||||
|
@ -49,24 +49,23 @@ def complete(q: str):
|
|||
con = sqlite3.connect(INDEX_PATH)
|
||||
in_part = ','.join('?'*len(terms))
|
||||
query = f"""
|
||||
SELECT title, url, count(*), length(title)
|
||||
SELECT title, url
|
||||
FROM terms INNER JOIN pages
|
||||
ON terms.page_id = pages.id
|
||||
WHERE term IN ({in_part})
|
||||
GROUP BY title, url
|
||||
ORDER BY 3 DESC, 4
|
||||
LIMIT 20
|
||||
ORDER BY count(*) DESC, length(title)
|
||||
LIMIT 5
|
||||
"""
|
||||
|
||||
data = pd.read_sql(query, con, params=terms)
|
||||
results = data.apply(lambda row: row.title.replace("\n", "") + ' — ' +
|
||||
row.url.replace("\n", ""), axis=1)
|
||||
data = con.execute(query, terms).fetchall()
|
||||
|
||||
results = [title.replace("\n", "") + ' — ' +
|
||||
url.replace("\n", "") for title, url in data]
|
||||
if len(results) == 0:
|
||||
return []
|
||||
results_list = results.to_list()[:5]
|
||||
results_list = [q, results_list]
|
||||
print("Results", results_list)
|
||||
return results_list
|
||||
# print("Results", results_list)
|
||||
return [q, results]
|
||||
|
||||
|
||||
@app.get('/')
|
||||
|
|
9
index.py
9
index.py
|
@ -116,6 +116,15 @@ class Indexer:
|
|||
num_terms = cursor.fetchone()[0]
|
||||
return num_terms
|
||||
|
||||
def get_random_terms(self, n):
    """
    Return up to ``n`` distinct terms chosen at random from the index.

    Opens a connection to the SQLite database at ``self.index_path``,
    selects distinct values of the ``term`` column from the ``terms``
    table in random order, and closes the connection before returning.

    :param n: maximum number of terms to return (bound to the SQL LIMIT).
    :return: list with the selected ``term`` values, one per row.
    """
    con = sqlite3.connect(self.index_path)
    try:
        # The statement contains one ``?`` placeholder (the LIMIT), so ``n``
        # has to be supplied as a bound parameter; omitting it makes sqlite3
        # raise ProgrammingError for the missing binding.
        cursor = con.execute("""
            SELECT DISTINCT term FROM terms
            ORDER BY random() LIMIT ?
        """, (n,))
        return [row[0] for row in cursor.fetchall()]
    finally:
        # Always release the connection, even if the query fails.
        con.close()
|
||||
|
||||
|
||||
def run():
|
||||
indexer = Indexer(INDEX_PATH)
|
||||
|
|
|
@ -1,17 +1,44 @@
|
|||
"""
|
||||
Test the performance of the search in terms of compression and speed.
|
||||
"""
|
||||
import json
|
||||
import os
|
||||
from datetime import datetime
|
||||
from itertools import islice
|
||||
|
||||
from spacy.lang.en import English
|
||||
from starlette.testclient import TestClient
|
||||
|
||||
from app import app, complete
|
||||
from index import Indexer, index_titles_and_urls
|
||||
from paths import TEST_INDEX_PATH
|
||||
from wiki import get_wiki_titles_and_urls
|
||||
|
||||
|
||||
# Number of wiki (title, url) pairs taken from the source iterator, both when
# building the test index and when issuing test queries.
NUM_PAGES = 500
|
||||
|
||||
|
||||
def query_test():
    """
    Query the completer with the first NUM_PAGES wiki titles and report results.

    For every (title, url) pair the title is passed to ``complete`` and the
    JSON-serialised result is searched for the page's URL; each match counts
    as a hit. Prints the number of hits and the elapsed time divided by
    NUM_PAGES.
    """
    pages = get_wiki_titles_and_urls()

    # ``complete`` is called directly instead of going through the HTTP
    # test client.
    began = datetime.now()
    hits = sum(
        1
        for title, url in islice(pages, NUM_PAGES)
        if url in json.dumps(complete(title))
    )
    finished = datetime.now()

    elapsed_seconds = (finished - began).total_seconds()
    print("Hits:", hits)
    print("Query time:", elapsed_seconds/NUM_PAGES)
|
||||
|
||||
|
||||
def performance_test():
|
||||
nlp = English()
|
||||
try:
|
||||
|
@ -20,7 +47,7 @@ def performance_test():
|
|||
print("No test index found, creating")
|
||||
indexer = Indexer(TEST_INDEX_PATH)
|
||||
titles_and_urls = get_wiki_titles_and_urls()
|
||||
titles_and_urls_slice = islice(titles_and_urls, 50000)
|
||||
titles_and_urls_slice = islice(titles_and_urls, NUM_PAGES)
|
||||
|
||||
start_time = datetime.now()
|
||||
index_titles_and_urls(indexer, nlp, titles_and_urls_slice)
|
||||
|
@ -29,10 +56,13 @@ def performance_test():
|
|||
index_time = (stop_time - start_time).total_seconds()
|
||||
index_size = os.path.getsize(TEST_INDEX_PATH)
|
||||
|
||||
print("Indexed pages:", NUM_PAGES)
|
||||
print("Index time:", index_time)
|
||||
print("Index size", index_size)
|
||||
print("Num tokens", indexer.get_num_tokens())
|
||||
|
||||
query_test()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
performance_test()
|
||||
|
|
Loading…
Add table
Reference in a new issue