
Optimise queries

Daoud Clarke, 4 years ago
commit d6809fc6f4
3 changed files with 50 additions and 12 deletions:
  1. app.py (+10, -11)
  2. index.py (+9, -0)
  3. performance.py (+31, -1)

app.py (+10, -11)

@@ -33,7 +33,7 @@ def complete_term(term):
     """
     result = con.execute(query, (term,))
     completed = result.fetchone()
-    print("Completed", completed)
+    # print("Completed", completed)
     if len(completed) > 0:
         return completed[0]
     return None
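
Note on the unchanged context above: sqlite3's Cursor.fetchone() returns None
when no row matches, so len(completed) would raise a TypeError on a miss. This
commit leaves that path alone; a safer guard might look like this sketch:

    result = con.execute(query, (term,))
    completed = result.fetchone()  # None when no row matches
    if completed is not None:
        return completed[0]
    return None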
@@ -49,24 +49,23 @@ def complete(q: str):
     con = sqlite3.connect(INDEX_PATH)
     in_part = ','.join('?'*len(terms))
     query = f"""
-        SELECT title, url, count(*), length(title)
+        SELECT title, url
         FROM terms INNER JOIN pages
         ON terms.page_id = pages.id
         WHERE term IN ({in_part})
         GROUP BY title, url
-        ORDER BY 3 DESC, 4
-        LIMIT 20
+        ORDER BY count(*) DESC, length(title)
+        LIMIT 5
     """
 
-    data = pd.read_sql(query, con, params=terms)
-    results = data.apply(lambda row: row.title.replace("\n", "") + ' — ' +
-                                     row.url.replace("\n", ""), axis=1)
+    data = con.execute(query, terms).fetchall()
+
+    results = [title.replace("\n", "") + ' — ' +
+               url.replace("\n", "") for title, url in data]
     if len(results) == 0:
         return []
-    results_list = results.to_list()[:5]
-    results_list = [q, results_list]
-    print("Results", results_list)
-    return results_list
+    # print("Results", results)
+    return [q, results]
 
 
 @app.get('/')

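Taken together, the app.py changes let SQLite do the ranking and truncation
(ORDER BY count(*) DESC, length(title) with LIMIT 5) instead of pulling a wide
result set into pandas and sorting, slicing, and formatting it in Python. A
minimal sketch of the optimised path, with top_results as a hypothetical helper
name and a connection plus tokenised terms assumed as in app.py:

    import sqlite3

    def top_results(con, terms):
        # One placeholder per term; rank and truncate inside SQL.
        in_part = ','.join('?' * len(terms))
        query = f"""
            SELECT title, url
            FROM terms INNER JOIN pages
            ON terms.page_id = pages.id
            WHERE term IN ({in_part})
            GROUP BY title, url
            ORDER BY count(*) DESC, length(title)
            LIMIT 5
        """
        rows = con.execute(query, terms).fetchall()
        # Format each hit the same way the endpoint does.
        return [title.replace("\n", "") + ' — ' + url.replace("\n", "")
                for title, url in rows]
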
index.py (+9, -0)

@@ -116,6 +116,15 @@ class Indexer:
         num_terms = cursor.fetchone()[0]
         return num_terms
 
+    def get_random_terms(self, n):
+        con = sqlite3.connect(self.index_path)
+        cursor = con.execute("""
+            SELECT DISTINCT term FROM terms
+            ORDER BY random() LIMIT ?
+        """)
+        terms = [t[0] for t in cursor.fetchall()]
+        return terms
+
 
 def run():
     indexer = Indexer(INDEX_PATH)

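get_random_terms samples distinct terms for benchmarking; ORDER BY random()
sorts every distinct term, which is fine on a small test index but would be
slow at scale. A hypothetical usage sketch, assuming an index already built at
TEST_INDEX_PATH:

    from index import Indexer
    from paths import TEST_INDEX_PATH

    indexer = Indexer(TEST_INDEX_PATH)
    for term in indexer.get_random_terms(10):  # sample 10 distinct terms
        print(term)
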
performance.py (+31, -1)

@@ -1,17 +1,44 @@
 """
 Test the performance of the search in terms of compression and speed.
 """
+import json
 import os
 from datetime import datetime
 from itertools import islice
 
 from spacy.lang.en import English
+from starlette.testclient import TestClient
 
+from app import app, complete
 from index import Indexer, index_titles_and_urls
 from paths import TEST_INDEX_PATH
 from wiki import get_wiki_titles_and_urls
 
 
+NUM_PAGES = 500
+
+
+def query_test():
+    titles_and_urls = get_wiki_titles_and_urls()
+
+    # client = TestClient(app)
+
+    start = datetime.now()
+    hits = 0
+    for title, url in islice(titles_and_urls, NUM_PAGES):
+        # result = client.get('/complete', params={'q': title})
+        # assert result.status_code == 200
+        # data = result.content.decode('utf8')
+        data = json.dumps(complete(title))
+
+        if url in data:
+            hits += 1
+
+    end = datetime.now()
+    print("Hits:", hits)
+    print("Query time:", (end - start).total_seconds()/NUM_PAGES)
+
+
 def performance_test():
     nlp = English()
     try:
@@ -20,7 +47,7 @@ def performance_test():
         print("No test index found, creating")
     indexer = Indexer(TEST_INDEX_PATH)
     titles_and_urls = get_wiki_titles_and_urls()
-    titles_and_urls_slice = islice(titles_and_urls, 50000)
+    titles_and_urls_slice = islice(titles_and_urls, NUM_PAGES)
 
     start_time = datetime.now()
     index_titles_and_urls(indexer, nlp, titles_and_urls_slice)
@@ -29,10 +56,13 @@ def performance_test():
     index_time = (stop_time - start_time).total_seconds()
     index_size = os.path.getsize(TEST_INDEX_PATH)
 
+    print("Indexed pages:", NUM_PAGES)
     print("Index time:", index_time)
     print("Index size", index_size)
     print("Num tokens", indexer.get_num_tokens())
 
+    query_test()
+
 
 if __name__ == '__main__':
     performance_test()
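
The hit check in query_test works because complete() returns [q, results],
where each result is a "title — url" string; json.dumps flattens that to one
string, so a plain substring test finds the URL. An illustrative sketch of the
payload shape (values invented for the example):

    import json

    # complete("Python") might return something shaped like this:
    payload = ["Python", [
        "Python (programming language) — https://en.wikipedia.org/wiki/Python_(programming_language)",
    ]]
    data = json.dumps(payload)
    assert "https://en.wikipedia.org/wiki/Python_(programming_language)" in data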