
Merge pull request #71 from mwmbl/fix-missing-scores

Store the best items, not the worst ones
Daoud Clarke, 2 years ago
commit 046f86f7e3
3 changed files with 2 additions and 5 deletions
  1. analyse/inspect_index.py  +1 -1
  2. mwmbl/crawler/urls.py  +0 -2
  3. mwmbl/indexer/index_batches.py  +1 -2

analyse/inspect_index.py  +1 -1

@@ -29,7 +29,7 @@ def store():
 
 def get_items():
     with TinyIndex(Document, INDEX_PATH) as tiny_index:
-        items = tiny_index.retrieve('search')
+        items = tiny_index.retrieve('wikipedia')
         if items:
             for item in items:
                 print("Items", item)
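As a side note, the helper above probes the index for a single hard-coded term. Below is a minimal sketch of a parameterised variant, assuming the same TinyIndex, Document and INDEX_PATH names used in analyse/inspect_index.py; the command-line handling is hypothetical and not part of this change.

    import sys

    def get_items_for_term(term: str):
        # Same pattern as get_items() above, but the term is an argument.
        with TinyIndex(Document, INDEX_PATH) as tiny_index:
            items = tiny_index.retrieve(term)
            for item in items or []:
                print("Item", item)

    if __name__ == "__main__":
        # Default to 'wikipedia', the term this commit switches to.
        get_items_for_term(sys.argv[1] if len(sys.argv) > 1 else "wikipedia")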

mwmbl/crawler/urls.py  +0 -2

@@ -145,8 +145,6 @@ class URLDatabase:
 
         return [result[0] for result in results]
 
-
-
     def get_url_scores(self, urls: list[str]) -> dict[str, float]:
         sql = f"""
         SELECT url, score FROM urls WHERE url IN %(urls)s
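For reference, get_url_scores is annotated as taking a list of URLs and returning a URL-to-score mapping. A usage sketch, assuming an already-constructed URLDatabase instance url_db (its constructor arguments are not shown in this diff) and made-up example URLs:

    # Sketch only: url_db is assumed to be a connected URLDatabase instance.
    scores = url_db.get_url_scores([
        "https://en.wikipedia.org/wiki/Search_engine",
        "https://example.com/",
    ])
    # Per the type hints above, scores is a dict[str, float] mapping each
    # known URL to its stored score.
    for url, score in scores.items():
        print(url, score)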

mwmbl/indexer/index_batches.py  +1 -2

@@ -70,7 +70,7 @@ def index_pages(index_path, page_documents):
             existing_documents = indexer.get_page(page)
             seen_urls = set()
             seen_titles = set()
-            sorted_documents = sorted(documents + existing_documents, key=lambda x: x.score)
+            sorted_documents = sorted(documents + existing_documents, key=lambda x: x.score, reverse=True)
             for document in sorted_documents:
                 if document.title in seen_titles or document.url in seen_urls:
                     continue
@@ -78,7 +78,6 @@ def index_pages(index_path, page_documents):
                 seen_urls.add(document.url)
                 seen_titles.add(document.title)
             indexer.store_in_page(page, new_documents)
-            logger.debug(f"Wrote page {page} with {len(new_documents)} documents")
 
 
 def preprocess_documents(documents, index_path, nlp):
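The behavioural fix is the added reverse=True in index_pages: when new and existing documents for a page are merged, duplicates are now resolved in favour of the highest-scoring copy rather than the lowest. A standalone sketch of that selection logic, using a simplified stand-in for mwmbl's Document class (the new_documents.append call sits between the two hunks above, so this is a reconstruction, not the exact code):

    from dataclasses import dataclass

    @dataclass
    class Doc:  # hypothetical stand-in for mwmbl's Document
        title: str
        url: str
        score: float

    def pick_best(documents: list[Doc], existing: list[Doc]) -> list[Doc]:
        seen_urls: set[str] = set()
        seen_titles: set[str] = set()
        best: list[Doc] = []
        # reverse=True puts the highest scores first, so the first (kept) copy
        # of any duplicate title or URL is the best-scoring one; the previous
        # ascending sort kept the worst one instead.
        for doc in sorted(documents + existing, key=lambda d: d.score, reverse=True):
            if doc.title in seen_titles or doc.url in seen_urls:
                continue
            best.append(doc)
            seen_urls.add(doc.url)
            seen_titles.add(doc.title)
        return best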