Store the best items, not the worst ones

This commit is contained in:
Daoud Clarke 2022-07-31 22:55:15 +01:00
parent aa5878fd2f
commit ae658906dd
3 changed files with 2 additions and 5 deletions

View file

@@ -29,7 +29,7 @@ def store():
def get_items():
with TinyIndex(Document, INDEX_PATH) as tiny_index:
- items = tiny_index.retrieve('search')
+ items = tiny_index.retrieve('wikipedia')
if items:
for item in items:
print("Items", item)

View file

@@ -145,8 +145,6 @@ class URLDatabase:
return [result[0] for result in results]
def get_url_scores(self, urls: list[str]) -> dict[str, float]:
sql = f"""
SELECT url, score FROM urls WHERE url IN %(urls)s

View file

@@ -70,7 +70,7 @@ def index_pages(index_path, page_documents):
existing_documents = indexer.get_page(page)
seen_urls = set()
seen_titles = set()
- sorted_documents = sorted(documents + existing_documents, key=lambda x: x.score)
+ sorted_documents = sorted(documents + existing_documents, key=lambda x: x.score, reverse=True)
for document in sorted_documents:
if document.title in seen_titles or document.url in seen_urls:
continue
@@ -78,7 +78,6 @@ def index_pages(index_path, page_documents):
seen_urls.add(document.url)
seen_titles.add(document.title)
indexer.store_in_page(page, new_documents)
logger.debug(f"Wrote page {page} with {len(new_documents)} documents")
def preprocess_documents(documents, index_path, nlp):