Store the best items, not the worst ones
This commit is contained in:
parent
aa5878fd2f
commit
ae658906dd
3 changed files with 2 additions and 5 deletions
|
@@ -29,7 +29,7 @@ def store():
|
|||
|
||||
def get_items():
|
||||
with TinyIndex(Document, INDEX_PATH) as tiny_index:
|
||||
-items = tiny_index.retrieve('search')
|
||||
+items = tiny_index.retrieve('wikipedia')
|
||||
if items:
|
||||
for item in items:
|
||||
print("Items", item)
|
||||
|
|
|
@@ -145,8 +145,6 @@ class URLDatabase:
|
|||
|
||||
return [result[0] for result in results]
|
||||
|
||||
|
||||
|
||||
def get_url_scores(self, urls: list[str]) -> dict[str, float]:
|
||||
sql = f"""
|
||||
SELECT url, score FROM urls WHERE url IN %(urls)s
|
||||
|
|
|
@@ -70,7 +70,7 @@ def index_pages(index_path, page_documents):
|
|||
existing_documents = indexer.get_page(page)
|
||||
seen_urls = set()
|
||||
seen_titles = set()
|
||||
-sorted_documents = sorted(documents + existing_documents, key=lambda x: x.score)
|
||||
+sorted_documents = sorted(documents + existing_documents, key=lambda x: x.score, reverse=True)
|
||||
for document in sorted_documents:
|
||||
if document.title in seen_titles or document.url in seen_urls:
|
||||
continue
|
||||
|
@@ -78,7 +78,6 @@ def index_pages(index_path, page_documents):
|
|||
seen_urls.add(document.url)
|
||||
seen_titles.add(document.title)
|
||||
indexer.store_in_page(page, new_documents)
|
||||
logger.debug(f"Wrote page {page} with {len(new_documents)} documents")
|
||||
|
||||
|
||||
def preprocess_documents(documents, index_path, nlp):
|
||||
|
|
Loading…
Add table
Reference in a new issue