Track curated items in the index

Daoud Clarke 2023-04-09 06:26:23 +01:00
parent a87d3d6def
commit 00b5438492
5 changed files with 31 additions and 51 deletions

Binary file not shown.


@@ -77,18 +77,3 @@ def tokenize_document(url, title_cleaned, extract, score, nlp):
     # print("High scoring", len(high_scoring_tokens), token_scores, doc)
     document = TokenizedDocument(tokens=list(tokens), url=url, title=title_cleaned, extract=extract, score=score)
     return document
-
-
-def index_titles_urls_and_extracts(indexer: TinyIndex, nlp, titles_urls_and_extracts, link_counts, terms_path):
-    terms = Counter()
-    pages = get_pages(nlp, titles_urls_and_extracts, link_counts)
-    for page in pages:
-        for token in page.tokens:
-            indexer.index(token, Document(url=page.url, title=page.title, extract=page.extract, score=page.score))
-        terms.update([t.lower() for t in page.tokens])
-
-    term_df = pd.DataFrame({
-        'term': terms.keys(),
-        'count': terms.values(),
-    })
-    term_df.to_csv(terms_path)


@@ -7,7 +7,7 @@ import requests
 from fastapi import APIRouter, Response
 from pydantic import BaseModel
 
-from mwmbl.tinysearchengine.indexer import TinyIndex, Document
+from mwmbl.tinysearchengine.indexer import TinyIndex, Document, DocumentState
 from mwmbl.tokenizer import tokenize
@@ -149,11 +149,6 @@ def create_router(index_path: str) -> APIRouter:
        request = requests.post(urljoin(LEMMY_URL, "api/v3/comment"), json=create_comment)
 
        with TinyIndex(Document, index_path, 'w') as indexer:
-            documents = [
-                Document(result.title, result.url, result.extract, MAX_CURATED_SCORE - i)
-                for i, result in enumerate(curation.results)
-            ]
-
            query_string = parse_qs(curation.url)
            if len(query_string) > 1:
                raise ValueError(f"Should be one query string in the URL: {curation.url}")
@@ -166,9 +161,14 @@ def create_router(index_path: str) -> APIRouter:
            print("Query", query)
            tokens = tokenize(query)
            print("Tokens", tokens)
-            key = " ".join(tokens)
-            print("Key", key)
-            page_index = indexer.get_key_page_index(key)
+            term = " ".join(tokens)
+            print("Key", term)
+            documents = [
+                Document(result.title, result.url, result.extract, MAX_CURATED_SCORE - i, term, DocumentState.CURATED.value)
+                for i, result in enumerate(curation.results)
+            ]
+
+            page_index = indexer.get_key_page_index(term)
            print("Page index", page_index)
            print("Storing documents", documents)
            indexer.store_in_page(page_index, documents)
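
These two hunks move document construction to after the query is tokenized, so each curated Document now records the term it was curated for. A minimal sketch of the resulting flow, using the indexer API from this commit; `results` and `index_path` are illustrative stand-ins, and the value of MAX_CURATED_SCORE is a placeholder (the real constant lives in mwmbl.platform.user):

from mwmbl.tinysearchengine.indexer import TinyIndex, Document, DocumentState
from mwmbl.tokenizer import tokenize

MAX_CURATED_SCORE = 1_000_000  # placeholder value; the real constant is imported from mwmbl.platform.user

def store_curated(index_path: str, query: str, results: list) -> None:
    # The joined tokens form the key identifying the page to curate
    term = " ".join(tokenize(query))
    documents = [
        Document(r.title, r.url, r.extract, MAX_CURATED_SCORE - i, term, DocumentState.CURATED.value)
        for i, r in enumerate(results)
    ]
    with TinyIndex(Document, index_path, 'w') as indexer:
        page_index = indexer.get_key_page_index(term)
        indexer.store_in_page(page_index, documents)  # replaces the page wholesale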


@@ -1,10 +1,11 @@
 import json
 import os
-from dataclasses import dataclass, asdict
-from io import UnsupportedOperation, BytesIO
+from dataclasses import dataclass, asdict, field
+from enum import IntEnum
+from io import UnsupportedOperation
 from logging import getLogger
 from mmap import mmap, PROT_READ, PROT_WRITE
-from typing import TypeVar, Generic, Callable, List
+from typing import TypeVar, Generic, Callable, List, Optional
 
 import mmh3
 from zstandard import ZstdDecompressor, ZstdCompressor, ZstdError
@@ -20,7 +21,17 @@ logger = getLogger(__name__)
 
 
 def astuple(dc):
-    return tuple(dc.__dict__.values())
+    """
+    Convert a dataclass to a tuple - values at the end that are None can be truncated.
+    """
+    value = tuple(dc.__dict__.values())
+    while value[-1] is None:
+        value = value[:-1]
+    return value
+
+
+class DocumentState(IntEnum):
+    CURATED = 1
 
 
 @dataclass
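
The rewritten astuple drops trailing None values, so documents without curation metadata keep serialising to the same four-element tuples as before. A self-contained sketch of that behaviour, using a stand-in dataclass rather than the real Document:

from dataclasses import dataclass
from typing import Optional

@dataclass
class Doc:  # stand-in for the real Document
    title: str
    url: str
    extract: str
    score: float
    term: Optional[str] = None
    state: Optional[int] = None

def astuple(dc):
    value = tuple(dc.__dict__.values())
    while value[-1] is None:  # truncate trailing Nones
        value = value[:-1]
    return value

print(astuple(Doc("t", "u", "e", 1.0)))           # ('t', 'u', 'e', 1.0) -- unchanged on-disk shape
print(astuple(Doc("t", "u", "e", 1.0, "q", 1)))   # ('t', 'u', 'e', 1.0, 'q', 1)

(The while loop assumes at least one leading field is non-None, which holds for Document since title and url are always set.)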
@@ -29,11 +40,13 @@ class Document:
     url: str
     extract: str
     score: float
+    term: Optional[str] = None
+    state: Optional[int] = None
 
 
 @dataclass
 class TokenizedDocument(Document):
-    tokens: List[str]
+    tokens: List[str] = field(default_factory=list)
 
 
 T = TypeVar('T')
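
Because Document now ends in defaulted fields, Python requires any dataclass inheriting from it to default its own fields too, which is why tokens gains field(default_factory=list). The defaults also let rows written before this change still load, assuming pages are decoded back into Document via positional arguments:

# Hypothetical old row, as a page's JSON might have stored it
row = ["Example", "https://example.com", "An extract", 3.5]
doc = Document(*row)  # term and state fall back to None
assert doc.term is None and doc.state is None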
@@ -175,23 +188,6 @@ class TinyIndex(Generic[T]):
         # logger.debug(f"Decompressed data: {decompressed_data}")
         return json.loads(decompressed_data.decode('utf8'))
 
-    def index(self, key: str, value: T):
-        assert type(value) == self.item_factory, f"Can only index the specified type" \
-                                                 f" ({self.item_factory.__name__})"
-        page_index = self.get_key_page_index(key)
-        try:
-            self.add_to_page(page_index, [value])
-        except PageError:
-            pass
-
-    def add_to_page(self, page_index: int, values: list[T]):
-        current_page = self._get_page_tuples(page_index)
-        if current_page is None:
-            current_page = []
-        value_tuples = [astuple(value) for value in values]
-        current_page += value_tuples
-        self._write_page(current_page, page_index)
-
     def store_in_page(self, page_index: int, values: list[T]):
         value_tuples = [astuple(value) for value in values]
         self._write_page(value_tuples, page_index)
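
With index and add_to_page removed, store_in_page is the only write path, and its semantics differ: add_to_page appended to whatever tuples a page already held, while store_in_page overwrites the page outright, which is what lets a curation fully replace previous results. A usage sketch under that assumption; index_path is a placeholder:

index_path = "index.tinysearch"  # placeholder path
with TinyIndex(Document, index_path, 'w') as indexer:
    page_index = indexer.get_key_page_index("example query")
    docs = [Document("Example", "https://example.com", "An extract", 1.0)]
    indexer.store_in_page(page_index, docs)  # the page now holds exactly these documents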


@@ -10,7 +10,7 @@ from mwmbl.platform.user import MAX_CURATED_SCORE
 from mwmbl.tokenizer import tokenize, get_bigrams
 from mwmbl.tinysearchengine.completer import Completer
 from mwmbl.hn_top_domains_filtered import DOMAINS
-from mwmbl.tinysearchengine.indexer import TinyIndex, Document
+from mwmbl.tinysearchengine.indexer import TinyIndex, Document, DocumentState
 
 logger = getLogger(__name__)
@@ -162,10 +162,9 @@ class Ranker:
 
         # Check for curation
         curation_term = " ".join(terms)
         curation_items = self.tiny_index.retrieve(curation_term)
-        # TODO: find a better way to track curated pages
-        if curation_items[0].score == MAX_CURATED_SCORE:
-            return curation_items, terms, completions
-
+        curated_items = [d for d in curation_items if d.state == DocumentState.CURATED and d.term == curation_term]
+        if len(curated_items) > 0:
+            return curated_items, terms, completions
 
         bigrams = set(get_bigrams(len(terms), terms))
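
The ranker now identifies curated results by their stored state and term rather than by a magic score value. A minimal illustration of the new check, with hand-built documents standing in for what tiny_index.retrieve would return:

from mwmbl.tinysearchengine.indexer import Document, DocumentState

curation_term = "example query"
retrieved = [
    Document("Curated hit", "https://example.com", "...", 999.0, curation_term, DocumentState.CURATED.value),
    Document("Organic hit", "https://example.org", "...", 3.5),
]
# Keep only documents explicitly curated for this exact term
curated_items = [d for d in retrieved if d.state == DocumentState.CURATED and d.term == curation_term]
assert [d.title for d in curated_items] == ["Curated hit"]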