Track curated items in the index
parent a87d3d6def
commit 00b5438492

5 changed files with 31 additions and 51 deletions
Binary file not shown.
```diff
@@ -77,18 +77,3 @@ def tokenize_document(url, title_cleaned, extract, score, nlp):
     # print("High scoring", len(high_scoring_tokens), token_scores, doc)
     document = TokenizedDocument(tokens=list(tokens), url=url, title=title_cleaned, extract=extract, score=score)
     return document
-
-
-def index_titles_urls_and_extracts(indexer: TinyIndex, nlp, titles_urls_and_extracts, link_counts, terms_path):
-    terms = Counter()
-    pages = get_pages(nlp, titles_urls_and_extracts, link_counts)
-    for page in pages:
-        for token in page.tokens:
-            indexer.index(token, Document(url=page.url, title=page.title, extract=page.extract, score=page.score))
-        terms.update([t.lower() for t in page.tokens])
-
-    term_df = pd.DataFrame({
-        'term': terms.keys(),
-        'count': terms.values(),
-    })
-    term_df.to_csv(terms_path)
```
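The deleted `index_titles_urls_and_extracts` was the batch-indexing path built on `TinyIndex.index`, which this commit removes as well (see the indexer diff further down). Writes now go through `get_key_page_index` and `store_in_page`, a whole page at a time. Below is a minimal sketch of batch indexing under the surviving API; it is an assumption-laden illustration, not code from the commit, and `documents_by_term` is a hypothetical mapping from query term to documents:

```python
from collections import defaultdict

from mwmbl.tinysearchengine.indexer import TinyIndex, Document


def store_documents(index_path: str, documents_by_term: dict[str, list[Document]]):
    """Hypothetical page-wise batch indexing using only the methods this commit keeps."""
    with TinyIndex(Document, index_path, 'w') as indexer:
        # Group documents by the page their term hashes to: store_in_page
        # overwrites a page wholesale, so each page must be written exactly once.
        pages = defaultdict(list)
        for term, documents in documents_by_term.items():
            pages[indexer.get_key_page_index(term)].extend(documents)
        for page_index, page_documents in pages.items():
            indexer.store_in_page(page_index, page_documents)
```

Because `store_in_page` replaces existing page contents, a sketch like this only makes sense when building an index from scratch.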
```diff
@@ -7,7 +7,7 @@ import requests
 from fastapi import APIRouter, Response
 from pydantic import BaseModel
 
-from mwmbl.tinysearchengine.indexer import TinyIndex, Document
+from mwmbl.tinysearchengine.indexer import TinyIndex, Document, DocumentState
 from mwmbl.tokenizer import tokenize
 
 
@@ -149,11 +149,6 @@ def create_router(index_path: str) -> APIRouter:
         request = requests.post(urljoin(LEMMY_URL, "api/v3/comment"), json=create_comment)
 
         with TinyIndex(Document, index_path, 'w') as indexer:
-            documents = [
-                Document(result.title, result.url, result.extract, MAX_CURATED_SCORE - i)
-                for i, result in enumerate(curation.results)
-            ]
-
             query_string = parse_qs(curation.url)
             if len(query_string) > 1:
                 raise ValueError(f"Should be one query string in the URL: {curation.url}")
@@ -166,9 +161,14 @@ def create_router(index_path: str) -> APIRouter:
             print("Query", query)
             tokens = tokenize(query)
             print("Tokens", tokens)
-            key = " ".join(tokens)
-            print("Key", key)
-            page_index = indexer.get_key_page_index(key)
+            term = " ".join(tokens)
+            print("Key", term)
+
+            documents = [
+                Document(result.title, result.url, result.extract, MAX_CURATED_SCORE - i, term, DocumentState.CURATED.value)
+                for i, result in enumerate(curation.results)
+            ]
+            page_index = indexer.get_key_page_index(term)
             print("Page index", page_index)
             print("Storing documents", documents)
             indexer.store_in_page(page_index, documents)
```
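The second and third hunks move the `documents` construction to after the query has been tokenized, and stamp each curated result with the query term and an explicit state instead of leaving it recognisable only by its score. A rough illustration of what one curated query now stores; the titles and URLs are made up, while `MAX_CURATED_SCORE` and the constructor signature are the real ones from the diff:

```python
from mwmbl.platform.user import MAX_CURATED_SCORE
from mwmbl.tinysearchengine.indexer import Document, DocumentState

term = "tiny search engine"  # the normalised (tokenized and re-joined) query
documents = [
    # Scores count down from MAX_CURATED_SCORE so the curated ordering survives,
    # and term/state let the ranker identify these entries later.
    Document("Mwmbl", "https://mwmbl.org", "An open source search engine",
             MAX_CURATED_SCORE, term, DocumentState.CURATED.value),
    Document("TinyIndex", "https://example.com/tiny-index", "A compact page-based index",
             MAX_CURATED_SCORE - 1, term, DocumentState.CURATED.value),
]
```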
```diff
@@ -1,10 +1,11 @@
 import json
 import os
-from dataclasses import dataclass, asdict
-from io import UnsupportedOperation, BytesIO
+from dataclasses import dataclass, asdict, field
+from enum import IntEnum
+from io import UnsupportedOperation
 from logging import getLogger
 from mmap import mmap, PROT_READ, PROT_WRITE
-from typing import TypeVar, Generic, Callable, List
+from typing import TypeVar, Generic, Callable, List, Optional
 
 import mmh3
 from zstandard import ZstdDecompressor, ZstdCompressor, ZstdError
```
```diff
@@ -20,7 +21,17 @@ logger = getLogger(__name__)
 
 
 def astuple(dc):
-    return tuple(dc.__dict__.values())
+    """
+    Convert a type to a tuple - values at the end that are None can be truncated.
+    """
+    value = tuple(dc.__dict__.values())
+    while value[-1] is None:
+        value = value[:-1]
+    return value
+
+
+class DocumentState(IntEnum):
+    CURATED = 1
 
 
 @dataclass
```
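The rewritten `astuple` trims trailing `None`s before a page is serialised, so a document with no curation data still becomes the same four-element tuple it was before this commit, keeping previously written pages and new ones layout-compatible. A standalone check of that behaviour; the helper body is copied from the hunk above, and `Item` is a stand-in for `Document`:

```python
from dataclasses import dataclass
from typing import Optional


def astuple(dc):
    # Copied from the diff: serialise a dataclass, dropping trailing Nones.
    value = tuple(dc.__dict__.values())
    while value[-1] is None:
        value = value[:-1]
    return value


@dataclass
class Item:
    title: str
    score: float
    term: Optional[str] = None
    state: Optional[int] = None


print(astuple(Item("a", 1.0)))          # ('a', 1.0): trailing Nones dropped
print(astuple(Item("a", 1.0, "q", 1)))  # ('a', 1.0, 'q', 1): curated fields kept
```

Note that a dataclass whose values are all `None` would make the `while` loop raise an `IndexError`; `Document` always has non-`None` leading fields, so that case does not arise here.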
```diff
@@ -29,11 +40,13 @@ class Document:
     url: str
     extract: str
     score: float
+    term: Optional[str] = None
+    state: Optional[int] = None
 
 
 @dataclass
 class TokenizedDocument(Document):
-    tokens: List[str]
+    tokens: List[str] = field(default_factory=list)
 
 
 T = TypeVar('T')
```
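The change to `tokens` is forced rather than stylistic: once `Document` gains fields with defaults, every field a subclass adds must also have a default, because inherited fields come first in the generated `__init__` and non-default parameters cannot follow default ones. A small reproduction of the error that `field(default_factory=list)` avoids (class names here are illustrative):

```python
from dataclasses import dataclass, field
from typing import List, Optional


@dataclass
class Base:
    score: float
    term: Optional[str] = None  # a defaulted field, like Document.term


try:
    @dataclass
    class Broken(Base):
        tokens: List[str]  # non-default field after an inherited defaulted one
except TypeError as error:
    print(error)  # non-default argument 'tokens' follows default argument


@dataclass
class Fixed(Base):
    tokens: List[str] = field(default_factory=list)  # what the commit does
```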
```diff
@@ -175,23 +188,6 @@ class TinyIndex(Generic[T]):
         # logger.debug(f"Decompressed data: {decompressed_data}")
         return json.loads(decompressed_data.decode('utf8'))
 
-    def index(self, key: str, value: T):
-        assert type(value) == self.item_factory, f"Can only index the specified type" \
-                                                 f" ({self.item_factory.__name__})"
-        page_index = self.get_key_page_index(key)
-        try:
-            self.add_to_page(page_index, [value])
-        except PageError:
-            pass
-
-    def add_to_page(self, page_index: int, values: list[T]):
-        current_page = self._get_page_tuples(page_index)
-        if current_page is None:
-            current_page = []
-        value_tuples = [astuple(value) for value in values]
-        current_page += value_tuples
-        self._write_page(current_page, page_index)
-
     def store_in_page(self, page_index: int, values: list[T]):
         value_tuples = [astuple(value) for value in values]
         self._write_page(value_tuples, page_index)
```
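With `index` and `add_to_page` gone, `store_in_page` is the only write path left, and it replaces a page's contents outright instead of merging with the tuples already there; callers such as the curation endpoint must therefore pass the complete document list for the page. A minimal usage sketch with a hypothetical index path and an arbitrary score:

```python
from mwmbl.tinysearchengine.indexer import TinyIndex, Document, DocumentState

index_path = "index.tinysearch"  # hypothetical; the real path is passed to create_router
term = "tiny search engine"
documents = [
    Document("Mwmbl", "https://mwmbl.org", "An open source search engine",
             1000.0, term, DocumentState.CURATED.value),  # arbitrary example score
]

with TinyIndex(Document, index_path, 'w') as indexer:
    page_index = indexer.get_key_page_index(term)
    # Unlike the removed add_to_page, this never reads the existing page:
    # whatever the page held before is replaced by `documents`.
    indexer.store_in_page(page_index, documents)
```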
```diff
@@ -10,7 +10,7 @@ from mwmbl.platform.user import MAX_CURATED_SCORE
 from mwmbl.tokenizer import tokenize, get_bigrams
 from mwmbl.tinysearchengine.completer import Completer
 from mwmbl.hn_top_domains_filtered import DOMAINS
-from mwmbl.tinysearchengine.indexer import TinyIndex, Document
+from mwmbl.tinysearchengine.indexer import TinyIndex, Document, DocumentState
 
 logger = getLogger(__name__)
 
 
@@ -162,10 +162,9 @@ class Ranker:
         # Check for curation
         curation_term = " ".join(terms)
         curation_items = self.tiny_index.retrieve(curation_term)
-
-        # TODO: find a better way to track curated pages
-        if curation_items[0].score == MAX_CURATED_SCORE:
-            return curation_items, terms, completions
+        curated_items = [d for d in curation_items if d.state == DocumentState.CURATED and d.term == curation_term]
+        if len(curated_items) > 0:
+            return curated_items, terms, completions
 
         bigrams = set(get_bigrams(len(terms), terms))
```
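The old heuristic only recognised a curation when the first retrieved result still carried exactly `MAX_CURATED_SCORE`, and `curation_items[0]` would raise an `IndexError` on an empty result. The new check is explicit, and filtering on `d.term` also keeps out entries curated for a different query that happens to hash to the same page. Matching works even though the index stores the raw integer (`DocumentState.CURATED.value`), because `IntEnum` members compare equal to their underlying int:

```python
from enum import IntEnum


class DocumentState(IntEnum):
    CURATED = 1


stored_state = DocumentState.CURATED.value    # the plain int 1 that gets persisted
print(stored_state == DocumentState.CURATED)  # True: IntEnum compares equal to its value
```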