diff --git a/devdata/index-v2.tinysearch b/devdata/index-v2.tinysearch
index 368dcc5..f922368 100644
Binary files a/devdata/index-v2.tinysearch and b/devdata/index-v2.tinysearch differ
diff --git a/mwmbl/indexer/index.py b/mwmbl/indexer/index.py
index 1dfa3af..4edcb8a 100644
--- a/mwmbl/indexer/index.py
+++ b/mwmbl/indexer/index.py
@@ -77,18 +77,3 @@ def tokenize_document(url, title_cleaned, extract, score, nlp):
     # print("High scoring", len(high_scoring_tokens), token_scores, doc)
     document = TokenizedDocument(tokens=list(tokens), url=url, title=title_cleaned, extract=extract, score=score)
     return document
-
-
-def index_titles_urls_and_extracts(indexer: TinyIndex, nlp, titles_urls_and_extracts, link_counts, terms_path):
-    terms = Counter()
-    pages = get_pages(nlp, titles_urls_and_extracts, link_counts)
-    for page in pages:
-        for token in page.tokens:
-            indexer.index(token, Document(url=page.url, title=page.title, extract=page.extract, score=page.score))
-        terms.update([t.lower() for t in page.tokens])
-
-    term_df = pd.DataFrame({
-        'term': terms.keys(),
-        'count': terms.values(),
-    })
-    term_df.to_csv(terms_path)
diff --git a/mwmbl/platform/user.py b/mwmbl/platform/user.py
index 175897a..4cff6e1 100644
--- a/mwmbl/platform/user.py
+++ b/mwmbl/platform/user.py
@@ -7,7 +7,7 @@ import requests
 from fastapi import APIRouter, Response
 from pydantic import BaseModel
 
-from mwmbl.tinysearchengine.indexer import TinyIndex, Document
+from mwmbl.tinysearchengine.indexer import TinyIndex, Document, DocumentState
 from mwmbl.tokenizer import tokenize
 
 
@@ -149,11 +149,6 @@ def create_router(index_path: str) -> APIRouter:
         request = requests.post(urljoin(LEMMY_URL, "api/v3/comment"), json=create_comment)
 
         with TinyIndex(Document, index_path, 'w') as indexer:
-            documents = [
-                Document(result.title, result.url, result.extract, MAX_CURATED_SCORE - i)
-                for i, result in enumerate(curation.results)
-            ]
-
             query_string = parse_qs(curation.url)
             if len(query_string) > 1:
                 raise ValueError(f"Should be one query string in the URL: {curation.url}")
@@ -166,9 +161,14 @@ def create_router(index_path: str) -> APIRouter:
             print("Query", query)
             tokens = tokenize(query)
             print("Tokens", tokens)
-            key = " ".join(tokens)
-            print("Key", key)
-            page_index = indexer.get_key_page_index(key)
+            term = " ".join(tokens)
+            print("Key", term)
+
+            documents = [
+                Document(result.title, result.url, result.extract, MAX_CURATED_SCORE - i, term, DocumentState.CURATED.value)
+                for i, result in enumerate(curation.results)
+            ]
+            page_index = indexer.get_key_page_index(term)
             print("Page index", page_index)
             print("Storing documents", documents)
             indexer.store_in_page(page_index, documents)
diff --git a/mwmbl/tinysearchengine/indexer.py b/mwmbl/tinysearchengine/indexer.py
index 3e416b6..4a61972 100644
--- a/mwmbl/tinysearchengine/indexer.py
+++ b/mwmbl/tinysearchengine/indexer.py
@@ -1,10 +1,11 @@
 import json
 import os
-from dataclasses import dataclass, asdict
-from io import UnsupportedOperation, BytesIO
+from dataclasses import dataclass, asdict, field
+from enum import IntEnum
+from io import UnsupportedOperation
 from logging import getLogger
 from mmap import mmap, PROT_READ, PROT_WRITE
-from typing import TypeVar, Generic, Callable, List
+from typing import TypeVar, Generic, Callable, List, Optional
 
 import mmh3
 from zstandard import ZstdDecompressor, ZstdCompressor, ZstdError
@@ -20,7 +21,17 @@ logger = getLogger(__name__)
 
 
 def astuple(dc):
-    return tuple(dc.__dict__.values())
+    """
+    Convert a type to a tuple - values at the end that are None can be truncated.
+    """
+    value = tuple(dc.__dict__.values())
+    while value[-1] is None:
+        value = value[:-1]
+    return value
+
+
+class DocumentState(IntEnum):
+    CURATED = 1
 
 
 @dataclass
@@ -29,11 +40,13 @@ class Document:
     url: str
     extract: str
     score: float
+    term: Optional[str] = None
+    state: Optional[int] = None
 
 
 @dataclass
 class TokenizedDocument(Document):
-    tokens: List[str]
+    tokens: List[str] = field(default_factory=list)
 
 
 T = TypeVar('T')
@@ -175,23 +188,6 @@ class TinyIndex(Generic[T]):
         # logger.debug(f"Decompressed data: {decompressed_data}")
         return json.loads(decompressed_data.decode('utf8'))
 
-    def index(self, key: str, value: T):
-        assert type(value) == self.item_factory, f"Can only index the specified type" \
-                                                 f" ({self.item_factory.__name__})"
-        page_index = self.get_key_page_index(key)
-        try:
-            self.add_to_page(page_index, [value])
-        except PageError:
-            pass
-
-    def add_to_page(self, page_index: int, values: list[T]):
-        current_page = self._get_page_tuples(page_index)
-        if current_page is None:
-            current_page = []
-        value_tuples = [astuple(value) for value in values]
-        current_page += value_tuples
-        self._write_page(current_page, page_index)
-
     def store_in_page(self, page_index: int, values: list[T]):
         value_tuples = [astuple(value) for value in values]
         self._write_page(value_tuples, page_index)
diff --git a/mwmbl/tinysearchengine/rank.py b/mwmbl/tinysearchengine/rank.py
index fba9a19..18d7cf8 100644
--- a/mwmbl/tinysearchengine/rank.py
+++ b/mwmbl/tinysearchengine/rank.py
@@ -10,7 +10,7 @@ from mwmbl.platform.user import MAX_CURATED_SCORE
 from mwmbl.tokenizer import tokenize, get_bigrams
 from mwmbl.tinysearchengine.completer import Completer
 from mwmbl.hn_top_domains_filtered import DOMAINS
-from mwmbl.tinysearchengine.indexer import TinyIndex, Document
+from mwmbl.tinysearchengine.indexer import TinyIndex, Document, DocumentState
 
 logger = getLogger(__name__)
 
@@ -162,10 +162,9 @@ class Ranker:
         # Check for curation
         curation_term = " ".join(terms)
         curation_items = self.tiny_index.retrieve(curation_term)
-
-        # TODO: find a better way to track curated pages
-        if curation_items[0].score == MAX_CURATED_SCORE:
-            return curation_items, terms, completions
+        curated_items = [d for d in curation_items if d.state == DocumentState.CURATED and d.term == curation_term]
+        if len(curated_items) > 0:
+            return curated_items, terms, completions
 
         bigrams = set(get_bigrams(len(terms), terms))
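
A minimal sketch (not part of the patch; sample values assumed, with this branch of mwmbl importable) of what the astuple truncation buys: trailing None fields are dropped on write, so documents without curation data serialise to the same four-element tuples as before, and old four-element tuples still deserialise because the new term and state fields default to None. Note the helper assumes at least one non-None field; Document's four required fields guarantee that here.

from mwmbl.tinysearchengine.indexer import Document, astuple

plain = Document("Example title", "https://example.com", "An extract", 1.0)
curated = Document("Example title", "https://example.com", "An extract", 2.0, "example query", 1)

# Trailing None fields are truncated, so uncurated documents keep the old
# four-element on-disk format...
assert astuple(plain) == ("Example title", "https://example.com", "An extract", 1.0)
assert astuple(curated) == ("Example title", "https://example.com", "An extract", 2.0, "example query", 1)

# ...and tuples written before this change still round-trip, because the new
# fields default to None when absent.
assert Document(*astuple(plain)) == plain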
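
A similar sketch (sample data assumed) of the new curation check in Ranker: because DocumentState is an IntEnum, a state value that comes back from the index as the plain int 1 compares equal to DocumentState.CURATED with no conversion step, and matching on d.term scopes curated results to the exact query they were curated for.

from mwmbl.tinysearchengine.indexer import Document, DocumentState

curation_term = "example query"
retrieved = [
    Document("Curated result", "https://example.com/a", "extract", 2.0, "example query", 1),
    Document("Organic result", "https://example.com/b", "extract", 1.0),
]

# IntEnum members compare equal to plain ints, so values that have been
# JSON round-tripped through the index still match the enum.
curated = [d for d in retrieved if d.state == DocumentState.CURATED and d.term == curation_term]
assert [d.title for d in curated] == ["Curated result"]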