Track curated items in the index

Daoud Clarke, 2 years ago
Commit 00b5438492

5 changed files with 31 additions and 51 deletions
  1. devdata/index-v2.tinysearch (binary)
  2. mwmbl/indexer/index.py (+0, -15)
  3. mwmbl/platform/user.py (+9, -9)
  4. mwmbl/tinysearchengine/indexer.py (+18, -22)
  5. mwmbl/tinysearchengine/rank.py (+4, -5)

BIN
devdata/index-v2.tinysearch


+ 0 - 15
mwmbl/indexer/index.py

@@ -77,18 +77,3 @@ def tokenize_document(url, title_cleaned, extract, score, nlp):
     # print("High scoring", len(high_scoring_tokens), token_scores, doc)
     document = TokenizedDocument(tokens=list(tokens), url=url, title=title_cleaned, extract=extract, score=score)
     return document
-
-
-def index_titles_urls_and_extracts(indexer: TinyIndex, nlp, titles_urls_and_extracts, link_counts, terms_path):
-    terms = Counter()
-    pages = get_pages(nlp, titles_urls_and_extracts, link_counts)
-    for page in pages:
-        for token in page.tokens:
-            indexer.index(token, Document(url=page.url, title=page.title, extract=page.extract, score=page.score))
-        terms.update([t.lower() for t in page.tokens])
-
-    term_df = pd.DataFrame({
-        'term': terms.keys(),
-        'count': terms.values(),
-    })
-    term_df.to_csv(terms_path)

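The deleted index_titles_urls_and_extracts was the last caller of TinyIndex.index and add_to_page, which this commit also removes (see the indexer.py diff below). A minimal sketch of how a caller could persist tokenize_document output through the surviving API instead; the helper name index_tokenized_document is hypothetical, not part of this commit, and handling of full pages is omitted:

from mwmbl.tinysearchengine.indexer import TinyIndex, Document, TokenizedDocument


def index_tokenized_document(indexer: TinyIndex, page: TokenizedDocument):
    # Hypothetical replacement for the removed per-token indexer.index() calls:
    # store_in_page overwrites a whole page, so read back what the index
    # already returns for each token and append before writing.
    doc = Document(title=page.title, url=page.url, extract=page.extract, score=page.score)
    for token in page.tokens:
        page_index = indexer.get_key_page_index(token)
        existing = indexer.retrieve(token)
        indexer.store_in_page(page_index, existing + [doc])
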
+ 9 - 9
mwmbl/platform/user.py

@@ -7,7 +7,7 @@ import requests
 from fastapi import APIRouter, Response
 from pydantic import BaseModel
 
-from mwmbl.tinysearchengine.indexer import TinyIndex, Document
+from mwmbl.tinysearchengine.indexer import TinyIndex, Document, DocumentState
 from mwmbl.tokenizer import tokenize
 
 
@@ -149,11 +149,6 @@ def create_router(index_path: str) -> APIRouter:
         request = requests.post(urljoin(LEMMY_URL, "api/v3/comment"), json=create_comment)
 
         with TinyIndex(Document, index_path, 'w') as indexer:
-            documents = [
-                Document(result.title, result.url, result.extract, MAX_CURATED_SCORE - i)
-                for i, result in enumerate(curation.results)
-            ]
-
             query_string = parse_qs(curation.url)
             if len(query_string) > 1:
                 raise ValueError(f"Should be one query string in the URL: {curation.url}")
@@ -166,9 +161,14 @@ def create_router(index_path: str) -> APIRouter:
             print("Query", query)
             tokens = tokenize(query)
             print("Tokens", tokens)
-            key = " ".join(tokens)
-            print("Key", key)
-            page_index = indexer.get_key_page_index(key)
+            term = " ".join(tokens)
+            print("Key", term)
+
+            documents = [
+                Document(result.title, result.url, result.extract, MAX_CURATED_SCORE - i, term, DocumentState.CURATED.value)
+                for i, result in enumerate(curation.results)
+            ]
+            page_index = indexer.get_key_page_index(term)
             print("Page index", page_index)
             print("Storing documents", documents)
             indexer.store_in_page(page_index, documents)
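
Taken together, the change above means a curated result set is now written as a single page keyed on the curation query, with each Document tagged with that term and the CURATED state. A minimal standalone sketch of that flow, assuming MAX_CURATED_SCORE keeps its existing definition in this module; store_curation and its arguments are illustrative names, not part of the commit:

from mwmbl.platform.user import MAX_CURATED_SCORE
from mwmbl.tinysearchengine.indexer import TinyIndex, Document, DocumentState
from mwmbl.tokenizer import tokenize


def store_curation(index_path: str, query: str, results: list) -> None:
    # Illustrative sketch: results are objects with title/url/extract fields,
    # like curation.results above.
    term = " ".join(tokenize(query))
    documents = [
        Document(r.title, r.url, r.extract, MAX_CURATED_SCORE - i,
                 term, DocumentState.CURATED.value)
        for i, r in enumerate(results)
    ]
    with TinyIndex(Document, index_path, 'w') as indexer:
        page_index = indexer.get_key_page_index(term)
        # store_in_page replaces the page contents rather than appending,
        # so the curated ordering fully determines what is stored.
        indexer.store_in_page(page_index, documents)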

+ 18 - 22
mwmbl/tinysearchengine/indexer.py

@@ -1,10 +1,11 @@
 import json
 import os
-from dataclasses import dataclass, asdict
-from io import UnsupportedOperation, BytesIO
+from dataclasses import dataclass, asdict, field
+from enum import IntEnum
+from io import UnsupportedOperation
 from logging import getLogger
 from mmap import mmap, PROT_READ, PROT_WRITE
-from typing import TypeVar, Generic, Callable, List
+from typing import TypeVar, Generic, Callable, List, Optional
 
 import mmh3
 from zstandard import ZstdDecompressor, ZstdCompressor, ZstdError
@@ -20,7 +21,17 @@ logger = getLogger(__name__)
 
 
 def astuple(dc):
-    return tuple(dc.__dict__.values())
+    """
+    Convert a type to a tuple - values at the end that are None can be truncated.
+    """
+    value = tuple(dc.__dict__.values())
+    while value[-1] is None:
+        value = value[:-1]
+    return value
+
+
+class DocumentState(IntEnum):
+    CURATED = 1
 
 
 @dataclass
@@ -29,11 +40,13 @@ class Document:
     url: str
     extract: str
     score: float
+    term: Optional[str] = None
+    state: Optional[int] = None
 
 
 @dataclass
 class TokenizedDocument(Document):
-    tokens: List[str]
+    tokens: List[str] = field(default_factory=list)
 
 
 T = TypeVar('T')
@@ -175,23 +188,6 @@ class TinyIndex(Generic[T]):
         # logger.debug(f"Decompressed data: {decompressed_data}")
         return json.loads(decompressed_data.decode('utf8'))
 
-    def index(self, key: str, value: T):
-        assert type(value) == self.item_factory, f"Can only index the specified type" \
-                                              f" ({self.item_factory.__name__})"
-        page_index = self.get_key_page_index(key)
-        try:
-            self.add_to_page(page_index, [value])
-        except PageError:
-            pass
-
-    def add_to_page(self, page_index: int, values: list[T]):
-        current_page = self._get_page_tuples(page_index)
-        if current_page is None:
-            current_page = []
-        value_tuples = [astuple(value) for value in values]
-        current_page += value_tuples
-        self._write_page(current_page, page_index)
-
     def store_in_page(self, page_index: int, values: list[T]):
         value_tuples = [astuple(value) for value in values]
         self._write_page(value_tuples, page_index)
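
The effect of the new optional fields and the truncating astuple is that old four-field documents keep their stored shape while curated documents carry two extra values. A small round-trip sketch of that behaviour; the concrete values are illustrative only:

from mwmbl.tinysearchengine.indexer import Document, DocumentState, astuple

# A document without term/state serialises to the old four-element tuple,
# because trailing Nones are truncated.
plain = Document("Example", "https://example.com", "An extract", 5.0)
assert astuple(plain) == ("Example", "https://example.com", "An extract", 5.0)

# A curated document keeps its query term and state, so the tuple has six
# elements; DocumentState.CURATED is an IntEnum member with value 1.
curated = Document("Example", "https://example.com", "An extract", 5.0,
                   "example query", DocumentState.CURATED.value)
assert astuple(curated) == ("Example", "https://example.com", "An extract",
                            5.0, "example query", 1)

# Both shapes rebuild cleanly because the new fields default to None.
assert Document(*astuple(plain)).state is None
assert Document(*astuple(curated)).state == DocumentState.CURATED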

+ 4 - 5
mwmbl/tinysearchengine/rank.py

@@ -10,7 +10,7 @@ from mwmbl.platform.user import MAX_CURATED_SCORE
 from mwmbl.tokenizer import tokenize, get_bigrams
 from mwmbl.tinysearchengine.completer import Completer
 from mwmbl.hn_top_domains_filtered import DOMAINS
-from mwmbl.tinysearchengine.indexer import TinyIndex, Document
+from mwmbl.tinysearchengine.indexer import TinyIndex, Document, DocumentState
 
 logger = getLogger(__name__)
 
@@ -162,10 +162,9 @@ class Ranker:
         # Check for curation
         curation_term = " ".join(terms)
         curation_items = self.tiny_index.retrieve(curation_term)
-
-        # TODO: find a better way to track curated pages
-        if curation_items[0].score == MAX_CURATED_SCORE:
-            return curation_items, terms, completions
+        curated_items = [d for d in curation_items if d.state == DocumentState.CURATED and d.term == curation_term]
+        if len(curated_items) > 0:
+            return curated_items, terms, completions
 
         bigrams = set(get_bigrams(len(terms), terms))
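
The ranker now detects curation through the explicit state and term fields rather than by comparing scores against MAX_CURATED_SCORE. A minimal sketch of that check in isolation; get_curated and its parameters are illustrative names, and tiny_index stands in for the Ranker's own index:

from mwmbl.tinysearchengine.indexer import DocumentState, TinyIndex


def get_curated(tiny_index: TinyIndex, terms: list) -> list:
    curation_term = " ".join(terms)
    items = tiny_index.retrieve(curation_term)
    # Only documents explicitly marked as curated for this exact term count;
    # a high score alone no longer marks a result as curated.
    return [d for d in items
            if d.state == DocumentState.CURATED and d.term == curation_term]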