diff --git a/devdata/index-v2.tinysearch b/devdata/index-v2.tinysearch index 95ec873..368dcc5 100644 Binary files a/devdata/index-v2.tinysearch and b/devdata/index-v2.tinysearch differ diff --git a/mwmbl/main.py b/mwmbl/main.py index be60cb3..4d7b499 100644 --- a/mwmbl/main.py +++ b/mwmbl/main.py @@ -53,6 +53,7 @@ def run(): new_item_queue = Queue() queued_batches = Queue() + # curation_queue = Queue() if args.background: Process(target=background.run, args=(args.data,)).start() @@ -84,7 +85,7 @@ def run(): crawler_router = crawler.get_router(batch_cache, queued_batches) app.include_router(crawler_router) - user_router = user.create_router() + user_router = user.create_router(index_path) app.include_router(user_router) # Initialize uvicorn server using global app instance and server config params diff --git a/mwmbl/platform/user.py b/mwmbl/platform/user.py index ed9fd85..175897a 100644 --- a/mwmbl/platform/user.py +++ b/mwmbl/platform/user.py @@ -1,16 +1,19 @@ import json import os from typing import TypeVar, Generic -from urllib.parse import urljoin +from urllib.parse import urljoin, parse_qs, urlparse import requests from fastapi import APIRouter, Response from pydantic import BaseModel +from mwmbl.tinysearchengine.indexer import TinyIndex, Document from mwmbl.tokenizer import tokenize + LEMMY_URL = os.environ["LEMMY_URL"] RESULT_URL = "https://mwmbl.org/?q=" +MAX_CURATED_SCORE = 1_111_111.0 class Register(BaseModel): @@ -62,10 +65,12 @@ T = TypeVar('T', CurateAdd, CurateDelete, CurateMove, CurateValidate) class Curation(BaseModel, Generic[T]): auth: str curation_id: int + url: str + results: list[Result] curation: T -def create_router() -> APIRouter: +def create_router(index_path: str) -> APIRouter: router = APIRouter(prefix="/user", tags=["user"]) community_id = get_community_id() @@ -114,22 +119,25 @@ def create_router() -> APIRouter: @router.post("/curation/move") def user_move_result(curate_move: Curation[CurateMove]): - return _create_comment("curate_move", 
curate_move) + return _curate("curate_move", curate_move) @router.post("/curation/delete") def user_delete_result(curate_delete: Curation[CurateDelete]): - return _create_comment("curate_delete", curate_delete) + return _curate("curate_delete", curate_delete) @router.post("/curation/add") def user_add_result(curate_add: Curation[CurateAdd]): - return _create_comment("curate_add", curate_add) + return _curate("curate_add", curate_add) @router.post("/curation/validate") def user_add_result(curate_validate: Curation[CurateValidate]): - return _create_comment("curate_validate", curate_validate) + return _curate("curate_validate", curate_validate) - def _create_comment(curation_type: str, curation: Curation): - content = json.dumps({curation_type: curation.curation.dict()}, indent=2) + def _curate(curation_type: str, curation: Curation): + content = json.dumps({ + "curation_type": curation_type, + "curation": curation.curation.dict(), + }, indent=2) create_comment = { "auth": curation.auth, "content": json.dumps(content, indent=2), @@ -139,6 +147,32 @@ def create_router() -> APIRouter: "post_id": curation.curation_id, } request = requests.post(urljoin(LEMMY_URL, "api/v3/comment"), json=create_comment) + + with TinyIndex(Document, index_path, 'w') as indexer: + documents = [ + Document(result.title, result.url, result.extract, MAX_CURATED_SCORE - i) + for i, result in enumerate(curation.results) + ] + + query_string = parse_qs(urlparse(curation.url).query) + if len(query_string) != 1: + raise ValueError(f"Should be one query string in the URL: {curation.url}") + + queries = next(iter(query_string.values())) + if len(queries) > 1: + raise ValueError(f"Should be one query value in the URL: {curation.url}") + + query = queries[0] + print("Query", query) + tokens = tokenize(query) + print("Tokens", tokens) + key = " ".join(tokens) + print("Key", key) + page_index = indexer.get_key_page_index(key) + print("Page index", page_index) + print("Storing documents", documents) + 
indexer.store_in_page(page_index, documents) + return Response(content=request.content, status_code=request.status_code, media_type="text/json") return router diff --git a/mwmbl/tinysearchengine/rank.py b/mwmbl/tinysearchengine/rank.py index 84e1588..fba9a19 100644 --- a/mwmbl/tinysearchengine/rank.py +++ b/mwmbl/tinysearchengine/rank.py @@ -6,6 +6,7 @@ from operator import itemgetter from urllib.parse import urlparse from mwmbl.format import format_result_with_pattern, get_query_regex +from mwmbl.platform.user import MAX_CURATED_SCORE from mwmbl.tokenizer import tokenize, get_bigrams from mwmbl.tinysearchengine.completer import Completer from mwmbl.hn_top_domains_filtered import DOMAINS @@ -149,6 +150,7 @@ class Ranker: def get_results(self, q): terms = tokenize(q) + is_complete = q.endswith(' ') if len(terms) > 0 and not is_complete: completions = self.completer.complete(terms[-1]) @@ -157,12 +159,23 @@ class Ranker: completions = [] retrieval_terms = set(terms) + # Check for curation + curation_term = " ".join(terms) + curation_items = self.tiny_index.retrieve(curation_term) + + # TODO: find a better way to track curated pages + if curation_items and curation_items[0].score == MAX_CURATED_SCORE: + return curation_items, terms, completions + bigrams = set(get_bigrams(len(terms), terms)) pages = [] seen_items = set() for term in retrieval_terms | bigrams: - items = self.tiny_index.retrieve(term) + if term == curation_term: + items = curation_items + else: + items = self.tiny_index.retrieve(term) if items is not None: for item in items: # if term in item.title.lower() or term in item.extract.lower():