Store curated pages in the index
parent 61cdd4dd71
commit a87d3d6def
4 changed files with 58 additions and 10 deletions
Binary file not shown.
@@ -53,6 +53,7 @@ def run():
     new_item_queue = Queue()
     queued_batches = Queue()
+    # curation_queue = Queue()
 
     if args.background:
         Process(target=background.run, args=(args.data,)).start()
 
@@ -84,7 +85,7 @@ def run():
     crawler_router = crawler.get_router(batch_cache, queued_batches)
     app.include_router(crawler_router)
 
-    user_router = user.create_router()
+    user_router = user.create_router(index_path)
    app.include_router(user_router)
 
     # Initialize uvicorn server using global app instance and server config params
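The hunks above pass the index path into the user router so the curation endpoints can write to the index directly. A minimal sketch of the resulting wiring, not part of the diff; the import path for the user module and the index_path value are assumptions:

    # Hedged sketch: how the router wiring might look after this change.
    from fastapi import FastAPI

    from mwmbl.platform import user  # assumed import path for the user router module

    app = FastAPI()
    index_path = "/srv/mwmbl/index.tinysearch"    # hypothetical index location
    user_router = user.create_router(index_path)  # the router now needs the index path
    app.include_router(user_router)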
@@ -1,16 +1,19 @@
 import json
 import os
 from typing import TypeVar, Generic
-from urllib.parse import urljoin
+from urllib.parse import urljoin, parse_qs
 
 import requests
 from fastapi import APIRouter, Response
 from pydantic import BaseModel
 
+from mwmbl.tinysearchengine.indexer import TinyIndex, Document
+from mwmbl.tokenizer import tokenize
 
 
 LEMMY_URL = os.environ["LEMMY_URL"]
 RESULT_URL = "https://mwmbl.org/?q="
+MAX_CURATED_SCORE = 1_111_111.0
 
 
 class Register(BaseModel):
@@ -62,10 +65,12 @@ T = TypeVar('T', CurateAdd, CurateDelete, CurateMove, CurateValidate)
 class Curation(BaseModel, Generic[T]):
     auth: str
     curation_id: int
+    url: str
+    results: list[Result]
     curation: T
 
 
-def create_router() -> APIRouter:
+def create_router(index_path: str) -> APIRouter:
     router = APIRouter(prefix="/user", tags=["user"])
 
     community_id = get_community_id()
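The Curation model gains the results-page URL and the full curated result list, alongside the existing auth token, Lemmy post id (curation_id) and the action payload. A hypothetical request body for one of the curation endpoints, with invented values; Result is assumed to expose at least title, url and extract, as used in the storage code below:

    # Invented example payload for POST /user/curation/add.
    example_curation = {
        "auth": "<lemmy auth token>",
        "curation_id": 123,  # Lemmy post id that the curation comment is attached to
        "url": "https://mwmbl.org/?q=python+tutorial",  # results page being curated
        "results": [
            {"url": "https://docs.python.org/3/tutorial/", "title": "The Python Tutorial", "extract": "..."},
            {"url": "https://example.com/other", "title": "Another result", "extract": "..."},
        ],
        "curation": {},  # CurateAdd / CurateDelete / CurateMove / CurateValidate payload
    }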
@@ -114,22 +119,25 @@ def create_router() -> APIRouter:
 
     @router.post("/curation/move")
     def user_move_result(curate_move: Curation[CurateMove]):
-        return _create_comment("curate_move", curate_move)
+        return _curate("curate_move", curate_move)
 
     @router.post("/curation/delete")
     def user_delete_result(curate_delete: Curation[CurateDelete]):
-        return _create_comment("curate_delete", curate_delete)
+        return _curate("curate_delete", curate_delete)
 
     @router.post("/curation/add")
     def user_add_result(curate_add: Curation[CurateAdd]):
-        return _create_comment("curate_add", curate_add)
+        return _curate("curate_add", curate_add)
 
     @router.post("/curation/validate")
     def user_add_result(curate_validate: Curation[CurateValidate]):
-        return _create_comment("curate_validate", curate_validate)
+        return _curate("curate_validate", curate_validate)
 
-    def _create_comment(curation_type: str, curation: Curation):
-        content = json.dumps({curation_type: curation.curation.dict()}, indent=2)
+    def _curate(curation_type: str, curation: Curation):
+        content = json.dumps({
+            "curation_type": curation_type,
+            "curation": curation.curation.dict(),
+        }, indent=2)
         create_comment = {
             "auth": curation.auth,
             "content": json.dumps(content, indent=2),
@@ -139,6 +147,32 @@ def create_router() -> APIRouter:
             "post_id": curation.curation_id,
         }
         request = requests.post(urljoin(LEMMY_URL, "api/v3/comment"), json=create_comment)
+
+        with TinyIndex(Document, index_path, 'w') as indexer:
+            documents = [
+                Document(result.title, result.url, result.extract, MAX_CURATED_SCORE - i)
+                for i, result in enumerate(curation.results)
+            ]
+
+            query_string = parse_qs(curation.url)
+            if len(query_string) > 1:
+                raise ValueError(f"Should be one query string in the URL: {curation.url}")
+
+            queries = next(iter(query_string.values()))
+            if len(queries) > 1:
+                raise ValueError(f"Should be one query value in the URL: {curation.url}")
+
+            query = queries[0]
+            print("Query", query)
+            tokens = tokenize(query)
+            print("Tokens", tokens)
+            key = " ".join(tokens)
+            print("Key", key)
+            page_index = indexer.get_key_page_index(key)
+            print("Page index", page_index)
+            print("Storing documents", documents)
+            indexer.store_in_page(page_index, documents)
+
         return Response(content=request.content, status_code=request.status_code, media_type="text/json")
 
     return router
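The new _curate helper keeps recording each curation as a Lemmy comment and additionally writes the curated results into the index page for the query. The query is recovered from the results-page URL with parse_qs, tokenized, and the joined tokens become the index key; each document is stored with score MAX_CURATED_SCORE - i so the curator's ordering is preserved. A small standalone worked example of that key derivation, not part of the diff; the URL value is invented and tokenize is assumed to lower-case and split on whitespace:

    from urllib.parse import parse_qs

    url = "https://mwmbl.org/?q=python+tutorial"  # results-page URL sent by the front end

    query_string = parse_qs(url)                  # single entry with value ['python tutorial']
    queries = next(iter(query_string.values()))   # ['python tutorial']
    query = queries[0]                            # 'python tutorial'

    # Stand-in for mwmbl.tokenizer.tokenize (assumed behaviour):
    key = " ".join(query.lower().split())         # 'python tutorial'

    # The curated documents then go into the index page for this key, scored
    # MAX_CURATED_SCORE - i so the first curated result keeps the highest score.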
@@ -6,6 +6,7 @@ from operator import itemgetter
 from urllib.parse import urlparse
 
 from mwmbl.format import format_result_with_pattern, get_query_regex
+from mwmbl.platform.user import MAX_CURATED_SCORE
 from mwmbl.tokenizer import tokenize, get_bigrams
 from mwmbl.tinysearchengine.completer import Completer
 from mwmbl.hn_top_domains_filtered import DOMAINS
@@ -149,6 +150,7 @@ class Ranker:
 
     def get_results(self, q):
         terms = tokenize(q)
+
         is_complete = q.endswith(' ')
         if len(terms) > 0 and not is_complete:
             completions = self.completer.complete(terms[-1])
@@ -157,12 +159,23 @@
             completions = []
             retrieval_terms = set(terms)
 
+        # Check for curation
+        curation_term = " ".join(terms)
+        curation_items = self.tiny_index.retrieve(curation_term)
+
+        # TODO: find a better way to track curated pages
+        if curation_items[0].score == MAX_CURATED_SCORE:
+            return curation_items, terms, completions
+
         bigrams = set(get_bigrams(len(terms), terms))
 
         pages = []
         seen_items = set()
         for term in retrieval_terms | bigrams:
-            items = self.tiny_index.retrieve(term)
+            if term == curation_term:
+                items = curation_items
+            else:
+                items = self.tiny_index.retrieve(term)
             if items is not None:
                 for item in items:
                     # if term in item.title.lower() or term in item.extract.lower():
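On the read side, the ranker first retrieves the page for the whole query; if its top item carries MAX_CURATED_SCORE the page is treated as curated and returned unchanged, bypassing the normal term and bigram retrieval, and the same curation_term is reused inside the loop so the page is not fetched twice. A minimal sketch of that detection, not the actual Ranker code; the retrieve() return shape is assumed from the indexer module, and the emptiness guard is an illustrative addition:

    # Hedged sketch of the curation short-circuit in get_results().
    def get_curated_items(tiny_index, terms, max_curated_score):
        curation_term = " ".join(terms)
        items = tiny_index.retrieve(curation_term)
        # The diff indexes items[0] directly; guarding against an empty page here
        # is an illustrative addition, not part of the commit.
        if items and items[0].score == max_curated_score:
            return items
        return None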