From a87d3d6defb4c47dcbfefd7538399a040000291b Mon Sep 17 00:00:00 2001 From: Daoud Clarke Date: Sun, 9 Apr 2023 05:31:23 +0100 Subject: [PATCH] Store curated pages in the index --- devdata/index-v2.tinysearch | Bin 10489856 -> 10489856 bytes mwmbl/main.py | 3 +- mwmbl/platform/user.py | 50 +++++++++++++++++++++++++++------ mwmbl/tinysearchengine/rank.py | 15 +++++++++- 4 files changed, 58 insertions(+), 10 deletions(-) diff --git a/devdata/index-v2.tinysearch b/devdata/index-v2.tinysearch index 95ec873d7657711dc18ba44e50bac3b7b101b14a..368dcc50569cfc5de08f35575200abc941bc9263 100644 GIT binary patch delta 2223 zcmWl|2{;q{9{}*V7P&?_OOCmZ*WGf;rjF07Xy&JSc+-sDc`(g9d1V7HESG`~kY42l`+DhF}E7 zupbC;08GFX%)lHhz!I#$8V-UD*n%C{g9A8%6F7ql90FHxgTvqs9^eUH;0;H>2aduq z@C85chX4qKAP9!zZ~{W$B!t2#I1OPC4iTM^d^mEEaLay-tRF^HW4^dErZV@Lq*Tgc zDZXPa*4ga4DtGZ)s;Rs2aitfI$}Jz4qz9!#LFz5$J?0x`lz`8tMSY424P4p#=WU0V zVRpN8C0?Ez^QTYyzX)}$?DvMht`$V4SOw{MmNMGrn#X*e5W+i#TDMc#CU?!&j9H|Hv-`iO{z#IZS6c{k zbD-%#+3o+^Yt}hs0kw#sY~(L50qWy^&h|gy8oBzrODBH=k)Q8yL56sh=CC zv`X&kEuEZ?`&rQJ+Oy#1ptk(z0c$`zE3RLTT=cAlNfF^s-zUn``i6^xO*=K}gXv3C zWWIZ_T|#L0?CBm|>@;ANyVD;7UEM%JFoAj$&TFmH`XX_;1--u8X`gBz^ zd`>!A>M*4p@sX)?G~aRfw!^ZKdm_`9&fyWb%`Z`YFHdO1o)mm>UnVA89dB-PE8WJE zXu+FEsw;_}I5$+=a>qpZa0hqcV;ol%S)r11$X_$5jnx}qy3J{#7CG;3#CR&i*|{_1 zW~LtNbm^swWTbI(QLK-UIKBqbPrhu-i=BPgVE=as!8MhO!SRdPdR-HofXjJ3sd+XT zME5r)cv?+{Ht1xj@51pYt77e}*(q%lM*)1au4YA(WL&;LNFQPILD!p!Pkkh^uSd^J zZ`mwiZ-d2waP3&DS3SOQ5j9IpS8^ z<5dX!pqy8&%zBzJ_mE?+!Ld9?viFo_J>{)^V|1bX5$n@h$?H;vB#{}u)5)NIvAnbE z^PapG8@GU2^NlM9vb85ughf8*DQBwjWyU|#G;oZaHH)>p`7Q!?W>B&dL&mEVwYx4y zM!S3CFKEbF;r}YXFsZ`VU976)T%qj8R-gD%yF|+{D34jS<%qE6<-gb3=XXmm4a2mW z=6drf?Q(a8s{|9Z1y*;G~6oWer|kN77m&7NS* z_r`Mqw|#~CY6}{PN&ZL8@Za`ke17~}#BnY9PM87K;oU-CHa%QavM5dUx_wVR_PTY_ zNd-BKdXBYEnAi6B`5nvlq9^bhWp8K3^u%qFzpGBOy93tyj*nNbxRXbQ^{tC)WU>lw zxT|NBXyDl&i3?^@(k?3bmbtftB2^X`ek}p1!4K$b3Zz$b24~Uoj8R04W;?a2P;plB zH-DDxlQOzrPG$@}R%V~-P)TCvG%Uuu*v+L5$CdO}5l~qOTgisV9U; zxYlQb{Ui69>*T#axv*CPry%=E{f^J_!vXEL_s`YsmACz%TamJ*?&WxMuu9y{&U%}z zv*3|W-91*LSeCVwJsE7W5wEBdCRp{{)<^L7yrZ@uP9l|=L-vaiQD3gYHe7YYr9N_* znC2h0RBV#IJ~dWCxw0eoqFlPv0Ap>vQ!;Qw={2_8&88zUgPRe1_yMuEJ^|<7e@yTy zN2dL>U&HAP#{u}+w=#$pak6DI| zy=h&jwZrM%UF68G##=Z)e?l#xS|ICE;Uf1_3m4%zc8!&5#EbQj`v39oB&wL?>o=05 z(!@O*iVmI$W)(!<*`(0T8HBZVd*9^?3Qp_CJvN57HmKhESfWmBkn@D_^C`8EE9P8g zQ;{orlc$G-y;tteU3qIazF)^_;k>9yfJ?&f$439uP+o+M9+dXPY*IM*36_abo6N~@ z@A<=4mF#@^#vzsQ%lvJZ7|RtFy%L6r{?yAeRuyCA_6%*k(|S@xT@okGcnLOH-jzK+ zWAdRpnHe1&Zrdpv=d@IlnKIP-J<>XFF^(^rPazVG zLZeY48iU57aVQBrgT|w0(FF7ydLB(glh9-|1-*c#q8HIWQ8Jo_QqXiX0|hh_%|f%$ z95ffrL-WxBv=F_7UPg=1D`+u#6}^UDM@!Hf=uNZ~EkkdiVH9hypPn4kUmikOI;`2Al-4;1rMp@&E-CfFe)=%0LCE0yH=c)POp` z01bczXMiTq0%yTFpbgFg9iR*JfIct)h5!eQzy)9oOn@mc1LnX2SOPq-0@lC=5P&VP z0~dikZ~&KpBX9!Fzy-JhH*gua0}tQ{ynr|G0lwf0@N4#GMvx>0x7-+TCVx z5p8iN?~m^GcDxAMQ?M=MaXrvFw5N;hxK^JP{-eCjToI#k?SEQ8EMqQX9OGo)_l1i( zx#CunHLh)Z*FFjxRvjKS6eUjyNPUlyidx3*eqJ)@m`EL9Si&iUC|lVZDBR=o&=a

+_wS*L1&$$80QeO z_rjD`TE?rPD-9a8-|wEAu{-KPsi&2(rBC>|i@w=WA+#$ks#YchKaz_L)~}kI{_||} zyTENgn3=Zkcflu|iGx6qrdPv64AWQPFkd}BJpMW1biUNQ|WDh zl~|scW&un2xVt1bcNH3={fOHekCXEXAGHkwAxviY}17=sryqNc->e2 z!+6buffmu7^04;l_K8Z$rb=j1=fjj3AJWmu{&?%NmGEn1rC3y?Kkn>a2F)u;1(+mQ z8nFwJ^g5`BaMQ=f(~j5-VO82=V;GA*56vnxdeS6j{DhtQ1)iMaP~hLZ%E0pTwIxsEaCMKKT$lvO7fD+z)NAc;1rI&kBk+SX|a31;93pS z95pG22O~zFi_*p$g|_UFG{yQ<#0)R_G}CoghcNfpgoo zlClNaWAu*sL&8V?+bV^pG_!MNZQkp!oPS+oRGARYxWaW!gOjUo7u4peJ#*ho8`fJKx8g9epE8_$SNO5Q!-$C0Js00h$k`4r zGN_r|@|G#AB<8WkWpcn&acZ0ohFBvd*2uMU(iw~ujrFp5%jl6f#@2fo7P@WCbhzBt z%4z(2<2lJadLx-W=5Hq{Bu7K1uo||LJaK9Vg1f7cMS>~WUPXQ?S-Av+`PZDXaqYAp z7JBZCrhgX+BlLr-8MzORs5tBsu4inYY}fgCDoQ=j$0}|5?aH1s5w#gG?_he-eMwmn zhUo0rOqBVHG(A|l$yxc7eaUj)BoBDP1ez-WU$D;nix~Ixnf_Y8PSM~@K+@6XFWec72Thar`m?lYr}n> z$~BH-Qhr!3VgIWR7=7a5z1bdm8Hu0j>JFYt{`2#rty$$Qf)2AT2WRVFodMQf&#Q;& z4jb32yKy1sRa=iwvJSR&y8mqr%D??*(W%5{tyNSib`!6-)<^M6y_(+BmOIQ8|r z4v`{jo2CZJ<8WFJqbWIibYuy_~7IwuZ(s&0tb# zdM!81J_-El^$C@|C|*<1*^Z~eKKESGD?A}^;Rx$lEfM~yu86gK9E~^JxuP-2raw`X zlX%FvQf}^pI=Yo0suAYr*tJLtcFgJD@F!{~2gL9a3yj)8%APbd>NO` zxJKL+u0^e~5-HLg`;ghj>Fnvwul>_|DN8L#Tl(P8^s!bo_Z{m2PP(osm-Az#hmR$KLbM_MbiKJZJaRi zp;6-b4bE!vCjN*L=Pthkzw7&Cg^~x)zZ!t^J!wjbAh3}TY;dq*!8gQne91_kT~-Do zEp4iQkMi_%pi@ZK_puqTI_LGog%9H??Isj=rV)C{S&^HLlb0Td?E54K;B>s4j!-TI z#RuAZi|;~lzbRYJ?HeKv5%+SqHzbnaWwxsg(Hmzd>#M11$Gp6c&1S4yAB&KA5jc1< zu!twXzOb0|R=CYJo3&!^LjLq)))zc^hw1&8s;-(47?gqa4-mZT|u50Es?zs!>vKGSGm+LOv9z)uqQ9|iY$I5K0+ht|0 z=XdX~Q5Bo#z^aKMuiA{M`ykkhSnp>Iu(`rlXUL0$2i4Nm`4Y33Q@|u|T(^oB)(qEv`KgGq{;le4S@zJoMGx+O3Y^KV#qVD)cp0LfdP$luwY(Urn7a z^1J#v&ymql+9A1@A&{s3lyoX0@IBk?Wbf+<94Ft{Zlz3 znZ}a{l`*ECD9PJ-nfuzxcc>favir#@ZEo(q-GAb!+}wd>Q5i{hx!AT2(`@d`YO3mq z`O!!^t)_0qzD)H8S6tn95@ zHNcXZqc7@?9&gYoOIhg-XSZTFdy7vjP3|w)`1xS>Jc9PepDW!E8!#VK5zQb?%u0DY zXmn30xtV;@WMsBQEHIAm-K_a59@*XNO_p7xw%RC>Vu73QKbKfa@~z~$MP(X)J;uU$ zLPt04zPP^6y+lq0-!`-5-_J9HHH~H?JXLS`r$qBcxVKO-k!VEfI6=Msv)IU#t8cH? zkvB>y+`Z9%S*i4S6Z!JZXKepOWW84kK0nCT8O9RAQ?pWG|#P zDI-eWCi0l>_F|B{lV6Vz-C76cEkw1@xXu)R5oUO+KRmIrb5{USq4vx2fz z&BI3X(+N!K&oN+0qIeD59oW;f_ab_4nrIgx!(a`y6}f!AsK0aQyo(yDYer+-+#q<{X|1u* z1!fsgq-4$g_>p;iUE#G@^isN9l|uP7A&LW9O5q7A6lOeOO}u2vwMt2ZXOf~z|k{_S40S-PC5wyn4$`&-fB z=(1bR#!4bgNK53H33`d(W_k0M>%WNS4GhEOZr1XSie!CfWT*FTO?<&3X^D2IOO<;!M=%{WWEb2ItS)x?*Z#rlZ7J35chFn- z`otANh{PA65SRV^D`bAAKz*Pz#-QP3OEXy{GoEocn%Hk1U7h2DYQ zg~mYvlnjlBCO{LRNzi-HWM~RB6`BT3hh{)Cp%mzSXcjaZngh*+=0WqJ1<(i3LTC}R MxRqJ5%05L^cX#fBK diff --git a/mwmbl/main.py b/mwmbl/main.py index be60cb3..4d7b499 100644 --- a/mwmbl/main.py +++ b/mwmbl/main.py @@ -53,6 +53,7 @@ def run(): new_item_queue = Queue() queued_batches = Queue() + # curation_queue = Queue() if args.background: Process(target=background.run, args=(args.data,)).start() @@ -84,7 +85,7 @@ def run(): crawler_router = crawler.get_router(batch_cache, queued_batches) app.include_router(crawler_router) - user_router = user.create_router() + user_router = user.create_router(index_path) app.include_router(user_router) # Initialize uvicorn server using global app instance and server config params diff --git a/mwmbl/platform/user.py b/mwmbl/platform/user.py index ed9fd85..175897a 100644 --- a/mwmbl/platform/user.py +++ b/mwmbl/platform/user.py @@ -1,16 +1,19 @@ import json import os from typing import TypeVar, Generic -from urllib.parse import urljoin +from urllib.parse import urljoin, parse_qs import requests from fastapi import APIRouter, Response from pydantic import BaseModel +from mwmbl.tinysearchengine.indexer import TinyIndex, Document from mwmbl.tokenizer import tokenize + LEMMY_URL = os.environ["LEMMY_URL"] RESULT_URL = "https://mwmbl.org/?q=" +MAX_CURATED_SCORE = 1_111_111.0 class Register(BaseModel): @@ -62,10 +65,12 @@ T = TypeVar('T', CurateAdd, CurateDelete, CurateMove, CurateValidate) class Curation(BaseModel, Generic[T]): auth: str curation_id: int + url: str + results: list[Result] curation: T -def create_router() -> APIRouter: +def create_router(index_path: str) -> APIRouter: router = APIRouter(prefix="/user", tags=["user"]) community_id = get_community_id() @@ -114,22 +119,25 @@ def create_router() -> APIRouter: @router.post("/curation/move") def user_move_result(curate_move: Curation[CurateMove]): - return _create_comment("curate_move", curate_move) + return _curate("curate_move", curate_move) @router.post("/curation/delete") def user_delete_result(curate_delete: Curation[CurateDelete]): - return _create_comment("curate_delete", curate_delete) + return _curate("curate_delete", curate_delete) @router.post("/curation/add") def user_add_result(curate_add: Curation[CurateAdd]): - return _create_comment("curate_add", curate_add) + return _curate("curate_add", curate_add) @router.post("/curation/validate") def user_add_result(curate_validate: Curation[CurateValidate]): - return _create_comment("curate_validate", curate_validate) + return _curate("curate_validate", curate_validate) - def _create_comment(curation_type: str, curation: Curation): - content = json.dumps({curation_type: curation.curation.dict()}, indent=2) + def _curate(curation_type: str, curation: Curation): + content = json.dumps({ + "curation_type": curation_type, + "curation": curation.curation.dict(), + }, indent=2) create_comment = { "auth": curation.auth, "content": json.dumps(content, indent=2), @@ -139,6 +147,32 @@ def create_router() -> APIRouter: "post_id": curation.curation_id, } request = requests.post(urljoin(LEMMY_URL, "api/v3/comment"), json=create_comment) + + with TinyIndex(Document, index_path, 'w') as indexer: + documents = [ + Document(result.title, result.url, result.extract, MAX_CURATED_SCORE - i) + for i, result in enumerate(curation.results) + ] + + query_string = parse_qs(curation.url) + if len(query_string) > 1: + raise ValueError(f"Should be one query string in the URL: {curation.url}") + + queries = next(iter(query_string.values())) + if len(queries) > 1: + raise ValueError(f"Should be one query value in the URL: {curation.url}") + + query = queries[0] + print("Query", query) + tokens = tokenize(query) + print("Tokens", tokens) + key = " ".join(tokens) + print("Key", key) + page_index = indexer.get_key_page_index(key) + print("Page index", page_index) + print("Storing documents", documents) + indexer.store_in_page(page_index, documents) + return Response(content=request.content, status_code=request.status_code, media_type="text/json") return router diff --git a/mwmbl/tinysearchengine/rank.py b/mwmbl/tinysearchengine/rank.py index 84e1588..fba9a19 100644 --- a/mwmbl/tinysearchengine/rank.py +++ b/mwmbl/tinysearchengine/rank.py @@ -6,6 +6,7 @@ from operator import itemgetter from urllib.parse import urlparse from mwmbl.format import format_result_with_pattern, get_query_regex +from mwmbl.platform.user import MAX_CURATED_SCORE from mwmbl.tokenizer import tokenize, get_bigrams from mwmbl.tinysearchengine.completer import Completer from mwmbl.hn_top_domains_filtered import DOMAINS @@ -149,6 +150,7 @@ class Ranker: def get_results(self, q): terms = tokenize(q) + is_complete = q.endswith(' ') if len(terms) > 0 and not is_complete: completions = self.completer.complete(terms[-1]) @@ -157,12 +159,23 @@ class Ranker: completions = [] retrieval_terms = set(terms) + # Check for curation + curation_term = " ".join(terms) + curation_items = self.tiny_index.retrieve(curation_term) + + # TODO: find a better way to track curated pages + if curation_items[0].score == MAX_CURATED_SCORE: + return curation_items, terms, completions + bigrams = set(get_bigrams(len(terms), terms)) pages = [] seen_items = set() for term in retrieval_terms | bigrams: - items = self.tiny_index.retrieve(term) + if term == curation_term: + items = curation_items + else: + items = self.tiny_index.retrieve(term) if items is not None: for item in items: # if term in item.title.lower() or term in item.extract.lower():