Merge pull request #100 from mwmbl/user-registration

User registration
Daoud Clarke 2023-04-30 20:31:09 +01:00 committed by GitHub
commit 60980a6bc7
12 changed files with 1013 additions and 636 deletions



@@ -7,12 +7,17 @@ from typing import Union
from uuid import uuid4
import boto3
import justext
import requests
from fastapi import HTTPException, APIRouter
from justext.core import html_to_dom, ParagraphMaker, classify_paragraphs, revise_paragraph_classification, \
LENGTH_LOW_DEFAULT, STOPWORDS_LOW_DEFAULT, MAX_LINK_DENSITY_DEFAULT, NO_HEADINGS_DEFAULT, LENGTH_HIGH_DEFAULT, \
STOPWORDS_HIGH_DEFAULT, MAX_HEADING_DISTANCE_DEFAULT, DEFAULT_ENCODING, DEFAULT_ENC_ERRORS, preprocessor
from mwmbl.crawler.batch import Batch, NewBatchRequest, HashedBatch
from mwmbl.crawler.urls import URLDatabase, FoundURL, URLStatus
from mwmbl.database import Database
from mwmbl.format import format_result
from mwmbl.indexer.batch_cache import BatchCache
from mwmbl.indexer.indexdb import IndexDatabase, BatchInfo, BatchStatus
from mwmbl.settings import (
@@ -26,7 +31,8 @@ from mwmbl.settings import (
PUBLIC_URL_PREFIX,
PUBLIC_USER_ID_LENGTH,
FILE_NAME_SUFFIX,
DATE_REGEX)
DATE_REGEX, NUM_EXTRACT_CHARS, NUM_TITLE_CHARS)
from mwmbl.tinysearchengine.indexer import Document
from mwmbl.url_queue import URLQueue
@@ -45,9 +51,54 @@ def upload(data: bytes, name: str):
last_batch = None
def justext_with_dom(html_text, stoplist, length_low=LENGTH_LOW_DEFAULT,
length_high=LENGTH_HIGH_DEFAULT, stopwords_low=STOPWORDS_LOW_DEFAULT,
stopwords_high=STOPWORDS_HIGH_DEFAULT, max_link_density=MAX_LINK_DENSITY_DEFAULT,
max_heading_distance=MAX_HEADING_DISTANCE_DEFAULT, no_headings=NO_HEADINGS_DEFAULT,
encoding=None, default_encoding=DEFAULT_ENCODING,
enc_errors=DEFAULT_ENC_ERRORS):
"""
Converts an HTML page into a list of classified paragraphs. Each paragraph
is represented as an instance of class ``justext.paragraph.Paragraph``.
"""
dom = html_to_dom(html_text, default_encoding, encoding, enc_errors)
titles = dom.xpath("//title")
title = titles[0].text if len(titles) > 0 else None
dom = preprocessor(dom)
paragraphs = ParagraphMaker.make_paragraphs(dom)
classify_paragraphs(paragraphs, stoplist, length_low, length_high,
stopwords_low, stopwords_high, max_link_density, no_headings)
revise_paragraph_classification(paragraphs, max_heading_distance)
return paragraphs, title
def get_router(batch_cache: BatchCache, queued_batches: Queue):
router = APIRouter(prefix="/crawler", tags=["crawler"])
@router.on_event("startup")
async def on_startup():
with Database() as db:
url_db = URLDatabase(db.connection)
return url_db.create_tables()
@router.get('/fetch')
def fetch_url(url: str, query: str):
response = requests.get(url)
paragraphs, title = justext_with_dom(response.content, justext.get_stoplist("English"))
good_paragraphs = [p for p in paragraphs if p.class_type == 'good']
extract = ' '.join([p.text for p in good_paragraphs])
if len(extract) > NUM_EXTRACT_CHARS:
extract = extract[:NUM_EXTRACT_CHARS - 1] + '…'
result = Document(title=title, url=url, extract=extract, score=0.0)
return format_result(result, query)
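# (Editor's sketch, not part of the diff.) Once the server is running, the new
# endpoint can be exercised with a plain GET, e.g.
#   GET /crawler/fetch?url=https://example.com&query=example
# It downloads the page, keeps the paragraphs jusText classifies as 'good',
# truncates the extract to NUM_EXTRACT_CHARS and returns the result formatted
# by mwmbl.format.format_result with the query terms marked as bold.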
@router.post('/batches/')
def post_batch(batch: Batch):
if len(batch.items) > MAX_BATCH_SIZE:

mwmbl/format.py (new file, 40 lines)

@@ -0,0 +1,40 @@
import re
from mwmbl.tokenizer import tokenize
def format_result_with_pattern(pattern, result):
formatted_result = {}
for content_type, content in [('title', result.title), ('extract', result.extract)]:
matches = re.finditer(pattern, content, re.IGNORECASE)
all_spans = [0] + sum((list(m.span()) for m in matches), []) + [len(content)]
content_result = []
for i in range(len(all_spans) - 1):
is_bold = i % 2 == 1
start = all_spans[i]
end = all_spans[i + 1]
content_result.append({'value': content[start:end], 'is_bold': is_bold})
formatted_result[content_type] = content_result
formatted_result['url'] = result.url
return formatted_result
def get_query_regex(terms, is_complete, is_url):
if not terms:
return ''
word_sep = r'\b' if is_url else ''
if is_complete:
term_patterns = [rf'{word_sep}{re.escape(term)}{word_sep}' for term in terms]
else:
term_patterns = [rf'{word_sep}{re.escape(term)}{word_sep}' for term in terms[:-1]] + [
rf'{word_sep}{re.escape(terms[-1])}']
pattern = '|'.join(term_patterns)
return pattern
def format_result(result, query):
tokens = tokenize(query)
pattern = get_query_regex(tokens, True, False)
return format_result_with_pattern(pattern, result)
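A minimal usage sketch of the new module (not part of the diff); the Document values are made-up examples, and the expected output assumes mwmbl.tokenizer.tokenize simply lower-cases and splits the query:

from mwmbl.format import format_result
from mwmbl.tinysearchengine.indexer import Document

doc = Document(title="Python tutorial", url="https://example.com/python",
               extract="Learn Python step by step.", score=1.0)
# format_result tokenizes the query, builds the regex via get_query_regex and
# splits the title and extract into alternating plain/bold spans around matches.
print(format_result(doc, "python"))
# {'title': [{'value': '', 'is_bold': False}, {'value': 'Python', 'is_bold': True},
#            {'value': ' tutorial', 'is_bold': False}],
#  'extract': [...], 'url': 'https://example.com/python'}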


@@ -77,18 +77,3 @@ def tokenize_document(url, title_cleaned, extract, score, nlp):
# print("High scoring", len(high_scoring_tokens), token_scores, doc)
document = TokenizedDocument(tokens=list(tokens), url=url, title=title_cleaned, extract=extract, score=score)
return document
def index_titles_urls_and_extracts(indexer: TinyIndex, nlp, titles_urls_and_extracts, link_counts, terms_path):
terms = Counter()
pages = get_pages(nlp, titles_urls_and_extracts, link_counts)
for page in pages:
for token in page.tokens:
indexer.index(token, Document(url=page.url, title=page.title, extract=page.extract, score=page.score))
terms.update([t.lower() for t in page.tokens])
term_df = pd.DataFrame({
'term': terms.keys(),
'count': terms.values(),
})
term_df.to_csv(terms_path)


@@ -6,11 +6,13 @@ from pathlib import Path
import uvicorn
from fastapi import FastAPI
from starlette.middleware.cors import CORSMiddleware
from mwmbl import background
from mwmbl.crawler import app as crawler
from mwmbl.indexer.batch_cache import BatchCache
from mwmbl.indexer.paths import INDEX_NAME, BATCH_DIR_NAME
from mwmbl.platform import user
from mwmbl.indexer.update_urls import update_urls_continuously
from mwmbl.tinysearchengine import search
from mwmbl.tinysearchengine.completer import Completer
@@ -51,6 +53,7 @@ def run():
new_item_queue = Queue()
queued_batches = Queue()
# curation_queue = Queue()
if args.background:
Process(target=background.run, args=(args.data,)).start()
@@ -67,6 +70,14 @@ def run():
# Initialize FastApi instance
app = FastAPI()
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
search_router = search.create_router(ranker)
app.include_router(search_router)
@@ -74,6 +85,9 @@ def run():
crawler_router = crawler.get_router(batch_cache, queued_batches)
app.include_router(crawler_router)
user_router = user.create_router(index_path)
app.include_router(user_router)
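# (Editor's note.) mwmbl.platform.user reads LEMMY_URL from the environment at
# import time, so the server now needs that variable set before this module loads.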
# Initialize uvicorn server using global app instance and server config params
uvicorn.run(app, host="0.0.0.0", port=args.port)


mwmbl/platform/user.py (new file, 187 lines)

@@ -0,0 +1,187 @@
import json
import os
from typing import TypeVar, Generic
from urllib.parse import urljoin, parse_qs
import requests
from fastapi import APIRouter, Response
from pydantic import BaseModel
from mwmbl.tinysearchengine.indexer import TinyIndex, Document, DocumentState
from mwmbl.tokenizer import tokenize
LEMMY_URL = os.environ["LEMMY_URL"]
RESULT_URL = "https://mwmbl.org/?q="
MAX_CURATED_SCORE = 1_111_111.0
class Register(BaseModel):
username: str
email: str
password: str
password_verify: str
class Login(BaseModel):
username_or_email: str
password: str
class Result(BaseModel):
url: str
title: str
extract: str
curated: bool
class BeginCurate(BaseModel):
auth: str
url: str
results: list[Result]
class CurateMove(BaseModel):
old_index: int
new_index: int
class CurateDelete(BaseModel):
delete_index: int
class CurateAdd(BaseModel):
insert_index: int
url: str
class CurateValidate(BaseModel):
validate_index: int
is_validated: bool
T = TypeVar('T', CurateAdd, CurateDelete, CurateMove, CurateValidate)
class Curation(BaseModel, Generic[T]):
auth: str
curation_id: int
url: str
results: list[Result]
curation: T
def create_router(index_path: str) -> APIRouter:
router = APIRouter(prefix="/user", tags=["user"])
community_id = get_community_id()
@router.post("/register")
def user_register(register: Register) -> Response:
lemmy_register = {
"username": register.username,
"email": register.email,
"password": register.password,
"password_verify": register.password_verify,
"answer": None,
"captcha_answer": None,
"captcha_uuid": None,
"honeypot": None,
"show_nsfw": False,
}
request = requests.post(urljoin(LEMMY_URL, "api/v3/user/register"), json=lemmy_register)
return Response(content=request.content, status_code=request.status_code, media_type="text/json")
@router.post("/login")
def user_login(login: Login) -> Response:
request = requests.post(urljoin(LEMMY_URL, "api/v3/user/login"), json=login.dict())
return Response(content=request.content, status_code=request.status_code, media_type="text/json")
@router.post("/curation/begin")
def user_begin_curate(begin_curate: BeginCurate):
results = begin_curate.dict()["results"]
body = json.dumps({"original_results": results}, indent=2)
create_post = {
"auth": begin_curate.auth,
"body": body,
"community_id": community_id,
"honeypot": None,
"language_id": None,
"name": begin_curate.url,
"nsfw": None,
"url": begin_curate.url,
}
request = requests.post(urljoin(LEMMY_URL, "api/v3/post"), json=create_post)
if request.status_code != 200:
return Response(content=request.content, status_code=request.status_code, media_type="text/json")
data = request.json()
curation_id = data["post_view"]["post"]["id"]
return {"curation_id": curation_id}
@router.post("/curation/move")
def user_move_result(curate_move: Curation[CurateMove]):
return _curate("curate_move", curate_move)
@router.post("/curation/delete")
def user_delete_result(curate_delete: Curation[CurateDelete]):
return _curate("curate_delete", curate_delete)
@router.post("/curation/add")
def user_add_result(curate_add: Curation[CurateAdd]):
return _curate("curate_add", curate_add)
@router.post("/curation/validate")
def user_validate_result(curate_validate: Curation[CurateValidate]):
return _curate("curate_validate", curate_validate)
def _curate(curation_type: str, curation: Curation):
content = json.dumps({
"curation_type": curation_type,
"curation": curation.curation.dict(),
}, indent=2)
create_comment = {
"auth": curation.auth,
"content": json.dumps(content, indent=2),
"form_id": None,
"language_id": None,
"parent_id": None,
"post_id": curation.curation_id,
}
request = requests.post(urljoin(LEMMY_URL, "api/v3/comment"), json=create_comment)
with TinyIndex(Document, index_path, 'w') as indexer:
query_string = parse_qs(curation.url)
if len(query_string) > 1:
raise ValueError(f"Should be one query string in the URL: {curation.url}")
queries = next(iter(query_string.values()))
if len(queries) > 1:
raise ValueError(f"Should be one query value in the URL: {curation.url}")
query = queries[0]
print("Query", query)
tokens = tokenize(query)
print("Tokens", tokens)
term = " ".join(tokens)
print("Key", term)
documents = [
Document(result.title, result.url, result.extract, MAX_CURATED_SCORE - i, term, result.curated)
for i, result in enumerate(curation.results)
]
page_index = indexer.get_key_page_index(term)
print("Page index", page_index)
print("Storing documents", documents)
indexer.store_in_page(page_index, documents)
return Response(content=request.content, status_code=request.status_code, media_type="text/json")
return router
def get_community_id() -> int:
request = requests.get(urljoin(LEMMY_URL, "api/v3/community?name=main"))
community = request.json()
return community["community_view"]["community"]["id"]
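A hedged usage sketch (not part of the diff) of driving the new endpoints with requests; the base URL, port and credentials are placeholders and assume a local mwmbl instance with LEMMY_URL pointing at a running Lemmy server:

import requests

BASE = "http://localhost:5000"  # assumed local instance; use whatever --port was given

# /user/register proxies straight through to Lemmy's /api/v3/user/register.
resp = requests.post(f"{BASE}/user/register", json={
    "username": "alice",
    "email": "alice@example.com",
    "password": "correct horse battery staple",
    "password_verify": "correct horse battery staple",
})
print(resp.status_code, resp.text)

# /user/login returns Lemmy's response, which contains the JWT to pass as `auth`
# to the curation endpoints (/user/curation/begin, /move, /delete, /add, /validate).
resp = requests.post(f"{BASE}/user/login", json={
    "username_or_email": "alice",
    "password": "correct horse battery staple",
})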


@@ -23,6 +23,9 @@ DATE_REGEX = re.compile(r'\d{4}-\d{2}-\d{2}')
PUBLIC_URL_PREFIX = f'https://f004.backblazeb2.com/file/{BUCKET_NAME}/'
FILE_NAME_SUFFIX = '.json.gz'
NUM_TITLE_CHARS = 65
NUM_EXTRACT_CHARS = 155
SCORE_FOR_ROOT_PATH = 0.1
SCORE_FOR_DIFFERENT_DOMAIN = 1.0
SCORE_FOR_SAME_DOMAIN = 0.01


@@ -1,10 +1,11 @@
import json
import os
from dataclasses import dataclass, asdict
from io import UnsupportedOperation, BytesIO
from dataclasses import dataclass, asdict, field
from enum import IntEnum
from io import UnsupportedOperation
from logging import getLogger
from mmap import mmap, PROT_READ, PROT_WRITE
from typing import TypeVar, Generic, Callable, List
from typing import TypeVar, Generic, Callable, List, Optional
import mmh3
from zstandard import ZstdDecompressor, ZstdCompressor, ZstdError
@@ -20,7 +21,18 @@ logger = getLogger(__name__)
def astuple(dc):
return tuple(dc.__dict__.values())
"""
Convert a dataclass instance to a tuple; trailing values that are None are truncated.
"""
value = tuple(dc.__dict__.values())
while value[-1] is None:
value = value[:-1]
return value
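# (Editor's note, illustrative only.) Dropping trailing Nones means documents with
# no curation data still serialize to the same 4-tuples as before this change:
#   astuple(Document(title="t", url="u", extract="e", score=1.0))
#     -> ("t", "u", "e", 1.0)
#   astuple(Document("t", "u", "e", 1.0, "foo bar", 0))   # 0 == DocumentState.CURATED
#     -> ("t", "u", "e", 1.0, "foo bar", 0)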
class DocumentState(IntEnum):
CURATED = 0
VALIDATED = 1
@dataclass
@@ -29,11 +41,13 @@ class Document:
url: str
extract: str
score: float
term: Optional[str] = None
state: Optional[int] = None
@dataclass
class TokenizedDocument(Document):
tokens: List[str]
tokens: List[str] = field(default_factory=list)
T = TypeVar('T')
@@ -175,23 +189,6 @@ class TinyIndex(Generic[T]):
# logger.debug(f"Decompressed data: {decompressed_data}")
return json.loads(decompressed_data.decode('utf8'))
def index(self, key: str, value: T):
assert type(value) == self.item_factory, f"Can only index the specified type" \
f" ({self.item_factory.__name__})"
page_index = self.get_key_page_index(key)
try:
self.add_to_page(page_index, [value])
except PageError:
pass
def add_to_page(self, page_index: int, values: list[T]):
current_page = self._get_page_tuples(page_index)
if current_page is None:
current_page = []
value_tuples = [astuple(value) for value in values]
current_page += value_tuples
self._write_page(current_page, page_index)
def store_in_page(self, page_index: int, values: list[T]):
value_tuples = [astuple(value) for value in values]
self._write_page(value_tuples, page_index)
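# (Editor's note.) Unlike the removed add_to_page, which appended to whatever the
# page already held, store_in_page replaces the page with exactly the given values;
# the curation flow in platform/user.py relies on this when it rewrites the stored
# results for a query.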


@@ -5,10 +5,12 @@ from logging import getLogger
from operator import itemgetter
from urllib.parse import urlparse
from mwmbl.format import format_result_with_pattern, get_query_regex
from mwmbl.platform.user import MAX_CURATED_SCORE
from mwmbl.tokenizer import tokenize, get_bigrams
from mwmbl.tinysearchengine.completer import Completer
from mwmbl.hn_top_domains_filtered import DOMAINS
from mwmbl.tinysearchengine.indexer import TinyIndex, Document
from mwmbl.tinysearchengine.indexer import TinyIndex, Document, DocumentState
logger = getLogger(__name__)
@@ -21,20 +23,6 @@ DOMAIN_SCORE_SMOOTHING = 50
HTTPS_STRING = 'https://'
def _get_query_regex(terms, is_complete, is_url):
if not terms:
return ''
word_sep = r'\b' if is_url else ''
if is_complete:
term_patterns = [rf'{word_sep}{re.escape(term)}{word_sep}' for term in terms]
else:
term_patterns = [rf'{word_sep}{re.escape(term)}{word_sep}' for term in terms[:-1]] + [
rf'{word_sep}{re.escape(terms[-1])}']
pattern = '|'.join(term_patterns)
return pattern
def score_result(terms: list[str], result: Document, is_complete: bool):
features = get_features(terms, result.title, result.url, result.extract, result.score, is_complete)
@@ -93,7 +81,7 @@ def get_domain_score(url):
def get_match_features(terms, result_string, is_complete, is_url):
query_regex = _get_query_regex(terms, is_complete, is_url)
query_regex = get_query_regex(terms, is_complete, is_url)
matches = list(re.finditer(query_regex, result_string, flags=re.IGNORECASE))
# match_strings = {x.group(0).lower() for x in matches}
# match_length = sum(len(x) for x in match_strings)
@@ -135,21 +123,10 @@ class Ranker:
results, terms, _ = self.get_results(s)
is_complete = s.endswith(' ')
pattern = _get_query_regex(terms, is_complete, False)
pattern = get_query_regex(terms, is_complete, False)
formatted_results = []
for result in results:
formatted_result = {}
for content_type, content in [('title', result.title), ('extract', result.extract)]:
matches = re.finditer(pattern, content, re.IGNORECASE)
all_spans = [0] + sum((list(m.span()) for m in matches), []) + [len(content)]
content_result = []
for i in range(len(all_spans) - 1):
is_bold = i % 2 == 1
start = all_spans[i]
end = all_spans[i + 1]
content_result.append({'value': content[start:end], 'is_bold': is_bold})
formatted_result[content_type] = content_result
formatted_result['url'] = result.url
formatted_result = format_result_with_pattern(pattern, result)
formatted_results.append(formatted_result)
logger.info("Return results: %r", formatted_results)
@@ -173,6 +150,7 @@ class Ranker:
def get_results(self, q):
terms = tokenize(q)
is_complete = q.endswith(' ')
if len(terms) > 0 and not is_complete:
completions = self.completer.complete(terms[-1])
@@ -181,12 +159,23 @@
completions = []
retrieval_terms = set(terms)
# Check for curation
curation_term = " ".join(terms)
curation_items = self.tiny_index.retrieve(curation_term)
curated_items = [d for d in curation_items if d.state in {DocumentState.CURATED, DocumentState.VALIDATED}
and d.term == curation_term]
if len(curated_items) > 0:
return curated_items, terms, completions
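# (Editor's note.) Curated results are stored under the space-joined query string
# (see _curate in platform/user.py), so an exact hit on that term returns the
# hand-ordered list, gated on DocumentState, and skips the scoring below.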
bigrams = set(get_bigrams(len(terms), terms))
pages = []
seen_items = set()
for term in retrieval_terms | bigrams:
items = self.tiny_index.retrieve(term)
if term == curation_term:
items = curation_items
else:
items = self.tiny_index.retrieve(term)
if items is not None:
for item in items:
# if term in item.title.lower() or term in item.extract.lower():

poetry.lock (generated, 1245 lines changed)

File diff suppressed because it is too large.


@@ -20,6 +20,7 @@ psycopg2-binary = "^2.9.3"
spacy = "==3.2.1"
pytest = "^7.2.1"
pytest-mock = "^3.10.0"
jusText = "==3.0.0"
# Optional dependencies do not get installed by default. Look under tool.poetry.extras section
# to see which extras to use.
@@ -28,7 +29,6 @@ warcio = {version= "==1.7.4", optional = true}
idna = {version= "==3.3", optional = true}
beautifulsoup4 = {version= "==4.10.0", optional = true}
lxml = {version= "==4.6.4", optional = true}
jusText = {version= "==3.0.0", optional = true}
langdetect = {version= "==1.0.9", optional = true}
pyarrow = {version= "==6.0.0", optional = true}
pyspark = {version= "==3.2.0", optional = true}