commit 60980a6bc7
12 changed files with 1013 additions and 636 deletions
Binary file not shown.

@@ -7,12 +7,17 @@ from typing import Union
from uuid import uuid4

import boto3
import justext
import requests
from fastapi import HTTPException, APIRouter
from justext.core import html_to_dom, ParagraphMaker, classify_paragraphs, revise_paragraph_classification, \
    LENGTH_LOW_DEFAULT, STOPWORDS_LOW_DEFAULT, MAX_LINK_DENSITY_DEFAULT, NO_HEADINGS_DEFAULT, LENGTH_HIGH_DEFAULT, \
    STOPWORDS_HIGH_DEFAULT, MAX_HEADING_DISTANCE_DEFAULT, DEFAULT_ENCODING, DEFAULT_ENC_ERRORS, preprocessor

from mwmbl.crawler.batch import Batch, NewBatchRequest, HashedBatch
from mwmbl.crawler.urls import URLDatabase, FoundURL, URLStatus
from mwmbl.database import Database
from mwmbl.format import format_result
from mwmbl.indexer.batch_cache import BatchCache
from mwmbl.indexer.indexdb import IndexDatabase, BatchInfo, BatchStatus
from mwmbl.settings import (

@@ -26,7 +31,8 @@ from mwmbl.settings import (
    PUBLIC_URL_PREFIX,
    PUBLIC_USER_ID_LENGTH,
    FILE_NAME_SUFFIX,
-    DATE_REGEX)
+    DATE_REGEX, NUM_EXTRACT_CHARS, NUM_TITLE_CHARS)
from mwmbl.tinysearchengine.indexer import Document
from mwmbl.url_queue import URLQueue


@@ -45,9 +51,54 @@ def upload(data: bytes, name: str):
last_batch = None


def justext_with_dom(html_text, stoplist, length_low=LENGTH_LOW_DEFAULT,
                     length_high=LENGTH_HIGH_DEFAULT, stopwords_low=STOPWORDS_LOW_DEFAULT,
                     stopwords_high=STOPWORDS_HIGH_DEFAULT, max_link_density=MAX_LINK_DENSITY_DEFAULT,
                     max_heading_distance=MAX_HEADING_DISTANCE_DEFAULT, no_headings=NO_HEADINGS_DEFAULT,
                     encoding=None, default_encoding=DEFAULT_ENCODING,
                     enc_errors=DEFAULT_ENC_ERRORS):
    """
    Converts an HTML page into a list of classified paragraphs. Each paragraph
    is represented as instance of class ``justext.paragraph.Paragraph``.
    """
    dom = html_to_dom(html_text, default_encoding, encoding, enc_errors)

    titles = dom.xpath("//title")
    title = titles[0].text if len(titles) > 0 else None

    dom = preprocessor(dom)

    paragraphs = ParagraphMaker.make_paragraphs(dom)

    classify_paragraphs(paragraphs, stoplist, length_low, length_high,
                        stopwords_low, stopwords_high, max_link_density, no_headings)
    revise_paragraph_classification(paragraphs, max_heading_distance)

    return paragraphs, title


def get_router(batch_cache: BatchCache, queued_batches: Queue):
    router = APIRouter(prefix="/crawler", tags=["crawler"])

    @router.on_event("startup")
    async def on_startup():
        with Database() as db:
            url_db = URLDatabase(db.connection)
            return url_db.create_tables()

    @router.get('/fetch')
    def fetch_url(url: str, query: str):
        response = requests.get(url)
        paragraphs, title = justext_with_dom(response.content, justext.get_stoplist("English"))
        good_paragraphs = [p for p in paragraphs if p.class_type == 'good']

        extract = ' '.join([p.text for p in good_paragraphs])
        if len(extract) > NUM_EXTRACT_CHARS:
            extract = extract[:NUM_EXTRACT_CHARS - 1] + '…'

        result = Document(title=title, url=url, extract=extract, score=0.0)
        return format_result(result, query)

    @router.post('/batches/')
    def post_batch(batch: Batch):
        if len(batch.items) > MAX_BATCH_SIZE:
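
Editor's note: a quick way to see what the new /fetch extraction does is to call justext_with_dom directly. A minimal sketch, not part of the diff, assuming the hunk above lives in mwmbl/crawler/app.py (as the crawler import in main.py further down suggests) and using an arbitrary example URL:

import justext
import requests

from mwmbl.crawler.app import justext_with_dom  # module path assumed from the imports in main.py

response = requests.get("https://example.com")  # arbitrary page for illustration
paragraphs, title = justext_with_dom(response.content, justext.get_stoplist("English"))
good_paragraphs = [p.text for p in paragraphs if p.class_type == 'good']
print(title)
print(' '.join(good_paragraphs)[:155])  # same truncation budget as NUM_EXTRACT_CHARS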

mwmbl/format.py (new file, 40 lines)

@@ -0,0 +1,40 @@
import re

from mwmbl.tokenizer import tokenize


def format_result_with_pattern(pattern, result):
    formatted_result = {}
    for content_type, content in [('title', result.title), ('extract', result.extract)]:
        matches = re.finditer(pattern, content, re.IGNORECASE)
        all_spans = [0] + sum((list(m.span()) for m in matches), []) + [len(content)]
        content_result = []
        for i in range(len(all_spans) - 1):
            is_bold = i % 2 == 1
            start = all_spans[i]
            end = all_spans[i + 1]
            content_result.append({'value': content[start:end], 'is_bold': is_bold})
        formatted_result[content_type] = content_result
    formatted_result['url'] = result.url
    return formatted_result


def get_query_regex(terms, is_complete, is_url):
    if not terms:
        return ''

    word_sep = r'\b' if is_url else ''
    if is_complete:
        term_patterns = [rf'{word_sep}{re.escape(term)}{word_sep}' for term in terms]
    else:
        term_patterns = [rf'{word_sep}{re.escape(term)}{word_sep}' for term in terms[:-1]] + [
            rf'{word_sep}{re.escape(terms[-1])}']
    pattern = '|'.join(term_patterns)
    return pattern


def format_result(result, query):
    tokens = tokenize(query)
    pattern = get_query_regex(tokens, True, False)
    return format_result_with_pattern(pattern, result)
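
Editor's note: format_result splits the title and extract into alternating plain and highlighted segments. A minimal sketch, not part of the diff, with made-up values:

from mwmbl.format import format_result
from mwmbl.tinysearchengine.indexer import Document

doc = Document(title="Python tutorial", url="https://example.com/", extract="Learn Python fast", score=1.0)
print(format_result(doc, "python"))
# Illustrative output: even-numbered segments are plain, odd-numbered ones are bold matches, e.g.
# {'title': [{'value': '', 'is_bold': False}, {'value': 'Python', 'is_bold': True},
#            {'value': ' tutorial', 'is_bold': False}], 'extract': [...], 'url': 'https://example.com/'}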

@@ -77,18 +77,3 @@ def tokenize_document(url, title_cleaned, extract, score, nlp):
    # print("High scoring", len(high_scoring_tokens), token_scores, doc)
    document = TokenizedDocument(tokens=list(tokens), url=url, title=title_cleaned, extract=extract, score=score)
    return document
-
-
-def index_titles_urls_and_extracts(indexer: TinyIndex, nlp, titles_urls_and_extracts, link_counts, terms_path):
-    terms = Counter()
-    pages = get_pages(nlp, titles_urls_and_extracts, link_counts)
-    for page in pages:
-        for token in page.tokens:
-            indexer.index(token, Document(url=page.url, title=page.title, extract=page.extract, score=page.score))
-        terms.update([t.lower() for t in page.tokens])
-
-    term_df = pd.DataFrame({
-        'term': terms.keys(),
-        'count': terms.values(),
-    })
-    term_df.to_csv(terms_path)

@@ -6,11 +6,13 @@ from pathlib import Path

import uvicorn
from fastapi import FastAPI
from starlette.middleware.cors import CORSMiddleware

from mwmbl import background
from mwmbl.crawler import app as crawler
from mwmbl.indexer.batch_cache import BatchCache
from mwmbl.indexer.paths import INDEX_NAME, BATCH_DIR_NAME
from mwmbl.platform import user
from mwmbl.indexer.update_urls import update_urls_continuously
from mwmbl.tinysearchengine import search
from mwmbl.tinysearchengine.completer import Completer

@@ -51,6 +53,7 @@ def run():

    new_item_queue = Queue()
    queued_batches = Queue()
    # curation_queue = Queue()

    if args.background:
        Process(target=background.run, args=(args.data,)).start()

@@ -67,6 +70,14 @@ def run():
    # Initialize FastApi instance
    app = FastAPI()

    app.add_middleware(
        CORSMiddleware,
        allow_origins=["*"],
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )

    search_router = search.create_router(ranker)
    app.include_router(search_router)

@@ -74,6 +85,9 @@ def run():
    crawler_router = crawler.get_router(batch_cache, queued_batches)
    app.include_router(crawler_router)

    user_router = user.create_router(index_path)
    app.include_router(user_router)

    # Initialize uvicorn server using global app instance and server config params
    uvicorn.run(app, host="0.0.0.0", port=args.port)

mwmbl/platform/__init__.py (new file, empty)

mwmbl/platform/user.py (new file, 187 lines)

@@ -0,0 +1,187 @@
import json
import os
from typing import TypeVar, Generic
from urllib.parse import urljoin, parse_qs

import requests
from fastapi import APIRouter, Response
from pydantic import BaseModel

from mwmbl.tinysearchengine.indexer import TinyIndex, Document, DocumentState
from mwmbl.tokenizer import tokenize


LEMMY_URL = os.environ["LEMMY_URL"]
RESULT_URL = "https://mwmbl.org/?q="
MAX_CURATED_SCORE = 1_111_111.0


class Register(BaseModel):
    username: str
    email: str
    password: str
    password_verify: str


class Login(BaseModel):
    username_or_email: str
    password: str


class Result(BaseModel):
    url: str
    title: str
    extract: str
    curated: bool


class BeginCurate(BaseModel):
    auth: str
    url: str
    results: list[Result]


class CurateMove(BaseModel):
    old_index: int
    new_index: int


class CurateDelete(BaseModel):
    delete_index: int


class CurateAdd(BaseModel):
    insert_index: int
    url: str


class CurateValidate(BaseModel):
    validate_index: int
    is_validated: bool


T = TypeVar('T', CurateAdd, CurateDelete, CurateMove, CurateValidate)


class Curation(BaseModel, Generic[T]):
    auth: str
    curation_id: int
    url: str
    results: list[Result]
    curation: T


def create_router(index_path: str) -> APIRouter:
    router = APIRouter(prefix="/user", tags=["user"])

    community_id = get_community_id()

    @router.post("/register")
    def user_register(register: Register) -> Response:
        lemmy_register = {
            "username": register.username,
            "email": register.email,
            "password": register.password,
            "password_verify": register.password_verify,
            "answer": None,
            "captcha_answer": None,
            "captcha_uuid": None,
            "honeypot": None,
            "show_nsfw": False,
        }
        request = requests.post(urljoin(LEMMY_URL, "api/v3/user/register"), json=lemmy_register)
        return Response(content=request.content, status_code=request.status_code, media_type="text/json")

    @router.post("/login")
    def user_login(login: Login) -> Response:
        request = requests.post(urljoin(LEMMY_URL, "api/v3/user/login"), json=login.dict())
        return Response(content=request.content, status_code=request.status_code, media_type="text/json")

    @router.post("/curation/begin")
    def user_begin_curate(begin_curate: BeginCurate):
        results = begin_curate.dict()["results"]
        body = json.dumps({"original_results": results}, indent=2)
        create_post = {
            "auth": begin_curate.auth,
            "body": body,
            "community_id": community_id,
            "honeypot": None,
            "language_id": None,
            "name": begin_curate.url,
            "nsfw": None,
            "url": begin_curate.url,
        }
        request = requests.post(urljoin(LEMMY_URL, "api/v3/post"), json=create_post)
        if request.status_code != 200:
            return Response(content=request.content, status_code=request.status_code, media_type="text/json")
        data = request.json()
        curation_id = data["post_view"]["post"]["id"]
        return {"curation_id": curation_id}

    @router.post("/curation/move")
    def user_move_result(curate_move: Curation[CurateMove]):
        return _curate("curate_move", curate_move)

    @router.post("/curation/delete")
    def user_delete_result(curate_delete: Curation[CurateDelete]):
        return _curate("curate_delete", curate_delete)

    @router.post("/curation/add")
    def user_add_result(curate_add: Curation[CurateAdd]):
        return _curate("curate_add", curate_add)

    @router.post("/curation/validate")
    def user_add_result(curate_validate: Curation[CurateValidate]):
        return _curate("curate_validate", curate_validate)

    def _curate(curation_type: str, curation: Curation):
        content = json.dumps({
            "curation_type": curation_type,
            "curation": curation.curation.dict(),
        }, indent=2)
        create_comment = {
            "auth": curation.auth,
            "content": json.dumps(content, indent=2),
            "form_id": None,
            "language_id": None,
            "parent_id": None,
            "post_id": curation.curation_id,
        }
        request = requests.post(urljoin(LEMMY_URL, "api/v3/comment"), json=create_comment)

        with TinyIndex(Document, index_path, 'w') as indexer:
            query_string = parse_qs(curation.url)
            if len(query_string) > 1:
                raise ValueError(f"Should be one query string in the URL: {curation.url}")

            queries = next(iter(query_string.values()))
            if len(queries) > 1:
                raise ValueError(f"Should be one query value in the URL: {curation.url}")

            query = queries[0]
            print("Query", query)
            tokens = tokenize(query)
            print("Tokens", tokens)
            term = " ".join(tokens)
            print("Key", term)

            documents = [
                Document(result.title, result.url, result.extract, MAX_CURATED_SCORE - i, term, result.curated)
                for i, result in enumerate(curation.results)
            ]
            page_index = indexer.get_key_page_index(term)
            print("Page index", page_index)
            print("Storing documents", documents)
            indexer.store_in_page(page_index, documents)

        return Response(content=request.content, status_code=request.status_code, media_type="text/json")

    return router


def get_community_id() -> str:
    request = requests.get(urljoin(LEMMY_URL, "api/v3/community?name=main"))
    community = request.json()
    return community["community_view"]["community"]["id"]
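
Editor's note: a client for the curation endpoints above would log in, begin a curation, then post edits. A minimal, hypothetical sketch, not part of the diff; the base URL, auth token, and ids are assumptions, and the server needs LEMMY_URL pointing at a running Lemmy instance:

import requests

BASE = "http://localhost:5000"  # assumption: wherever the mwmbl FastAPI app is served

payload = {
    "auth": "<jwt from /user/login>",           # assumption
    "curation_id": 123,                         # id returned by /user/curation/begin
    "url": "https://mwmbl.org/?q=python",       # a single query parameter, as _curate expects
    "results": [
        {"url": "https://docs.python.org/", "title": "Python documentation",
         "extract": "Official Python documentation.", "curated": True},
    ],
    "curation": {"old_index": 1, "new_index": 0},  # CurateMove body
}
response = requests.post(BASE + "/user/curation/move", json=payload)
print(response.status_code, response.text)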

@@ -23,6 +23,9 @@ DATE_REGEX = re.compile(r'\d{4}-\d{2}-\d{2}')
PUBLIC_URL_PREFIX = f'https://f004.backblazeb2.com/file/{BUCKET_NAME}/'
FILE_NAME_SUFFIX = '.json.gz'

NUM_TITLE_CHARS = 65
NUM_EXTRACT_CHARS = 155

SCORE_FOR_ROOT_PATH = 0.1
SCORE_FOR_DIFFERENT_DOMAIN = 1.0
SCORE_FOR_SAME_DOMAIN = 0.01

@@ -1,10 +1,11 @@
import json
import os
-from dataclasses import dataclass, asdict
-from io import UnsupportedOperation, BytesIO
+from dataclasses import dataclass, asdict, field
+from enum import IntEnum
+from io import UnsupportedOperation
from logging import getLogger
from mmap import mmap, PROT_READ, PROT_WRITE
-from typing import TypeVar, Generic, Callable, List
+from typing import TypeVar, Generic, Callable, List, Optional

import mmh3
from zstandard import ZstdDecompressor, ZstdCompressor, ZstdError

@@ -20,7 +21,18 @@ logger = getLogger(__name__)


def astuple(dc):
-    return tuple(dc.__dict__.values())
+    """
+    Convert a type to a tuple - values at the end that are None can be truncated.
+    """
+    value = tuple(dc.__dict__.values())
+    while value[-1] is None:
+        value = value[:-1]
+    return value


class DocumentState(IntEnum):
    CURATED = 0
    VALIDATED = 1


@dataclass

@@ -29,11 +41,13 @@ class Document:
    url: str
    extract: str
    score: float
    term: Optional[str] = None
    state: Optional[int] = None


@dataclass
class TokenizedDocument(Document):
-    tokens: List[str]
+    tokens: List[str] = field(default_factory=list)


T = TypeVar('T')

@@ -175,23 +189,6 @@ class TinyIndex(Generic[T]):
        # logger.debug(f"Decompressed data: {decompressed_data}")
        return json.loads(decompressed_data.decode('utf8'))

-    def index(self, key: str, value: T):
-        assert type(value) == self.item_factory, f"Can only index the specified type" \
-                                                 f" ({self.item_factory.__name__})"
-        page_index = self.get_key_page_index(key)
-        try:
-            self.add_to_page(page_index, [value])
-        except PageError:
-            pass
-
-    def add_to_page(self, page_index: int, values: list[T]):
-        current_page = self._get_page_tuples(page_index)
-        if current_page is None:
-            current_page = []
-        value_tuples = [astuple(value) for value in values]
-        current_page += value_tuples
-        self._write_page(current_page, page_index)

    def store_in_page(self, page_index: int, values: list[T]):
        value_tuples = [astuple(value) for value in values]
        self._write_page(value_tuples, page_index)
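
Editor's note: the new astuple drops trailing None values, so documents without the new term/state fields keep their old four-element serialized shape. A minimal sketch, not part of the diff, with made-up values:

from mwmbl.tinysearchengine.indexer import Document, DocumentState, astuple

plain = Document(title="t", url="u", extract="e", score=1.0)
curated = Document("t", "u", "e", 1_111_111.0, "python", DocumentState.CURATED)

print(astuple(plain))    # ('t', 'u', 'e', 1.0): trailing Nones truncated
print(astuple(curated))  # ('t', 'u', 'e', 1111111.0, 'python', <DocumentState.CURATED: 0>)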

@@ -5,10 +5,12 @@ from logging import getLogger
from operator import itemgetter
from urllib.parse import urlparse

from mwmbl.format import format_result_with_pattern, get_query_regex
from mwmbl.platform.user import MAX_CURATED_SCORE
from mwmbl.tokenizer import tokenize, get_bigrams
from mwmbl.tinysearchengine.completer import Completer
from mwmbl.hn_top_domains_filtered import DOMAINS
-from mwmbl.tinysearchengine.indexer import TinyIndex, Document
+from mwmbl.tinysearchengine.indexer import TinyIndex, Document, DocumentState

logger = getLogger(__name__)


@@ -21,20 +23,6 @@ DOMAIN_SCORE_SMOOTHING = 50
HTTPS_STRING = 'https://'


-def _get_query_regex(terms, is_complete, is_url):
-    if not terms:
-        return ''
-
-    word_sep = r'\b' if is_url else ''
-    if is_complete:
-        term_patterns = [rf'{word_sep}{re.escape(term)}{word_sep}' for term in terms]
-    else:
-        term_patterns = [rf'{word_sep}{re.escape(term)}{word_sep}' for term in terms[:-1]] + [
-            rf'{word_sep}{re.escape(terms[-1])}']
-    pattern = '|'.join(term_patterns)
-    return pattern
-
-
def score_result(terms: list[str], result: Document, is_complete: bool):
    features = get_features(terms, result.title, result.url, result.extract, result.score, is_complete)

@@ -93,7 +81,7 @@ def get_domain_score(url):


def get_match_features(terms, result_string, is_complete, is_url):
-    query_regex = _get_query_regex(terms, is_complete, is_url)
+    query_regex = get_query_regex(terms, is_complete, is_url)
    matches = list(re.finditer(query_regex, result_string, flags=re.IGNORECASE))
    # match_strings = {x.group(0).lower() for x in matches}
    # match_length = sum(len(x) for x in match_strings)

@@ -135,21 +123,10 @@ class Ranker:
        results, terms, _ = self.get_results(s)

        is_complete = s.endswith(' ')
-        pattern = _get_query_regex(terms, is_complete, False)
+        pattern = get_query_regex(terms, is_complete, False)
        formatted_results = []
        for result in results:
-            formatted_result = {}
-            for content_type, content in [('title', result.title), ('extract', result.extract)]:
-                matches = re.finditer(pattern, content, re.IGNORECASE)
-                all_spans = [0] + sum((list(m.span()) for m in matches), []) + [len(content)]
-                content_result = []
-                for i in range(len(all_spans) - 1):
-                    is_bold = i % 2 == 1
-                    start = all_spans[i]
-                    end = all_spans[i + 1]
-                    content_result.append({'value': content[start:end], 'is_bold': is_bold})
-                formatted_result[content_type] = content_result
-            formatted_result['url'] = result.url
+            formatted_result = format_result_with_pattern(pattern, result)
            formatted_results.append(formatted_result)

        logger.info("Return results: %r", formatted_results)

@@ -173,6 +150,7 @@ class Ranker:

    def get_results(self, q):
        terms = tokenize(q)

        is_complete = q.endswith(' ')
        if len(terms) > 0 and not is_complete:
            completions = self.completer.complete(terms[-1])

@@ -181,12 +159,23 @@ class Ranker:
            completions = []
            retrieval_terms = set(terms)

        # Check for curation
        curation_term = " ".join(terms)
        curation_items = self.tiny_index.retrieve(curation_term)
        curated_items = [d for d in curation_items if d.state in {DocumentState.CURATED, DocumentState.VALIDATED}
                         and d.term == curation_term]
        if len(curated_items) > 0:
            return curated_items, terms, completions

        bigrams = set(get_bigrams(len(terms), terms))

        pages = []
        seen_items = set()
        for term in retrieval_terms | bigrams:
-            items = self.tiny_index.retrieve(term)
+            if term == curation_term:
+                items = curation_items
+            else:
+                items = self.tiny_index.retrieve(term)
            if items is not None:
                for item in items:
                    # if term in item.title.lower() or term in item.extract.lower():
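
Editor's note: the curation check in get_results only short-circuits when a stored document carries the matching term and a curated or validated state. A minimal sketch, not part of the diff, reproducing that filter on plain Document values:

from mwmbl.tinysearchengine.indexer import Document, DocumentState

curation_term = "python"
curation_items = [
    Document("Python documentation", "https://docs.python.org/", "Official docs.", 1_111_111.0,
             "python", DocumentState.CURATED),
    Document("Some other page", "https://example.com/", "Unrelated extract.", 5.0),
]
curated_items = [d for d in curation_items
                 if d.state in {DocumentState.CURATED, DocumentState.VALIDATED} and d.term == curation_term]
print(curated_items)  # only the curated document; get_results would return these directly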

poetry.lock (generated, 1245 lines)
File diff suppressed because it is too large

@@ -20,6 +20,7 @@ psycopg2-binary = "^2.9.3"
spacy = "==3.2.1"
pytest = "^7.2.1"
pytest-mock = "^3.10.0"
jusText = "==3.0.0"

# Optional dependencies do not get installed by default. Look under tool.poetry.extras section
# to see which extras to use.

@@ -28,7 +29,6 @@ warcio = {version= "==1.7.4", optional = true}
idna = {version= "==3.3", optional = true}
beautifulsoup4 = {version= "==4.10.0", optional = true}
lxml = {version= "==4.6.4", optional = true}
-jusText = {version= "==3.0.0", optional = true}
langdetect = {version= "==1.0.9", optional = true}
pyarrow = {version= "==6.0.0", optional = true}
pyspark = {version= "==3.2.0", optional = true}