Keep track of curated documents

This commit is contained in:
Daoud Clarke 2023-04-30 18:25:48 +01:00
parent f0592f99df
commit 8d64af4f1b
6 changed files with 684 additions and 571 deletions

Binary file not shown.

View file

@@ -166,7 +166,7 @@ def create_router(index_path: str) -> APIRouter:
print("Key", term)
documents = [
Document(result.title, result.url, result.extract, MAX_CURATED_SCORE - i, term, DocumentState.CURATED.value)
Document(result.title, result.url, result.extract, MAX_CURATED_SCORE - i, term, result.curated)
for i, result in enumerate(curation.results)
]
page_index = indexer.get_key_page_index(term)

View file

@@ -31,7 +31,8 @@ def astuple(dc):
class DocumentState(IntEnum):
CURATED = 1
CURATED = 0
VALIDATED = 1
@dataclass

View file

@@ -162,7 +162,8 @@ class Ranker:
# Check for curation
curation_term = " ".join(terms)
curation_items = self.tiny_index.retrieve(curation_term)
curated_items = [d for d in curation_items if d.state == DocumentState.CURATED and d.term == curation_term]
curated_items = [d for d in curation_items if d.state in {DocumentState.CURATED, DocumentState.VALIDATED}
and d.term == curation_term]
if len(curated_items) > 0:
return curated_items, terms, completions

1245
poetry.lock generated

File diff suppressed because it is too large Load diff

View file

@@ -20,6 +20,7 @@ psycopg2-binary = "^2.9.3"
spacy = "==3.2.1"
pytest = "^7.2.1"
pytest-mock = "^3.10.0"
jusText = "==3.0.0"
# Optional dependencies do not get installed by default. Look under tool.poetry.extras section
# to see which extras to use.
@@ -28,7 +29,6 @@ warcio = {version= "==1.7.4", optional = true}
idna = {version= "==3.3", optional = true}
beautifulsoup4 = {version= "==4.10.0", optional = true}
lxml = {version= "==4.6.4", optional = true}
jusText = {version= "==3.0.0", optional = true}
langdetect = {version= "==1.0.9", optional = true}
pyarrow = {version= "==6.0.0", optional = true}
pyspark = {version= "==3.2.0", optional = true}