Keep track of curated documents

This commit is contained in:
Daoud Clarke 2023-04-30 18:25:48 +01:00
parent f0592f99df
commit 8d64af4f1b
6 changed files with 684 additions and 571 deletions

Binary file not shown.

View file

@ -166,7 +166,7 @@ def create_router(index_path: str) -> APIRouter:
print("Key", term) print("Key", term)
documents = [ documents = [
Document(result.title, result.url, result.extract, MAX_CURATED_SCORE - i, term, DocumentState.CURATED.value) Document(result.title, result.url, result.extract, MAX_CURATED_SCORE - i, term, result.curated)
for i, result in enumerate(curation.results) for i, result in enumerate(curation.results)
] ]
page_index = indexer.get_key_page_index(term) page_index = indexer.get_key_page_index(term)

View file

@ -31,7 +31,8 @@ def astuple(dc):
class DocumentState(IntEnum): class DocumentState(IntEnum):
CURATED = 1 CURATED = 0
VALIDATED = 1
@dataclass @dataclass

View file

@ -162,7 +162,8 @@ class Ranker:
# Check for curation # Check for curation
curation_term = " ".join(terms) curation_term = " ".join(terms)
curation_items = self.tiny_index.retrieve(curation_term) curation_items = self.tiny_index.retrieve(curation_term)
curated_items = [d for d in curation_items if d.state == DocumentState.CURATED and d.term == curation_term] curated_items = [d for d in curation_items if d.state in {DocumentState.CURATED, DocumentState.VALIDATED}
and d.term == curation_term]
if len(curated_items) > 0: if len(curated_items) > 0:
return curated_items, terms, completions return curated_items, terms, completions

1245
poetry.lock generated

File diff suppressed because it is too large Load diff

View file

@ -20,6 +20,7 @@ psycopg2-binary = "^2.9.3"
spacy = "==3.2.1" spacy = "==3.2.1"
pytest = "^7.2.1" pytest = "^7.2.1"
pytest-mock = "^3.10.0" pytest-mock = "^3.10.0"
jusText = "==3.0.0"
# Optional dependencies do not get installed by default. Look under tool.poetry.extras section # Optional dependencies do not get installed by default. Look under tool.poetry.extras section
# to see which extras to use. # to see which extras to use.
@ -28,7 +29,6 @@ warcio = {version= "==1.7.4", optional = true}
idna = {version= "==3.3", optional = true} idna = {version= "==3.3", optional = true}
beautifulsoup4 = {version= "==4.10.0", optional = true} beautifulsoup4 = {version= "==4.10.0", optional = true}
lxml = {version= "==4.6.4", optional = true} lxml = {version= "==4.6.4", optional = true}
jusText = {version= "==3.0.0", optional = true}
langdetect = {version= "==1.0.9", optional = true} langdetect = {version= "==1.0.9", optional = true}
pyarrow = {version= "==6.0.0", optional = true} pyarrow = {version= "==6.0.0", optional = true}
pyspark = {version= "==3.2.0", optional = true} pyspark = {version= "==3.2.0", optional = true}