Add the HN domain score as a weighted component of the ranking score

This commit is contained in:
Daoud Clarke 2021-12-30 22:20:10 +00:00
parent f347fe29ac
commit e6655101ef
5 changed files with 9 additions and 8 deletions

View file

@ -8,5 +8,5 @@ server_config:
index_config:
index_path: data/index.tinysearch
num_pages: 25600
page_size: 4096
num_pages: 256000
page_size: 40960

View file

@ -11,7 +11,7 @@ import pandas as pd
DATA_DIR = Path(os.environ['HOME']) / 'data' / 'tinysearch'
ALL_DOMAINS_PATH = DATA_DIR / 'hn-top-domains.csv'
TOP_DOMAINS_PATH = 'hn-top-domains-filtered.py'
TOP_DOMAINS_PATH = '../tinysearchengine/hn_top_domains_filtered.py'
MIN_COUNT = 10
PROBABILITY_THRESHOLD = 0.8

View file

@ -11,10 +11,6 @@ from pyspark.sql.types import StructType, StructField, StringType, LongType, Int
RECORDS_PATH = 's3://tinysearch/outputs/records'
OUTPUT_PATH = 's3://tinysearch/outputs/index'
NUM_PAGES = 1024
MAX_RESULTS_PER_HASH = 200
PAGE_SIZE = 4096
index_schema = StructType([
StructField("term_hash", LongType(), False),

View file

@ -2,11 +2,13 @@ import re
from logging import getLogger
from operator import itemgetter
from pathlib import Path
from urllib.parse import urlparse
from fastapi import FastAPI
from starlette.responses import FileResponse
from starlette.staticfiles import StaticFiles
from mwmbl.tinysearchengine.hn_top_domains_filtered import DOMAINS
from mwmbl.tinysearchengine.indexer import TinyIndex, Document
logger = getLogger(__name__)
@ -49,6 +51,9 @@ def create(tiny_index: TinyIndex):
return pattern
def score_result(terms, result: Document):
domain = urlparse(result.url).netloc
domain_score = DOMAINS.get(domain, 0.0)
result_string = f"{result.title.strip()} {result.extract.strip()}"
query_regex = get_query_regex(terms)
matches = list(re.finditer(query_regex, result_string, flags=re.IGNORECASE))
@ -64,7 +69,7 @@ def create(tiny_index: TinyIndex):
seen_matches.add(value)
total_possible_match_length = sum(len(x) for x in terms)
score = (match_length + 1./last_match_char) / (total_possible_match_length + 1)
score = 0.1*domain_score + 0.9*(match_length + 1./last_match_char) / (total_possible_match_length + 1)
return score
def order_results(terms: list[str], results: list[Document]):