Add the HN domain score as a weighted component of the ranking score
This commit is contained in:
parent
f347fe29ac
commit
e6655101ef
5 changed files with 9 additions and 8 deletions
config
mwmbl
|
@ -8,5 +8,5 @@ server_config:
|
|||
|
||||
index_config:
|
||||
index_path: data/index.tinysearch
|
||||
num_pages: 25600
|
||||
page_size: 4096
|
||||
num_pages: 256000
|
||||
page_size: 40960
|
|
@ -11,7 +11,7 @@ import pandas as pd
|
|||
|
||||
DATA_DIR = Path(os.environ['HOME']) / 'data' / 'tinysearch'
|
||||
ALL_DOMAINS_PATH = DATA_DIR / 'hn-top-domains.csv'
|
||||
TOP_DOMAINS_PATH = 'hn-top-domains-filtered.py'
|
||||
TOP_DOMAINS_PATH = '../tinysearchengine/hn_top_domains_filtered.py'
|
||||
|
||||
MIN_COUNT = 10
|
||||
PROBABILITY_THRESHOLD = 0.8
|
||||
|
|
|
@ -11,10 +11,6 @@ from pyspark.sql.types import StructType, StructField, StringType, LongType, Int
|
|||
RECORDS_PATH = 's3://tinysearch/outputs/records'
|
||||
OUTPUT_PATH = 's3://tinysearch/outputs/index'
|
||||
|
||||
NUM_PAGES = 1024
|
||||
MAX_RESULTS_PER_HASH = 200
|
||||
PAGE_SIZE = 4096
|
||||
|
||||
|
||||
index_schema = StructType([
|
||||
StructField("term_hash", LongType(), False),
|
||||
|
|
|
@ -2,11 +2,13 @@ import re
|
|||
from logging import getLogger
|
||||
from operator import itemgetter
|
||||
from pathlib import Path
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from fastapi import FastAPI
|
||||
from starlette.responses import FileResponse
|
||||
from starlette.staticfiles import StaticFiles
|
||||
|
||||
from mwmbl.tinysearchengine.hn_top_domains_filtered import DOMAINS
|
||||
from mwmbl.tinysearchengine.indexer import TinyIndex, Document
|
||||
|
||||
logger = getLogger(__name__)
|
||||
|
@ -49,6 +51,9 @@ def create(tiny_index: TinyIndex):
|
|||
return pattern
|
||||
|
||||
def score_result(terms, result: Document):
|
||||
domain = urlparse(result.url).netloc
|
||||
domain_score = DOMAINS.get(domain, 0.0)
|
||||
|
||||
result_string = f"{result.title.strip()} {result.extract.strip()}"
|
||||
query_regex = get_query_regex(terms)
|
||||
matches = list(re.finditer(query_regex, result_string, flags=re.IGNORECASE))
|
||||
|
@ -64,7 +69,7 @@ def create(tiny_index: TinyIndex):
|
|||
seen_matches.add(value)
|
||||
|
||||
total_possible_match_length = sum(len(x) for x in terms)
|
||||
score = (match_length + 1./last_match_char) / (total_possible_match_length + 1)
|
||||
score = 0.1*domain_score + 0.9*(match_length + 1./last_match_char) / (total_possible_match_length + 1)
|
||||
return score
|
||||
|
||||
def order_results(terms: list[str], results: list[Document]):
|
||||
|
|
Loading…
Add table
Reference in a new issue