Browse Source

Add the HN domain score as a weighted component of the result ranking score

Daoud Clarke 3 years ago
parent
commit
e6655101ef

+ 2 - 2
config/tinysearchengine.yaml

@@ -8,5 +8,5 @@ server_config:
 
 index_config:
   index_path: data/index.tinysearch
-  num_pages: 25600
-  page_size: 4096
+  num_pages: 256000
+  page_size: 40960

+ 1 - 1
mwmbl/indexer/domains.py

@@ -11,7 +11,7 @@ import pandas as pd
 
 DATA_DIR = Path(os.environ['HOME']) / 'data' / 'tinysearch'
 ALL_DOMAINS_PATH = DATA_DIR / 'hn-top-domains.csv'
-TOP_DOMAINS_PATH = 'hn-top-domains-filtered.py'
+TOP_DOMAINS_PATH = '../tinysearchengine/hn_top_domains_filtered.py'
 
 MIN_COUNT = 10
 PROBABILITY_THRESHOLD = 0.8

+ 0 - 4
mwmbl/indexer/extract.py

@@ -11,10 +11,6 @@ from pyspark.sql.types import StructType, StructField, StringType, LongType, Int
 RECORDS_PATH = 's3://tinysearch/outputs/records'
 OUTPUT_PATH = 's3://tinysearch/outputs/index'
 
-NUM_PAGES = 1024
-MAX_RESULTS_PER_HASH = 200
-PAGE_SIZE = 4096
-
 
 index_schema = StructType([
     StructField("term_hash", LongType(), False),

+ 6 - 1
mwmbl/tinysearchengine/create_app.py

@@ -2,11 +2,13 @@ import re
 from logging import getLogger
 from operator import itemgetter
 from pathlib import Path
+from urllib.parse import urlparse
 
 from fastapi import FastAPI
 from starlette.responses import FileResponse
 from starlette.staticfiles import StaticFiles
 
+from mwmbl.tinysearchengine.hn_top_domains_filtered import DOMAINS
 from mwmbl.tinysearchengine.indexer import TinyIndex, Document
 
 logger = getLogger(__name__)
@@ -49,6 +51,9 @@ def create(tiny_index: TinyIndex):
         return pattern
 
     def score_result(terms, result: Document):
+        domain = urlparse(result.url).netloc
+        domain_score = DOMAINS.get(domain, 0.0)
+
         result_string = f"{result.title.strip()} {result.extract.strip()}"
         query_regex = get_query_regex(terms)
         matches = list(re.finditer(query_regex, result_string, flags=re.IGNORECASE))
@@ -64,7 +69,7 @@ def create(tiny_index: TinyIndex):
                 seen_matches.add(value)
 
         total_possible_match_length = sum(len(x) for x in terms)
-        score = (match_length + 1./last_match_char) / (total_possible_match_length + 1)
+        score = 0.1*domain_score + 0.9*(match_length + 1./last_match_char) / (total_possible_match_length + 1)
         return score
 
     def order_results(terms: list[str], results: list[Document]):

+ 0 - 0
mwmbl/indexer/hn-top-domains-filtered.py → mwmbl/tinysearchengine/hn_top_domains_filtered.py