3 роки тому · e6655101ef
--- a/config/tinysearchengine.yaml
+++ b/config/tinysearchengine.yaml
@@ -8,5 +8,5 @@ server_config:
 
				 
			
 
				 index_config:
			
 
				   index_path: data/index.tinysearch
			
 
				-  num_pages: 25600
			
 
				-  page_size: 4096
			
 
				+  num_pages: 256000
			
 
				+  page_size: 40960
			
--- a/mwmbl/indexer/domains.py
+++ b/mwmbl/indexer/domains.py
@@ -11,7 +11,7 @@ import pandas as pd
 
				 
			
 
				 DATA_DIR = Path(os.environ['HOME']) / 'data' / 'tinysearch'
			
 
				 ALL_DOMAINS_PATH = DATA_DIR / 'hn-top-domains.csv'
			
 
				-TOP_DOMAINS_PATH = 'hn-top-domains-filtered.py'
			
 
				+TOP_DOMAINS_PATH = '../tinysearchengine/hn_top_domains_filtered.py'
			
 
				 
			
 
				 MIN_COUNT = 10
			
 
				 PROBABILITY_THRESHOLD = 0.8
			
--- a/mwmbl/indexer/extract.py
+++ b/mwmbl/indexer/extract.py
@@ -11,10 +11,6 @@ from pyspark.sql.types import StructType, StructField, StringType, LongType, Int
 
				 RECORDS_PATH = 's3://tinysearch/outputs/records'
			
 
				 OUTPUT_PATH = 's3://tinysearch/outputs/index'
			
 
				 
			
 
				-NUM_PAGES = 1024
			
 
				-MAX_RESULTS_PER_HASH = 200
			
 
				-PAGE_SIZE = 4096
			
 
				-
			
 
				 
			
 
				 index_schema = StructType([
			
 
				     StructField("term_hash", LongType(), False),
			
--- a/mwmbl/tinysearchengine/create_app.py
+++ b/mwmbl/tinysearchengine/create_app.py
@@ -2,11 +2,13 @@ import re
 
				 from logging import getLogger
			
 
				 from operator import itemgetter
			
 
				 from pathlib import Path
			
 
				+from urllib.parse import urlparse
			
 
				 
			
 
				 from fastapi import FastAPI
			
 
				 from starlette.responses import FileResponse
			
 
				 from starlette.staticfiles import StaticFiles
			
 
				 
			
 
				+from mwmbl.tinysearchengine.hn_top_domains_filtered import DOMAINS
			
 
				 from mwmbl.tinysearchengine.indexer import TinyIndex, Document
			
 
				 
			
 
				 logger = getLogger(__name__)
			
@@ -49,6 +51,9 @@ def create(tiny_index: TinyIndex):
 
				         return pattern
			
 
				 
			
 
				     def score_result(terms, result: Document):
			
 
				+        domain = urlparse(result.url).netloc
			
 
				+        domain_score = DOMAINS.get(domain, 0.0)
			
 
				+
			
 
				         result_string = f"{result.title.strip()} {result.extract.strip()}"
			
 
				         query_regex = get_query_regex(terms)
			
 
				         matches = list(re.finditer(query_regex, result_string, flags=re.IGNORECASE))
			
@@ -64,7 +69,7 @@ def create(tiny_index: TinyIndex):
 
				                 seen_matches.add(value)
			
 
				 
			
 
				         total_possible_match_length = sum(len(x) for x in terms)
			
 
				-        score = (match_length + 1./last_match_char) / (total_possible_match_length + 1)
			
 
				+        score = 0.1*domain_score + 0.9*(match_length + 1./last_match_char) / (total_possible_match_length + 1)
			
 
				         return score
			
 
				 
			
 
				     def order_results(terms: list[str], results: list[Document]):
			
--- a/mwmbl/tinysearchengine/hn_top_domains_filtered.py
+++ b/mwmbl/tinysearchengine/hn_top_domains_filtered.py