Allow posting extra links with lower score weighting

Daoud Clarke, 2 years ago
parent
commit 2336ed7f7d
5 changed files with 26 additions and 14 deletions
  1. devdata/index-v2.tinysearch (BIN)
  2. mwmbl/crawler/app.py (+1 -1)
  3. mwmbl/crawler/batch.py (+1 -0)
  4. mwmbl/indexer/update_urls.py (+23 -13)
  5. mwmbl/settings.py (+1 -0)

BIN
devdata/index-v2.tinysearch


+ 1 - 1
mwmbl/crawler/app.py

@@ -49,7 +49,7 @@ def get_router(batch_cache: BatchCache, url_queue: Queue):
     router = APIRouter(prefix="/crawler", tags=["crawler"])
 
     @router.post('/batches/')
-    def create_batch(batch: Batch):
+    def post_batch(batch: Batch):
         if len(batch.items) > MAX_BATCH_SIZE:
             raise HTTPException(400, f"Batch size too large (maximum {MAX_BATCH_SIZE}), got {len(batch.items)}")
 
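For context, a client exercises this handler by POSTing a batch to /crawler/batches/ (the router is mounted under the /crawler prefix, as shown above). A minimal sketch, assuming the service runs locally on port 5000 and that Batch carries a user_id plus a list of items; field names other than the content fields confirmed by this commit are assumptions:

import requests

# Hypothetical payload: only the `content` fields shown in the batch.py
# diff below are confirmed; the surrounding field names are assumptions.
batch = {
    "user_id": "some-crawler-user-id",
    "items": [
        {
            "url": "https://example.com/page",
            "content": {
                "title": "Example page",
                "extract": "First paragraph of the page...",
                "links": ["https://example.com/about"],
                "extra_links": ["https://other.example.org/"],  # new field
            },
        }
    ],
}

response = requests.post("http://localhost:5000/crawler/batches/", json=batch)
response.raise_for_status()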

+ 1 - 0
mwmbl/crawler/batch.py

@@ -7,6 +7,7 @@ class ItemContent(BaseModel):
     title: str
     extract: str
     links: list[str]
+    extra_links: Optional[list[str]]
 
 
 class ItemError(BaseModel):
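
The new field is optional so that existing crawler clients remain compatible. A minimal sketch of the validation behaviour, assuming Pydantic v1 semantics (where an Optional field with no explicit default is implicitly None) and that batch.py already imports Optional from typing above this hunk:

from typing import Optional

from pydantic import BaseModel


class ItemContent(BaseModel):
    title: str
    extract: str
    links: list[str]
    extra_links: Optional[list[str]]  # implicitly defaults to None in Pydantic v1


# An old-style payload without extra_links still validates...
old = ItemContent(title="t", extract="e", links=["https://example.com/"])
assert old.extra_links is None

# ...and new clients can supply the lower-weighted links explicitly.
new = ItemContent(
    title="t",
    extract="e",
    links=["https://example.com/"],
    extra_links=["https://other.example.org/"],
)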

+ 23 - 13
mwmbl/indexer/update_urls.py

@@ -13,8 +13,7 @@ from mwmbl.indexer.batch_cache import BatchCache
 from mwmbl.indexer.index_batches import get_url_error_status
 from mwmbl.indexer.indexdb import BatchStatus
 from mwmbl.settings import UNKNOWN_DOMAIN_MULTIPLIER, EXCLUDED_DOMAINS, SCORE_FOR_SAME_DOMAIN, \
-    SCORE_FOR_DIFFERENT_DOMAIN, SCORE_FOR_ROOT_PATH
-
+    SCORE_FOR_DIFFERENT_DOMAIN, SCORE_FOR_ROOT_PATH, EXTRA_LINK_MULTIPLIER
 
 logger = getLogger(__name__)
 
@@ -43,18 +42,13 @@ def record_urls_in_database(batches: Collection[HashedBatch]):
                     crawled_page_domain = urlparse(item.url).netloc
                     score_multiplier = 1 if crawled_page_domain in DOMAINS else UNKNOWN_DOMAIN_MULTIPLIER
                     for link in item.content.links:
-                        parsed_link = urlparse(link)
-                        if parsed_link.netloc in EXCLUDED_DOMAINS:
-                            continue
+                        process_link(batch, crawled_page_domain, link, score_multiplier, timestamp, url_scores,
+                                     url_timestamps, url_users, False)
 
-                        score = SCORE_FOR_SAME_DOMAIN if parsed_link.netloc == crawled_page_domain else SCORE_FOR_DIFFERENT_DOMAIN
-                        url_scores[link] += score * score_multiplier
-                        url_users[link] = batch.user_id_hash
-                        url_timestamps[link] = timestamp
-                        domain = f'{parsed_link.scheme}://{parsed_link.netloc}/'
-                        url_scores[domain] += SCORE_FOR_ROOT_PATH * score_multiplier
-                        url_users[domain] = batch.user_id_hash
-                        url_timestamps[domain] = timestamp
+                    if item.content.extra_links:
+                        for link in item.content.extra_links:
+                            process_link(batch, crawled_page_domain, link, score_multiplier, timestamp, url_scores,
+                                         url_timestamps, url_users, True)
 
         found_urls = [FoundURL(url, url_users[url], url_scores[url], url_statuses[url], url_timestamps[url])
                       for url in url_scores.keys() | url_statuses.keys()]
@@ -62,6 +56,22 @@ def record_urls_in_database(batches: Collection[HashedBatch]):
         url_db.update_found_urls(found_urls)
 
 
+def process_link(batch, crawled_page_domain, link, unknown_domain_multiplier, timestamp, url_scores, url_timestamps, url_users, is_extra: bool):
+    parsed_link = urlparse(link)
+    if parsed_link.netloc in EXCLUDED_DOMAINS:
+        return
+
+    extra_multiplier = EXTRA_LINK_MULTIPLIER if is_extra else 1.0
+    score = SCORE_FOR_SAME_DOMAIN if parsed_link.netloc == crawled_page_domain else SCORE_FOR_DIFFERENT_DOMAIN
+    url_scores[link] += score * unknown_domain_multiplier * extra_multiplier
+    url_users[link] = batch.user_id_hash
+    url_timestamps[link] = timestamp
+    domain = f'{parsed_link.scheme}://{parsed_link.netloc}/'
+    url_scores[domain] += SCORE_FOR_ROOT_PATH * unknown_domain_multiplier
+    url_users[domain] = batch.user_id_hash
+    url_timestamps[domain] = timestamp
+
+
 def get_datetime_from_timestamp(timestamp: float) -> datetime:
     batch_datetime = datetime(1970, 1, 1, tzinfo=timezone.utc) + timedelta(seconds=timestamp)
     return batch_datetime
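
Numerically, the refactor means an ordinary link and an extra link differ only by EXTRA_LINK_MULTIPLIER on the per-link score; the root-path bonus (SCORE_FOR_ROOT_PATH) is scaled by the domain multiplier in both cases but not by the extra-link multiplier. A standalone sketch of the accumulation, inlining the constants from mwmbl/settings.py:

from collections import defaultdict
from urllib.parse import urlparse

SCORE_FOR_SAME_DOMAIN = 0.01
SCORE_FOR_DIFFERENT_DOMAIN = 1.0
SCORE_FOR_ROOT_PATH = 0.1
EXTRA_LINK_MULTIPLIER = 0.001

url_scores = defaultdict(float)

def score_link(link, crawled_page_domain, domain_multiplier, is_extra):
    parsed = urlparse(link)
    extra = EXTRA_LINK_MULTIPLIER if is_extra else 1.0
    same = parsed.netloc == crawled_page_domain
    score = SCORE_FOR_SAME_DOMAIN if same else SCORE_FOR_DIFFERENT_DOMAIN
    url_scores[link] += score * domain_multiplier * extra
    # Root-path bonus: scaled by the domain multiplier only.
    root = f"{parsed.scheme}://{parsed.netloc}/"
    url_scores[root] += SCORE_FOR_ROOT_PATH * domain_multiplier

# A cross-domain link from a known domain scores 1.0; posted as an
# extra link, the same URL scores 1.0 * 0.001 = 0.001.
score_link("https://other.example.org/page", "example.com", 1, False)
score_link("https://other.example.org/page", "example.com", 1, True)
print(url_scores["https://other.example.org/page"])  # 1.001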

+ 1 - 0
mwmbl/settings.py

@@ -26,5 +26,6 @@ FILE_NAME_SUFFIX = '.json.gz'
 SCORE_FOR_ROOT_PATH = 0.1
 SCORE_FOR_DIFFERENT_DOMAIN = 1.0
 SCORE_FOR_SAME_DOMAIN = 0.01
+EXTRA_LINK_MULTIPLIER = 0.001
 UNKNOWN_DOMAIN_MULTIPLIER = 0.001
 EXCLUDED_DOMAINS = {'web.archive.org', 'forums.giantitp.com', 'www.crutchfield.com'}
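
With these values the multipliers compound: a cross-domain extra link found on a known domain contributes SCORE_FOR_DIFFERENT_DOMAIN * EXTRA_LINK_MULTIPLIER = 1.0 * 0.001 = 0.001, while the same extra link found on an unknown domain contributes 1.0 * 0.001 * 0.001 = 1e-06, one millionth of an ordinary cross-domain link from a known domain.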