Parcourir la source

Exclude web.archive.org as we're only crawling that right now

Daoud Clarke il y a 2 ans
Parent
commit
fe5eff7b64
1 fichier modifié avec 6 ajouts et 0 suppression
  1. 6 0
      mwmbl/indexer/index_batches.py

+ 6 - 0
mwmbl/indexer/index_batches.py

@@ -23,6 +23,9 @@ from mwmbl.tinysearchengine.indexer import Document, TinyIndex
 logger = getLogger(__name__)
 
 
+EXCLUDED_DOMAINS = {'web.archive.org'}
+
+
 def get_documents_from_batches(batches: Iterable[HashedBatch]) -> Iterable[tuple[str, str, str]]:
     for batch in batches:
         for item in batch.items:
@@ -123,6 +126,9 @@ def record_urls_in_database(batches: Iterable[HashedBatch]):
                     crawled_page_domain = urlparse(item.url).netloc
                     score_multiplier = 1 if crawled_page_domain in DOMAINS else UNKNOWN_DOMAIN_MULTIPLIER
                     for link in item.content.links:
                         parsed_link = urlparse(link)
+                        # Parse the link first so the exclusion check inspects this link's own host.
+                        if parsed_link.netloc in EXCLUDED_DOMAINS:
+                            continue
                         score = SCORE_FOR_SAME_DOMAIN if parsed_link.netloc == crawled_page_domain else SCORE_FOR_DIFFERENT_DOMAIN
                         url_scores[link] += score * score_multiplier