Exclude web.archive.org as we're only crawling that right now

This commit is contained in:
Daoud Clarke 2022-08-13 10:52:31 +01:00
parent c6773b46c4
commit fe5eff7b64

View file

@ -23,6 +23,9 @@ from mwmbl.tinysearchengine.indexer import Document, TinyIndex
logger = getLogger(__name__)
# Hosts whose links are skipped (not scored) when recording URLs found on crawled pages.
EXCLUDED_DOMAINS = {'web.archive.org'}
def get_documents_from_batches(batches: Iterable[HashedBatch]) -> Iterable[tuple[str, str, str]]:
for batch in batches:
for item in batch.items:
@ -123,6 +126,9 @@ def record_urls_in_database(batches: Iterable[HashedBatch]):
crawled_page_domain = urlparse(item.url).netloc
score_multiplier = 1 if crawled_page_domain in DOMAINS else UNKNOWN_DOMAIN_MULTIPLIER
for link in item.content.links:
    # Parse before filtering: the exclusion check needs this link's host.
    parsed_link = urlparse(link)
    if parsed_link.netloc in EXCLUDED_DOMAINS:
        # Links to excluded hosts contribute nothing to url_scores.
        continue
    # Links staying on the crawled page's own host get the same-domain score;
    # everything else gets the different-domain score.
    score = SCORE_FOR_SAME_DOMAIN if parsed_link.netloc == crawled_page_domain else SCORE_FOR_DIFFERENT_DOMAIN
    url_scores[link] += score * score_multiplier