Exclude web.archive.org as we're only crawling that right now
This commit is contained in:
parent
c6773b46c4
commit
fe5eff7b64
1 changed file with 6 additions and 0 deletions
|
@@ -23,6 +23,9 @@ from mwmbl.tinysearchengine.indexer import Document, TinyIndex
|
|||
logger = getLogger(__name__)
|
||||
|
||||
|
||||
# Hosts to leave out of link scoring: a link's parsed netloc is compared
# against this set and matching links are skipped.
EXCLUDED_DOMAINS = {'web.archive.org'}
|
||||
|
||||
|
||||
def get_documents_from_batches(batches: Iterable[HashedBatch]) -> Iterable[tuple[str, str, str]]:
|
||||
for batch in batches:
|
||||
for item in batch.items:
|
||||
|
@@ -123,6 +126,9 @@ def record_urls_in_database(batches: Iterable[HashedBatch]):
|
|||
crawled_page_domain = urlparse(item.url).netloc
|
||||
score_multiplier = 1 if crawled_page_domain in DOMAINS else UNKNOWN_DOMAIN_MULTIPLIER
|
||||
for link in item.content.links:
    # Parse the link before anything else: the exclusion check below reads
    # parsed_link.netloc, so it must run on the current link. Checking first
    # would hit an unbound name on the first iteration and test the previous
    # link's host on every later one.
    parsed_link = urlparse(link)

    # Links pointing at an excluded host are not scored at all.
    if parsed_link.netloc in EXCLUDED_DOMAINS:
        continue

    # Links that stay on the crawled page's own host get the same-domain
    # score; all other links get the different-domain score.
    score = SCORE_FOR_SAME_DOMAIN if parsed_link.netloc == crawled_page_domain else SCORE_FOR_DIFFERENT_DOMAIN
    url_scores[link] += score * score_multiplier
|
||||
|
|
Loading…
Add table
Reference in a new issue