From 41061a695bb7e0057ba34dfd5e9a7850945fc3d5 Mon Sep 17 00:00:00 2001
From: Daoud Clarke
Date: Wed, 4 Oct 2023 20:19:42 +0100
Subject: [PATCH] Add tests

---
 mwmbl/indexer/update_urls.py | 10 ++++-----
 test/test_update_urls.py     | 43 ++++++++++++++++++++++++++++++++++++
 2 files changed, 48 insertions(+), 5 deletions(-)
 create mode 100644 test/test_update_urls.py

diff --git a/mwmbl/indexer/update_urls.py b/mwmbl/indexer/update_urls.py
index 7f20a00..3819777 100644
--- a/mwmbl/indexer/update_urls.py
+++ b/mwmbl/indexer/update_urls.py
@@ -71,12 +71,12 @@ def record_urls_in_database(batches: Collection[HashedBatch], new_item_queue: Qu
                 continue
             score_multiplier = 1 if crawled_page_domain in DOMAINS else UNKNOWN_DOMAIN_MULTIPLIER
             for link in item.content.links:
-                process_link(batch, crawled_page_domain, link, score_multiplier, timestamp, url_scores,
+                process_link(batch.user_id_hash, crawled_page_domain, link, score_multiplier, timestamp, url_scores,
                              url_timestamps, url_users, False, blacklist_domains)

             if item.content.extra_links:
                 for link in item.content.extra_links:
-                    process_link(batch, crawled_page_domain, link, score_multiplier, timestamp, url_scores,
+                    process_link(batch.user_id_hash, crawled_page_domain, link, score_multiplier, timestamp, url_scores,
                                  url_timestamps, url_users, True, blacklist_domains)

     found_urls = [FoundURL(url, url_users[url], url_scores[url], url_statuses[url], url_timestamps[url])
@@ -94,7 +94,7 @@ def get_blacklist_domains():
         return set(response.text.split())


-def process_link(batch, crawled_page_domain, link, unknown_domain_multiplier, timestamp, url_scores, url_timestamps, url_users, is_extra: bool, blacklist_domains):
+def process_link(user_id_hash, crawled_page_domain, link, unknown_domain_multiplier, timestamp, url_scores, url_timestamps, url_users, is_extra: bool, blacklist_domains):
     parsed_link = urlparse(link)
     if parsed_link.netloc in EXCLUDED_DOMAINS or DOMAIN_BLACKLIST_REGEX.search(parsed_link.netloc) is not None \
             or parsed_link.netloc in blacklist_domains:
@@ -104,11 +104,11 @@ def process_link(batch, crawled_page_domain, link, unknown_domain_multiplier, ti
     extra_multiplier = EXTRA_LINK_MULTIPLIER if is_extra else 1.0
     score = SCORE_FOR_SAME_DOMAIN if parsed_link.netloc == crawled_page_domain else SCORE_FOR_DIFFERENT_DOMAIN
     url_scores[link] += score * unknown_domain_multiplier * extra_multiplier
-    url_users[link] = batch.user_id_hash
+    url_users[link] = user_id_hash
     url_timestamps[link] = timestamp
     domain = f'{parsed_link.scheme}://{parsed_link.netloc}/'
     url_scores[domain] += SCORE_FOR_ROOT_PATH * unknown_domain_multiplier
-    url_users[domain] = batch.user_id_hash
+    url_users[domain] = user_id_hash
     url_timestamps[domain] = timestamp


diff --git a/test/test_update_urls.py b/test/test_update_urls.py
new file mode 100644
index 0000000..8f205f8
--- /dev/null
+++ b/test/test_update_urls.py
@@ -0,0 +1,43 @@
+from mwmbl.indexer.update_urls import process_link
+
+
+def test_process_link_normal():
+    url_scores = {"https://somesite.com/something.html": 0.0, "https://somesite.com/": 0.0}
+    url_timestamps = {}
+    url_users = {}
+
+    process_link(
+        user_id_hash="abc123",
+        crawled_page_domain="somewhere.com",
+        link="https://somesite.com/something.html",
+        unknown_domain_multiplier=1,
+        timestamp=1234,
+        url_scores=url_scores,
+        url_timestamps=url_timestamps,
+        url_users=url_users,
+        is_extra=False, blacklist_domains=[]
+    )
+
+    assert url_scores["https://somesite.com/something.html"] > 0.0
+
+
+def test_process_link_excludes_porn():
+    url_scores = {}
+    url_timestamps = {}
+    url_users = {}
+
+    process_link(
+        user_id_hash="abc123",
+        crawled_page_domain="somewhere.com",
+        link="https://somepornsite.com/something.html",
+        unknown_domain_multiplier=1,
+        timestamp=1234,
+        url_scores=url_scores,
+        url_timestamps=url_timestamps,
+        url_users=url_users,
+        is_extra=False, blacklist_domains=[]
+    )
+
+    assert url_scores == {}
+    assert url_timestamps == {}
+    assert url_users == {}
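
The point of threading user_id_hash through in place of the whole batch is that
process_link no longer needs a HashedBatch fixture to be exercised. A minimal
usage sketch under that assumption follows; the defaultdict setup and the
concrete values are illustrative, not taken from the patch:

    from collections import defaultdict

    from mwmbl.indexer.update_urls import process_link

    # Assumption: defaultdict(float) stands in for the production score map,
    # so unseen URLs start at 0.0 rather than being pre-seeded as in
    # test_process_link_normal above.
    url_scores = defaultdict(float)
    url_timestamps = {}
    url_users = {}

    process_link(
        user_id_hash="abc123",  # a plain string now; no batch object required
        crawled_page_domain="somewhere.com",
        link="https://somesite.com/something.html",
        unknown_domain_multiplier=1,
        timestamp=1234,
        url_scores=url_scores,
        url_timestamps=url_timestamps,
        url_users=url_users,
        is_extra=False,
        blacklist_domains=set(),  # get_blacklist_domains() returns a set
    )

    # Both the page URL and its domain root are scored and attributed
    # to the given user hash.
    assert url_scores["https://somesite.com/something.html"] > 0.0
    assert url_scores["https://somesite.com/"] > 0.0
    assert url_users["https://somesite.com/"] == "abc123"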