
Merge pull request #114 from mwmbl/exclude-domains-by-keyword

Exclude domains by keyword
Daoud Clarke 1 year ago
parent
commit
d716cb347f
3 changed files with 55 additions and 8 deletions
  1. + 10 - 7   mwmbl/indexer/update_urls.py
  2. + 2 - 1    mwmbl/settings.py
  3. + 43 - 0   test/test_update_urls.py
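
The change does two things: process_link now receives the caller's user_id_hash directly instead of the whole batch object, and a link is dropped when its domain matches a keyword regex. A minimal sketch of the new exclusion check (the pattern is the DOMAIN_BLACKLIST_REGEX added to mwmbl/settings.py below; is_excluded is a hypothetical helper for illustration, not part of the PR):

    import re
    from urllib.parse import urlparse

    # Pattern copied from the DOMAIN_BLACKLIST_REGEX added in this PR.
    DOMAIN_BLACKLIST_REGEX = re.compile(r"porn|xxx")

    def is_excluded(link: str) -> bool:
        # A link is excluded when the keyword regex matches anywhere in its domain.
        netloc = urlparse(link).netloc
        return DOMAIN_BLACKLIST_REGEX.search(netloc) is not None

    print(is_excluded("https://somepornsite.com/something.html"))  # True
    print(is_excluded("https://somesite.com/something.html"))      # False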

+ 10 - 7
mwmbl/indexer/update_urls.py

@@ -1,5 +1,6 @@
 import os
 import pickle
+import re
 from collections import defaultdict
 from datetime import datetime, timezone, timedelta
 from logging import getLogger
@@ -21,7 +22,8 @@ from mwmbl.indexer.index_batches import get_url_error_status
 from mwmbl.indexer.indexdb import BatchStatus
 from mwmbl.indexer.paths import BATCH_DIR_NAME
 from mwmbl.settings import UNKNOWN_DOMAIN_MULTIPLIER, EXCLUDED_DOMAINS, SCORE_FOR_SAME_DOMAIN, \
-    SCORE_FOR_DIFFERENT_DOMAIN, SCORE_FOR_ROOT_PATH, EXTRA_LINK_MULTIPLIER, BLACKLIST_DOMAINS_URL
+    SCORE_FOR_DIFFERENT_DOMAIN, SCORE_FOR_ROOT_PATH, EXTRA_LINK_MULTIPLIER, BLACKLIST_DOMAINS_URL, \
+    DOMAIN_BLACKLIST_REGEX
 from mwmbl.utils import get_domain


 logger = getLogger(__name__)
@@ -69,12 +71,12 @@ def record_urls_in_database(batches: Collection[HashedBatch], new_item_queue: Qu
                         continue
                     score_multiplier = 1 if crawled_page_domain in DOMAINS else UNKNOWN_DOMAIN_MULTIPLIER
                     for link in item.content.links:
-                        process_link(batch, crawled_page_domain, link, score_multiplier, timestamp, url_scores,
+                        process_link(batch.user_id_hash, crawled_page_domain, link, score_multiplier, timestamp, url_scores,
                                      url_timestamps, url_users, False, blacklist_domains)

                     if item.content.extra_links:
                         for link in item.content.extra_links:
-                            process_link(batch, crawled_page_domain, link, score_multiplier, timestamp, url_scores,
+                            process_link(batch.user_id_hash, crawled_page_domain, link, score_multiplier, timestamp, url_scores,
                                          url_timestamps, url_users, True, blacklist_domains)

         found_urls = [FoundURL(url, url_users[url], url_scores[url], url_statuses[url], url_timestamps[url])
@@ -92,20 +94,21 @@ def get_blacklist_domains():
         return set(response.text.split())


-def process_link(batch, crawled_page_domain, link, unknown_domain_multiplier, timestamp, url_scores, url_timestamps, url_users, is_extra: bool, blacklist_domains):
+def process_link(user_id_hash, crawled_page_domain, link, unknown_domain_multiplier, timestamp, url_scores, url_timestamps, url_users, is_extra: bool, blacklist_domains):
     parsed_link = urlparse(link)
-    if parsed_link.netloc in EXCLUDED_DOMAINS or parsed_link.netloc in blacklist_domains:
+    if parsed_link.netloc in EXCLUDED_DOMAINS or DOMAIN_BLACKLIST_REGEX.search(parsed_link.netloc) is not None \
+            or parsed_link.netloc in blacklist_domains:
         logger.info(f"Excluding link for blacklisted domain: {parsed_link}")
         return

     extra_multiplier = EXTRA_LINK_MULTIPLIER if is_extra else 1.0
     score = SCORE_FOR_SAME_DOMAIN if parsed_link.netloc == crawled_page_domain else SCORE_FOR_DIFFERENT_DOMAIN
     url_scores[link] += score * unknown_domain_multiplier * extra_multiplier
-    url_users[link] = batch.user_id_hash
+    url_users[link] = user_id_hash
     url_timestamps[link] = timestamp
     domain = f'{parsed_link.scheme}://{parsed_link.netloc}/'
     url_scores[domain] += SCORE_FOR_ROOT_PATH * unknown_domain_multiplier
-    url_users[domain] = batch.user_id_hash
+    url_users[domain] = user_id_hash
     url_timestamps[domain] = timestamp


+ 2 - 1
mwmbl/settings.py

@@ -31,7 +31,8 @@ SCORE_FOR_DIFFERENT_DOMAIN = 1.0
 SCORE_FOR_SAME_DOMAIN = 0.01
 EXTRA_LINK_MULTIPLIER = 0.001
 UNKNOWN_DOMAIN_MULTIPLIER = 0.001
-EXCLUDED_DOMAINS = {'web.archive.org', 'forums.giantitp.com', 'www.crutchfield.com', 'plus.google.com', 'changeporn.com'}
+EXCLUDED_DOMAINS = {'web.archive.org', 'forums.giantitp.com', 'www.crutchfield.com', 'plus.google.com'}
+DOMAIN_BLACKLIST_REGEX = re.compile(r"porn|xxx")
 CORE_DOMAINS = {
     'github.com',
     'en.wikipedia.org',
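
Note that this hunk adds a re.compile call without adding an import, so re is presumably already imported at the top of settings.py. A knock-on effect visible in the diff: 'changeporn.com' can leave EXCLUDED_DOMAINS because the new regex already covers it, e.g.:

    import re

    DOMAIN_BLACKLIST_REGEX = re.compile(r"porn|xxx")

    # 'changeporn.com' is caught by the keyword regex, so its explicit
    # EXCLUDED_DOMAINS entry became redundant; other entries are unaffected.
    assert DOMAIN_BLACKLIST_REGEX.search("changeporn.com") is not None
    assert DOMAIN_BLACKLIST_REGEX.search("web.archive.org") is None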

+ 43 - 0
test/test_update_urls.py

@@ -0,0 +1,43 @@
+from mwmbl.indexer.update_urls import process_link
+
+
+def test_process_link_normal():
+    url_scores = {"https://somesite.com/something.html": 0.0, "https://somesite.com/": 0.0}
+    url_timestamps = {}
+    url_users = {}
+
+    process_link(
+        user_id_hash="abc123",
+        crawled_page_domain="somewhere.com",
+        link="https://somesite.com/something.html",
+        unknown_domain_multiplier=1,
+        timestamp=1234,
+        url_scores=url_scores,
+        url_timestamps=url_timestamps,
+        url_users=url_users,
+        is_extra=False, blacklist_domains=[]
+    )
+
+    assert url_scores["https://somesite.com/something.html"] > 0.0
+
+
+def test_process_link_excludes_porn():
+    url_scores = {}
+    url_timestamps = {}
+    url_users = {}
+
+    process_link(
+        user_id_hash="abc123",
+        crawled_page_domain="somewhere.com",
+        link="https://somepornsite.com/something.html",
+        unknown_domain_multiplier=1,
+        timestamp=1234,
+        url_scores=url_scores,
+        url_timestamps=url_timestamps,
+        url_users=url_users,
+        is_extra=False, blacklist_domains=[]
+    )
+
+    assert url_scores == {}
+    assert url_timestamps == {}
+    assert url_users == {}
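
For the first test, the expected score follows from the constants in the settings.py hunk above: somesite.com differs from the crawled domain somewhere.com, so SCORE_FOR_DIFFERENT_DOMAIN applies (SCORE_FOR_ROOT_PATH is not shown in this diff, so the root-path score is left out). A worked sketch:

    # Value taken from the context lines of the settings.py hunk in this PR.
    SCORE_FOR_DIFFERENT_DOMAIN = 1.0

    unknown_domain_multiplier = 1    # passed explicitly by the test
    extra_multiplier = 1.0           # is_extra=False
    score = SCORE_FOR_DIFFERENT_DOMAIN * unknown_domain_multiplier * extra_multiplier
    assert score == 1.0  # hence url_scores["https://somesite.com/something.html"] > 0.0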