Merge pull request #56 from mwmbl/allow-links-from-unknown-domains

Allow crawling links from unknown domains
Commit 6fa192daa4 by Daoud Clarke, 2022-07-02 13:32:39 +01:00, committed by GitHub

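For context, the diff below changes how links are scored when the crawled page's domain is not in the curated DOMAINS set: previously such pages were skipped entirely, now their links are kept but their contribution is damped by UNKNOWN_DOMAIN_MULTIPLIER. A minimal standalone sketch of the new rule (illustrative only, not code from the repository; score_link and the example DOMAINS contents are assumptions):

    from urllib.parse import urlparse

    # Constants mirroring the diff below
    SCORE_FOR_ROOT_PATH = 0.1
    SCORE_FOR_DIFFERENT_DOMAIN = 1.0
    SCORE_FOR_SAME_DOMAIN = 0.01
    UNKNOWN_DOMAIN_MULTIPLIER = 0.001

    DOMAINS = {"news.ycombinator.com", "github.com"}  # stand-in for the curated set


    def score_link(crawled_page_url: str, link: str) -> float:
        """Illustrative scoring of a single outgoing link under the new rule."""
        crawled_page_domain = urlparse(crawled_page_url).netloc
        # Pages on unknown domains are no longer skipped; their links just count far less.
        multiplier = 1 if crawled_page_domain in DOMAINS else UNKNOWN_DOMAIN_MULTIPLIER
        same_domain = urlparse(link).netloc == crawled_page_domain
        score = SCORE_FOR_SAME_DOMAIN if same_domain else SCORE_FOR_DIFFERENT_DOMAIN
        return score * multiplier


    # A cross-domain link found on an unknown domain now contributes 0.001 instead of nothing:
    print(score_link("https://example.org/post", "https://github.com/mwmbl/mwmbl"))  # 0.001
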
@@ -17,7 +17,7 @@ from mwmbl.crawler.batch import Batch, NewBatchRequest, HashedBatch
 from mwmbl.crawler.urls import URLDatabase, FoundURL, URLStatus
 from mwmbl.database import Database
 from mwmbl.hn_top_domains_filtered import DOMAINS
-from mwmbl.indexer.indexdb import IndexDatabase
+from mwmbl.indexer.indexdb import IndexDatabase, BatchInfo, BatchStatus
 from mwmbl.tinysearchengine.indexer import Document
 
 APPLICATION_KEY = os.environ['MWMBL_APPLICATION_KEY']
@@ -35,6 +35,7 @@ FILE_NAME_SUFFIX = '.json.gz'
 SCORE_FOR_ROOT_PATH = 0.1
 SCORE_FOR_DIFFERENT_DOMAIN = 1.0
 SCORE_FOR_SAME_DOMAIN = 0.01
+UNKNOWN_DOMAIN_MULTIPLIER = 0.001
 
 
 router = APIRouter(prefix="/crawler", tags=["crawler"])
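With these constants, the effective contribution of a discovered link depends on whether the crawling page's domain is in DOMAINS. Roughly (illustrative arithmetic, not repository code):

    # Effective link contributions under the new multiplier (applied in the scoring hunk further down):
    # known-domain page,   cross-domain link: 1.0  * 1     = 1.0
    # known-domain page,   same-domain link:  0.01 * 1     = 0.01
    # unknown-domain page, cross-domain link: 1.0  * 0.001 = 0.001
    # unknown-domain page, same-domain link:  0.01 * 0.001 = 0.00001
    # the root-path bonus is scaled the same way: 0.1 * 0.001 = 0.0001
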
@@ -100,10 +101,18 @@ def create_batch(batch: Batch):
     global last_batch
     last_batch = hashed_batch
 
+    # Record the batch as being local so that we don't retrieve it again when the server restarts
+    batch_url = f'{PUBLIC_URL_PREFIX}{filename}'
+    infos = [BatchInfo(batch_url, user_id_hash, BatchStatus.LOCAL)]
+
+    with Database() as db:
+        index_db = IndexDatabase(db.connection)
+        index_db.record_batches(infos)
+
     return {
         'status': 'ok',
         'public_user_id': user_id_hash,
-        'url': f'{PUBLIC_URL_PREFIX}{filename}',
+        'url': batch_url,
     }
 
 
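The block added above records each locally created batch in the index database with BatchStatus.LOCAL so that, per the inline comment, the server does not re-fetch its own batches after a restart. The retrieval side is not part of this diff; the following is only a self-contained sketch of the idea, with hypothetical names (batches_to_fetch, the REMOTE member and the enum values are assumptions):

    from enum import Enum


    class BatchStatus(Enum):
        # Mirrors the imported enum in spirit; members and values here are assumptions.
        REMOTE = 0
        LOCAL = 1


    def batches_to_fetch(recorded: dict[str, BatchStatus], advertised_urls: list[str]) -> list[str]:
        """Return only batch URLs that are not already stored locally (illustrative only)."""
        return [url for url in advertised_urls
                if recorded.get(url) is not BatchStatus.LOCAL]


    recorded = {"https://example.com/batches/batch1.json.gz": BatchStatus.LOCAL}
    print(batches_to_fetch(recorded, list(recorded) + ["https://example.com/batches/batch2.json.gz"]))
    # only batch2 would be fetched after a restart
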
@@ -142,15 +151,13 @@ def record_urls_in_database(batch: Union[Batch, HashedBatch], user_id_hash: str,
         for item in batch.items:
             if item.content is not None:
                 crawled_page_domain = urlparse(item.url).netloc
-                if crawled_page_domain not in DOMAINS:
-                    continue
-
+                score_multiplier = 1 if crawled_page_domain in DOMAINS else UNKNOWN_DOMAIN_MULTIPLIER
                 for link in item.content.links:
                     parsed_link = urlparse(link)
                     score = SCORE_FOR_SAME_DOMAIN if parsed_link.netloc == crawled_page_domain else SCORE_FOR_DIFFERENT_DOMAIN
-                    url_scores[link] += score
+                    url_scores[link] += score * score_multiplier
                     domain = f'{parsed_link.scheme}://{parsed_link.netloc}/'
-                    url_scores[domain] += SCORE_FOR_ROOT_PATH
+                    url_scores[domain] += SCORE_FOR_ROOT_PATH * score_multiplier
 
         found_urls = [FoundURL(url, user_id_hash, score, URLStatus.NEW, timestamp) for url, score in url_scores.items()]
         if len(found_urls) > 0:
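Because url_scores accumulates with +=, the same target URL can gather score from several crawled pages, and the multiplier only discounts contributions made while crawling unknown-domain pages. A small illustration (assuming url_scores is a defaultdict(float) or similar, which the += usage suggests):

    from collections import defaultdict

    url_scores = defaultdict(float)
    link = "https://github.com/mwmbl/mwmbl"

    # Contribution from a page on a known domain (multiplier 1):
    url_scores[link] += 1.0 * 1
    # Contribution from a page on an unknown domain (multiplier 0.001):
    url_scores[link] += 1.0 * 0.001

    print(url_scores[link])  # 1.001: the unknown-domain page adds a little signal instead of none
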
@@ -160,9 +167,6 @@ def record_urls_in_database(batch: Union[Batch, HashedBatch], user_id_hash: str,
                         for item in batch.items]
         url_db.update_found_urls(crawled_urls)
 
-    # TODO:
-    # - delete existing crawl data for change from INT to FLOAT
-
 
 def get_batches_for_date(date_str):
     check_date_str(date_str)