
Merge pull request #89 from mwmbl/update-urls-queue-quickly

Update urls queue quickly
Daoud Clarke 2 years ago
parent
commit
5616626fc1

+ 26 - 0
analyse/update_urls.py

@@ -0,0 +1,26 @@
+import os
+import pickle
+from datetime import datetime
+from pathlib import Path
+from queue import Queue
+
+from mwmbl.indexer.update_urls import record_urls_in_database
+
+
+def run_update_urls_on_fixed_batches():
+    with open(Path(os.environ["HOME"]) / "data" / "mwmbl" / "hashed-batches.pickle", "rb") as file:
+        batches = pickle.load(file)
+
+    # print("Batches", batches[:3])
+
+    queue = Queue()
+
+    start = datetime.now()
+    record_urls_in_database(batches, queue)
+    total_time = (datetime.now() - start).total_seconds()
+
+    print("Total time:", total_time)
+
+
+if __name__ == '__main__':
+    run_update_urls_on_fixed_batches()

+ 35 - 0
analyse/url_queue.py

@@ -0,0 +1,35 @@
+import logging
+import os
+import pickle
+import sys
+from datetime import datetime
+from pathlib import Path
+from queue import Queue
+
+from mwmbl.url_queue import URLQueue
+
+FORMAT = '%(levelname)s %(name)s %(asctime)s %(message)s'
+logging.basicConfig(stream=sys.stdout, level=logging.DEBUG, format=FORMAT)
+
+
+def run_url_queue():
+    data = pickle.load(open(Path(os.environ["HOME"]) / "data" / "mwmbl" / "found-urls.pickle", "rb"))
+    print("First URLs", [x.url for x in data[:1000]])
+
+    new_item_queue = Queue()
+    queued_batches = Queue()
+    queue = URLQueue(new_item_queue, queued_batches)
+
+    new_item_queue.put(data)
+
+    start = datetime.now()
+    queue.update()
+    total_time = (datetime.now() - start).total_seconds()
+    print(f"Total time: {total_time}")
+
+
+
+
+
+if __name__ == '__main__':
+    run_url_queue()

BIN
devdata/index-v2.tinysearch


+ 1 - 5
mwmbl/background.py

@@ -7,7 +7,7 @@ from time import sleep
 
 from mwmbl.crawler.urls import URLDatabase
 from mwmbl.database import Database
-from mwmbl.indexer import index_batches, historical, update_urls
+from mwmbl.indexer import index_batches, historical
 from mwmbl.indexer.batch_cache import BatchCache
 from mwmbl.indexer.paths import BATCH_DIR_NAME, INDEX_NAME
 
@@ -30,10 +30,6 @@ def run(data_path: str):
             batch_cache.retrieve_batches(num_batches=10000)
         except Exception:
             logger.exception("Error retrieving batches")
-        try:
-            update_urls.run(batch_cache)
-        except Exception:
-            logger.exception("Error updating URLs")
         try:
             index_batches.run(batch_cache, index_path)
         except Exception:

+ 4 - 4
mwmbl/crawler/app.py

@@ -2,8 +2,7 @@ import gzip
 import hashlib
 import json
 from datetime import datetime, timezone, date
-from multiprocessing import Queue
-from queue import Empty
+from queue import Queue, Empty
 from typing import Union
 from uuid import uuid4
 
@@ -28,6 +27,7 @@ from mwmbl.settings import (
     PUBLIC_USER_ID_LENGTH,
     FILE_NAME_SUFFIX,
     DATE_REGEX)
+from mwmbl.url_queue import URLQueue
 
 
 def get_bucket(name):
@@ -45,7 +45,7 @@ def upload(data: bytes, name: str):
 last_batch = None
 
 
-def get_router(batch_cache: BatchCache, url_queue: Queue):
+def get_router(batch_cache: BatchCache, queued_batches: Queue):
     router = APIRouter(prefix="/crawler", tags=["crawler"])
 
     @router.post('/batches/')
@@ -103,7 +103,7 @@ def get_router(batch_cache: BatchCache, url_queue: Queue):
     def request_new_batch(batch_request: NewBatchRequest) -> list[str]:
         user_id_hash = _get_user_id_hash(batch_request)
         try:
-            urls = url_queue.get(block=False)
+            urls = queued_batches.get(block=False)
         except Empty:
             return []
 

+ 12 - 79
mwmbl/crawler/urls.py

@@ -64,23 +64,14 @@ class URLDatabase:
         )
         """
 
-        index_sql = """
-        CREATE INDEX IF NOT EXISTS host_index
-            ON urls(substring(url FROM '.*://([^/]*)'), score)
-        """
-
-        view_sql = """
-        CREATE OR REPLACE VIEW url_and_hosts AS SELECT *, substring(url FROM '.*://([^/]*)') AS host FROM urls
-        """
-
         with self.connection.cursor() as cursor:
             cursor.execute(sql)
-            cursor.execute(index_sql)
-            cursor.execute(view_sql)
+            # cursor.execute(index_sql)
+            # cursor.execute(view_sql)
 
-    def update_found_urls(self, found_urls: list[FoundURL]):
+    def update_found_urls(self, found_urls: list[FoundURL]) -> list[FoundURL]:
         if len(found_urls) == 0:
-            return
+            return []
 
         get_urls_sql = """
           SELECT url FROM urls
@@ -104,6 +95,7 @@ class URLDatabase:
            updated = CASE
              WHEN urls.status > excluded.status THEN urls.updated ELSE excluded.updated
            END
+        RETURNING url, user_id_hash, score, status, updated
         """
 
         input_urls = [x.url for x in found_urls]
@@ -111,6 +103,7 @@ class URLDatabase:
 
         with self.connection as connection:
             with connection.cursor() as cursor:
+                logger.info(f"Input URLs: {len(input_urls)}")
                 cursor.execute(get_urls_sql, {'urls': tuple(input_urls)})
                 existing_urls = {x[0] for x in cursor.fetchall()}
                 new_urls = set(input_urls) - existing_urls
@@ -119,6 +112,7 @@ class URLDatabase:
                 locked_urls = {x[0] for x in cursor.fetchall()}
 
                 urls_to_insert = new_urls | locked_urls
+                logger.info(f"URLs to insert: {len(urls_to_insert)}")
 
                 if len(urls_to_insert) != len(input_urls):
                     print(f"Only got {len(urls_to_insert)} instead of {len(input_urls)} - {len(new_urls)} new")
@@ -128,72 +122,11 @@ class URLDatabase:
                     (found_url.url, found_url.status.value, found_url.user_id_hash, found_url.score, found_url.timestamp)
                     for found_url in sorted_urls if found_url.url in urls_to_insert]
 
-                execute_values(cursor, insert_sql, data)
-
-    def get_urls_for_crawling(self):
-        start = datetime.utcnow()
-        logger.info("Getting URLs for crawling")
-
-        work_mem = "SET work_mem = '512MB'"
-
-        select_sql = f"""
-            SELECT host, (array_agg(url order by score desc))[:{MAX_URLS_PER_TOP_DOMAIN}] FROM url_and_hosts
-            WHERE host IN %(domains)s
-                AND (status = {URLStatus.NEW.value} OR (
-                    status = {URLStatus.ASSIGNED.value} AND updated < %(min_updated_date)s
-                ))
-            GROUP BY host
-        """
-
-        others_sql = f"""
-            SELECT DISTINCT ON (host) url FROM (
-                SELECT * FROM url_and_hosts
-                WHERE status = {URLStatus.NEW.value} OR (
-                   status = {URLStatus.ASSIGNED.value} AND updated < %(min_updated_date)s
-                )
-                ORDER BY score DESC LIMIT {MAX_OTHER_DOMAINS}) u
-            ORDER BY host
-        """
-
-        update_sql = f"""
-            UPDATE urls SET status = {URLStatus.QUEUED.value}, updated = %(now)s
-            WHERE url IN %(urls)s
-        """
-
-        now = datetime.utcnow()
-        min_updated_date = now - timedelta(hours=REASSIGN_MIN_HOURS)
-        domain_sample = set(random.sample(DOMAINS.keys(), MAX_TOP_DOMAINS)) | CORE_DOMAINS
-        domains = tuple(domain_sample)
-        logger.info(f"Getting URLs for domains {domains}")
-        with self.connection.cursor() as cursor:
-            cursor.execute(work_mem)
-            cursor.execute(select_sql,
-                           {'min_updated_date': min_updated_date, 'domains': domains})
-            agg_results = cursor.fetchall()
-            logger.info(f"Agg results: {agg_results}")
-
-        results = []
-        for host, urls in agg_results:
-            results += urls
-
-        logger.info(f"Got {len(results)} top domain results")
-
-        with self.connection.cursor() as cursor:
-            cursor.execute(others_sql, {'min_updated_date': min_updated_date})
-            other_results = cursor.fetchall()
-            other_results_list = [result[0] for result in other_results]
-            logger.info(f"Got {len(other_results_list)} results from all domains")
-            results += other_results_list
-
-        with self.connection.cursor() as cursor:
-            cursor.execute(update_sql,
-                           {'now': now, 'urls': tuple(results)})
-
-        total_time_seconds = (datetime.now() - start).total_seconds()
-        logger.info(f"Got {len(results)} in {total_time_seconds} seconds")
-
-        random.shuffle(results)
-        return results
+                logger.info(f"Data: {len(data)}")
+                results = execute_values(cursor, insert_sql, data, fetch=True)
+                logger.info(f"Results: {len(results)}")
+                updated = [FoundURL(*result) for result in results]
+                return updated
 
     def get_urls(self, status: URLStatus, num_urls: int):
         sql = f"""

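For readers unfamiliar with the pattern used in the rewritten update_found_urls above: psycopg2's execute_values accepts a fetch=True flag that collects the rows produced by the RETURNING clause, which is how the method can now hand the inserted and updated URLs back to the caller without a second query. A minimal, self-contained sketch of that pattern; the connection string, table and columns are illustrative placeholders, not mwmbl's actual schema:

    # Minimal sketch of execute_values(..., fetch=True). The connection string,
    # table and columns below are placeholders, not mwmbl's real schema.
    import psycopg2
    from psycopg2.extras import execute_values

    insert_sql = """
        INSERT INTO urls (url, score) VALUES %s
        ON CONFLICT (url) DO UPDATE SET score = excluded.score
        RETURNING url, score
    """

    data = [("https://example.com/", 1.0), ("https://example.org/", 2.0)]

    with psycopg2.connect("dbname=example") as connection:
        with connection.cursor() as cursor:
            # With fetch=True, execute_values returns the rows emitted by RETURNING
            # (accumulated across pages), so no separate SELECT is needed afterwards.
            rows = execute_values(cursor, insert_sql, data, fetch=True)
            print(rows)  # e.g. [('https://example.com/', 1.0), ('https://example.org/', 2.0)]
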
+ 4 - 4
mwmbl/indexer/batch_cache.py

@@ -51,7 +51,7 @@ class BatchCache:
         with Database() as db:
             index_db = IndexDatabase(db.connection)
             batches = index_db.get_batches_by_status(BatchStatus.REMOTE, num_batches)
-            print(f"Found {len(batches)} remote batches")
+            logger.info(f"Found {len(batches)} remote batches")
             if len(batches) == 0:
                 return
             urls = [batch.url for batch in batches]
@@ -60,7 +60,7 @@ class BatchCache:
             total_processed = 0
             for result in results:
                 total_processed += result
-            print("Processed batches with items:", total_processed)
+            logger.info(f"Processed batches with {total_processed} items")
             index_db.update_batch_status(urls, BatchStatus.LOCAL)
 
     def retrieve_batch(self, url):
@@ -68,7 +68,7 @@ class BatchCache:
         try:
             batch = HashedBatch.parse_obj(data)
         except ValidationError:
-            print("Failed to validate batch", data)
+            logger.info(f"Failed to validate batch {data}")
             return 0
         if len(batch.items) > 0:
             self.store(batch, url)
@@ -76,7 +76,7 @@ class BatchCache:
 
     def store(self, batch, url):
         path = self.get_path_from_url(url)
-        print(f"Storing local batch at {path}")
+        logger.debug(f"Storing local batch at {path}")
         os.makedirs(path.parent, exist_ok=True)
         with open(path, 'wb') as output_file:
             data = gzip.compress(batch.json().encode('utf8'))

+ 6 - 2
mwmbl/indexer/process_batch.py

@@ -10,7 +10,7 @@ logger = getLogger(__name__)
 
 
 def run(batch_cache: BatchCache, start_status: BatchStatus, end_status: BatchStatus,
-        process: Callable[[Collection[HashedBatch]], None]):
+        process: Callable[[Collection[HashedBatch], ...], None], *args):
 
     with Database() as db:
         index_db = IndexDatabase(db.connection)
@@ -24,6 +24,10 @@ def run(batch_cache: BatchCache, start_status: BatchStatus, end_status: BatchSta
         batch_data = batch_cache.get_cached([batch.url for batch in batches])
         logger.info(f"Got {len(batch_data)} cached batches")
 
-        process(batch_data.values())
+        missing_batches = {batch.url for batch in batches} - batch_data.keys()
+        logger.info(f"Got {len(missing_batches)} missing batches")
+        index_db.update_batch_status(list(missing_batches), BatchStatus.REMOTE)
+
+        process(batch_data.values(), *args)
 
         index_db.update_batch_status(list(batch_data.keys()), end_status)

+ 29 - 5
mwmbl/indexer/update_urls.py

@@ -1,6 +1,11 @@
+import os
+import pickle
 from collections import defaultdict
 from datetime import datetime, timezone, timedelta
 from logging import getLogger
+from multiprocessing import Queue
+from pathlib import Path
+from time import sleep
 from typing import Iterable, Collection
 from urllib.parse import urlparse
 
@@ -12,17 +17,29 @@ from mwmbl.indexer import process_batch
 from mwmbl.indexer.batch_cache import BatchCache
 from mwmbl.indexer.index_batches import get_url_error_status
 from mwmbl.indexer.indexdb import BatchStatus
+from mwmbl.indexer.paths import BATCH_DIR_NAME
 from mwmbl.settings import UNKNOWN_DOMAIN_MULTIPLIER, EXCLUDED_DOMAINS, SCORE_FOR_SAME_DOMAIN, \
     SCORE_FOR_DIFFERENT_DOMAIN, SCORE_FOR_ROOT_PATH, EXTRA_LINK_MULTIPLIER
+from mwmbl.utils import get_domain
 
 logger = getLogger(__name__)
 
 
-def run(batch_cache: BatchCache):
-    process_batch.run(batch_cache, BatchStatus.LOCAL, BatchStatus.URLS_UPDATED, process=record_urls_in_database)
+def update_urls_continuously(data_path: str, new_item_queue: Queue):
+    batch_cache = BatchCache(Path(data_path) / BATCH_DIR_NAME)
+    while True:
+        try:
+            run(batch_cache, new_item_queue)
+        except Exception:
+            logger.exception("Error updating URLs")
+        sleep(10)
 
 
-def record_urls_in_database(batches: Collection[HashedBatch]):
+def run(batch_cache: BatchCache, new_item_queue: Queue):
+    process_batch.run(batch_cache, BatchStatus.LOCAL, BatchStatus.URLS_UPDATED, record_urls_in_database, new_item_queue)
+
+
+def record_urls_in_database(batches: Collection[HashedBatch], new_item_queue: Queue):
     logger.info(f"Recording URLs in database for {len(batches)} batches")
     with Database() as db:
         url_db = URLDatabase(db.connection)
@@ -39,7 +56,11 @@ def record_urls_in_database(batches: Collection[HashedBatch]):
                     url_statuses[item.url] = get_url_error_status(item)
                 else:
                     url_statuses[item.url] = URLStatus.CRAWLED
-                    crawled_page_domain = urlparse(item.url).netloc
+                    try:
+                        crawled_page_domain = get_domain(item.url)
+                    except ValueError:
+                        logger.info(f"Couldn't parse URL {item.url}")
+                        continue
                     score_multiplier = 1 if crawled_page_domain in DOMAINS else UNKNOWN_DOMAIN_MULTIPLIER
                     for link in item.content.links:
                         process_link(batch, crawled_page_domain, link, score_multiplier, timestamp, url_scores,
@@ -53,7 +74,10 @@ def record_urls_in_database(batches: Collection[HashedBatch]):
         found_urls = [FoundURL(url, url_users[url], url_scores[url], url_statuses[url], url_timestamps[url])
                       for url in url_scores.keys() | url_statuses.keys()]
 
-        url_db.update_found_urls(found_urls)
+        logger.info(f"Found URLs, {len(found_urls)}")
+        urls = url_db.update_found_urls(found_urls)
+        new_item_queue.put(urls)
+        logger.info(f"Put {len(urls)} new items in the URL queue")
 
 
 def process_link(batch, crawled_page_domain, link, unknown_domain_multiplier, timestamp, url_scores, url_timestamps, url_users, is_extra: bool):

+ 10 - 6
mwmbl/main.py

@@ -1,6 +1,5 @@
 import argparse
 import logging
-import os
 import sys
 from multiprocessing import Process, Queue
 from pathlib import Path
@@ -8,16 +7,19 @@ from pathlib import Path
 import uvicorn
 from fastapi import FastAPI
 
-from mwmbl import background, url_queue
+from mwmbl import background
 from mwmbl.crawler import app as crawler
 from mwmbl.indexer.batch_cache import BatchCache
 from mwmbl.indexer.paths import INDEX_NAME, BATCH_DIR_NAME
+from mwmbl.indexer.update_urls import update_urls_continuously
 from mwmbl.tinysearchengine import search
 from mwmbl.tinysearchengine.completer import Completer
 from mwmbl.tinysearchengine.indexer import TinyIndex, Document, PAGE_SIZE
 from mwmbl.tinysearchengine.rank import HeuristicRanker
+from mwmbl.url_queue import update_queue_continuously
 
-logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+FORMAT = '%(levelname)s %(name)s %(asctime)s %(message)s'
+logging.basicConfig(stream=sys.stdout, level=logging.INFO, format=FORMAT)
 
 
 MODEL_PATH = Path(__file__).parent / 'resources' / 'model.pickle'
@@ -47,11 +49,13 @@ def run():
         print("Creating a new index")
         TinyIndex.create(item_factory=Document, index_path=index_path, num_pages=args.num_pages, page_size=PAGE_SIZE)
 
-    queue = Queue()
+    new_item_queue = Queue()
+    queued_batches = Queue()
 
     if args.background:
         Process(target=background.run, args=(args.data,)).start()
-        Process(target=url_queue.run, args=(queue,)).start()
+        Process(target=update_queue_continuously, args=(new_item_queue, queued_batches,)).start()
+        Process(target=update_urls_continuously, args=(args.data, new_item_queue)).start()
 
     completer = Completer()
 
@@ -67,7 +71,7 @@ def run():
         app.include_router(search_router)
 
         batch_cache = BatchCache(Path(args.data) / BATCH_DIR_NAME)
-        crawler_router = crawler.get_router(batch_cache, queue)
+        crawler_router = crawler.get_router(batch_cache, queued_batches)
         app.include_router(crawler_router)
 
         # Initialize uvicorn server using global app instance and server config params

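Taken together, the main.py change above replaces the single url_queue with two queues and adds a third background process. A stripped-down sketch of the wiring; the wrapper function name is made up for illustration, while the Process targets and queue roles are the ones imported and used in the diff:

    # Sketch of the new process/queue wiring in main.py. The wrapper function
    # start_background_processes is hypothetical; the targets come from the diff.
    from multiprocessing import Process, Queue

    from mwmbl import background
    from mwmbl.indexer.update_urls import update_urls_continuously
    from mwmbl.url_queue import update_queue_continuously


    def start_background_processes(data_path: str) -> Queue:
        new_item_queue = Queue()   # lists of FoundURL, filled by update_urls_continuously
        queued_batches = Queue()   # lists of URL strings, drained by the crawler router

        Process(target=background.run, args=(data_path,)).start()
        Process(target=update_queue_continuously, args=(new_item_queue, queued_batches)).start()
        Process(target=update_urls_continuously, args=(data_path, new_item_queue)).start()

        # The crawler API hands batches out via crawler.get_router(batch_cache, queued_batches).
        return queued_batches
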
+ 133 - 30
mwmbl/url_queue.py

@@ -1,48 +1,151 @@
+import time
+from collections import defaultdict
+from dataclasses import dataclass
+from datetime import datetime, timedelta
 from logging import getLogger
 from multiprocessing import Queue
-from time import sleep
+from queue import Empty
+from typing import KeysView, Union
 
-from mwmbl.crawler.urls import BATCH_SIZE, URLDatabase, URLStatus
+from mwmbl.crawler.urls import BATCH_SIZE, URLDatabase, URLStatus, FoundURL, REASSIGN_MIN_HOURS
 from mwmbl.database import Database
-from mwmbl.utils import batch
-
+from mwmbl.hn_top_domains_filtered import DOMAINS as TOP_DOMAINS
+from mwmbl.settings import CORE_DOMAINS
+from mwmbl.utils import batch, get_domain
 
 logger = getLogger(__name__)
 
 
 MAX_QUEUE_SIZE = 5000
-MIN_QUEUE_SIZE = 1000
 
+MAX_TOP_URLS = 100000
+MAX_OTHER_URLS = 1000
+MAX_URLS_PER_CORE_DOMAIN = 1000
+MAX_URLS_PER_TOP_DOMAIN = 100
+MAX_URLS_PER_OTHER_DOMAIN = 5
+MAX_OTHER_DOMAINS = 10000
 
-def run(url_queue: Queue):
-    initialize_url_queue(url_queue)
-    while True:
-        update_url_queue(url_queue)
+@dataclass
+class URLScore:
+    url: str
+    score: float
+
+
+class URLQueue:
+    def __init__(self, new_item_queue: Queue, queued_batches: Queue, min_top_domains: int = 5):
+        """
+        new_item_queue: each item in the queue is a list of FoundURLs
+        queued_batches: each item in the queue is a list of URLs (strings)
+        """
+        self._new_item_queue = new_item_queue
+        self._queued_batches = queued_batches
+        self._other_urls = defaultdict(list)
+        self._top_urls = defaultdict(list)
+        self._min_top_domains = min_top_domains
+
+    def initialize(self):
+        with Database() as db:
+            url_db = URLDatabase(db.connection)
+            urls = url_db.get_urls(URLStatus.QUEUED, MAX_QUEUE_SIZE * BATCH_SIZE)
+            self._queue_urls(urls)
+            logger.info(f"Initialized URL queue with {len(urls)} urls, current queue size: {self.num_queued_batches}")
+
+    def update(self):
+        num_processed = 0
+        while True:
+            try:
+                new_batch = self._new_item_queue.get_nowait()
+                num_processed += 1
+            except Empty:
+                break
+            self._process_found_urls(new_batch)
+        return num_processed
+
+    def _process_found_urls(self, found_urls: list[FoundURL]):
+        min_updated_date = datetime.utcnow() - timedelta(hours=REASSIGN_MIN_HOURS)
+
+        logger.info(f"Found URLS: {len(found_urls)}")
+        valid_urls = [found_url for found_url in found_urls if found_url.status == URLStatus.NEW.value or (
+                found_url.status == URLStatus.ASSIGNED.value and found_url.timestamp < min_updated_date)]
+        logger.info(f"Valid URLs: {len(valid_urls)}")
+
+        self._sort_urls(valid_urls)
+        logger.info(f"Queue size: {self.num_queued_batches}")
+        while self.num_queued_batches < MAX_QUEUE_SIZE and len(self._top_urls) > self._min_top_domains:
+            total_top_urls = sum(len(urls) for urls in self._top_urls.values())
+            logger.info(f"Total top URLs stored: {total_top_urls}")
+
+            total_other_urls = sum(len(urls) for urls in self._other_urls.values())
+            logger.info(f"Total other URLs stored: {total_other_urls}")
+
+            self._batch_urls()
+            logger.info(f"Queue size after batching: {self.num_queued_batches}")
 
+    def _sort_urls(self, valid_urls: list[FoundURL]):
+        for found_url in valid_urls:
+            try:
+                domain = get_domain(found_url.url)
+            except ValueError:
+                continue
+            url_store = self._top_urls if domain in TOP_DOMAINS else self._other_urls
+            url_store[domain].append(URLScore(found_url.url, found_url.score))
 
-def update_url_queue(url_queue: Queue):
-    logger.info("Updating URL queue")
-    current_size = url_queue.qsize()
-    if current_size >= MIN_QUEUE_SIZE:
-        logger.info(f"Skipping queue update, current size {current_size}, sleeping for 10 seconds")
-        sleep(10)
-        return
+        logger.info(f"URL store updated: {len(self._top_urls)} top domains, {len(self._other_urls)} other domains")
 
-    with Database() as db:
-        url_db = URLDatabase(db.connection)
-        urls = url_db.get_urls_for_crawling()
-        queue_batches(url_queue, urls)
-        logger.info(f"Queued {len(urls)} urls, current queue size: {url_queue.qsize()}")
+        _sort_and_limit_urls(self._top_urls, MAX_TOP_URLS)
+        _sort_and_limit_urls(self._other_urls, MAX_OTHER_URLS)
 
+        # Keep only the top "other" domains, ranked by the top item for that domain
+        top_other_urls = sorted(self._other_urls.items(), key=lambda x: x[1][0].score, reverse=True)[:MAX_OTHER_DOMAINS]
+        self._other_urls = defaultdict(list, dict(top_other_urls))
 
-def initialize_url_queue(url_queue: Queue):
-    with Database() as db:
-        url_db = URLDatabase(db.connection)
-        urls = url_db.get_urls(URLStatus.QUEUED, MAX_QUEUE_SIZE * BATCH_SIZE)
-        queue_batches(url_queue, urls)
-        logger.info(f"Initialized URL queue with {len(urls)} urls, current queue size: {url_queue.qsize()}")
+    def _batch_urls(self):
+        urls = []
+        logger.info("Adding core domains")
+        _add_urls(CORE_DOMAINS, self._top_urls, urls, MAX_URLS_PER_CORE_DOMAIN)
+        logger.info("Adding top domains")
+        _add_urls(TOP_DOMAINS.keys() - CORE_DOMAINS, self._top_urls, urls, MAX_URLS_PER_TOP_DOMAIN)
+        logger.info("Adding other domains")
+        _add_urls(self._other_urls.keys(), self._other_urls, urls, MAX_URLS_PER_OTHER_DOMAIN)
+        self._queue_urls(urls)
+
+    def _queue_urls(self, valid_urls: list[str]):
+        for url_batch in batch(valid_urls, BATCH_SIZE):
+            self._queued_batches.put(url_batch, block=False)
+
+    @property
+    def num_queued_batches(self) -> int:
+        return self._queued_batches.qsize()
+
+    @property
+    def num_top_domains(self) -> int:
+        return len(self._top_urls)
+
+
+def _sort_and_limit_urls(domain_urls: dict[str, list[str]], max_urls: int):
+    for domain, urls in domain_urls.items():
+        domain_urls[domain] = sorted(urls, key=lambda url_score: url_score.score, reverse=True)[:max_urls]
+
+
+def _add_urls(domains: Union[set[str], KeysView], domain_urls: dict[str, list[URLScore]], urls: list[str], max_urls: int):
+    for domain in list(domains & domain_urls.keys()):
+        new_urls = domain_urls[domain][:max_urls]
+        urls += [url_score.url for url_score in new_urls]
+        new_domain_urls = domain_urls[domain][max_urls:]
+        if len(new_domain_urls) > 0:
+            domain_urls[domain] = new_domain_urls
+        else:
+            del domain_urls[domain]
+
+
+def update_queue_continuously(new_item_queue: Queue, queued_batches: Queue):
+    queue = URLQueue(new_item_queue, queued_batches)
+    queue.initialize()
+    while True:
+        num_processed = queue.update()
+        logger.info(f"Queue update, num processed: {num_processed}, queue size: {queue.num_queued_batches}, num top "
+                    f"domains: {queue.num_top_domains}")
+        if num_processed == 0:
+            time.sleep(5)
 
 
-def queue_batches(url_queue, urls):
-    for url_batch in batch(urls, BATCH_SIZE):
-        url_queue.put(url_batch, block=False)

+ 12 - 0
mwmbl/utils.py

@@ -1,3 +1,8 @@
+import re
+
+DOMAIN_REGEX = re.compile(r".*://([^/]*)")
+
+
 def batch(items: list, batch_size):
     """
     Adapted from https://stackoverflow.com/a/8290508
@@ -5,3 +10,10 @@ def batch(items: list, batch_size):
     length = len(items)
     for ndx in range(0, length, batch_size):
         yield items[ndx:min(ndx + batch_size, length)]
+
+
+def get_domain(url):
+    results = DOMAIN_REGEX.match(url)
+    if results is None or len(results.groups()) == 0:
+        raise ValueError(f"Unable to parse domain from URL {url}")
+    return results.group(1)

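The new get_domain helper in mwmbl/utils.py extracts everything between "://" and the next "/". Roughly how it behaves, with example values only:

    # Illustrative behaviour of the new get_domain helper (example values only).
    from mwmbl.utils import get_domain

    print(get_domain("https://en.wikipedia.org/wiki/Python"))  # en.wikipedia.org

    try:
        get_domain("not-a-url")    # no "://", so the regex does not match
    except ValueError as error:
        print(error)               # Unable to parse domain from URL not-a-url
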
+ 19 - 0
test/test_url_queue.py

@@ -0,0 +1,19 @@
+from datetime import datetime
+from queue import Queue
+
+from mwmbl.crawler.urls import FoundURL, URLStatus
+from mwmbl.url_queue import URLQueue
+
+
+def test_url_queue_empties():
+    new_item_queue = Queue()
+    queued_batches = Queue()
+
+    url_queue = URLQueue(new_item_queue, queued_batches, min_top_domains=0)
+    new_item_queue.put([FoundURL("https://google.com", "123", 10.0, URLStatus.NEW.value, datetime(2023, 1, 19))])
+
+    url_queue.update()
+
+    items = queued_batches.get(block=False)
+
+    assert items == ["https://google.com"]