@@ -1,20 +1,17 @@
 """
 Database storing info on URLs
 """
-import os
 from dataclasses import dataclass
 from datetime import datetime, timedelta
 from enum import Enum
-from typing import Iterable
 
-from psycopg2 import connect
 from psycopg2.extras import execute_values
 
 
 # Client has one hour to crawl a URL that has been assigned to them, or it will be reassigned
-from mwmbl.database import Database
+from mwmbl.utils import batch
 
-REASSIGN_MIN_HOURS = 1
+REASSIGN_MIN_HOURS = 5
 BATCH_SIZE = 100
 
 
@@ -23,6 +20,7 @@ class URLStatus(Enum):
     URL state update is idempotent and can only progress forwards.
     """
     NEW = 0                   # One user has identified this URL
+    QUEUED = 5                # The URL has been queued for crawling
     ASSIGNED = 10             # The crawler has given the URL to a user to crawl
     ERROR_TIMEOUT = 20        # Timeout while retrieving
     ERROR_404 = 30            # 404 response
@@ -31,15 +29,6 @@ class URLStatus(Enum):
     CRAWLED = 100             # At least one user has crawled the URL
 
 
-def batch(items: list, batch_size):
-    """
-    Adapted from https://stackoverflow.com/a/8290508
-    """
-    length = len(items)
-    for ndx in range(0, length, batch_size):
-        yield items[ndx:min(ndx + batch_size, length)]
-
-
 @dataclass
 class FoundURL:
     url: str
@@ -119,16 +108,16 @@ class URLDatabase:
 
             execute_values(cursor, insert_sql, data)
 
-    def get_new_batch_for_user(self, user_id_hash: str):
+    def get_urls_for_crawling(self, num_urls: int):
         sql = f"""
-        UPDATE urls SET status = {URLStatus.ASSIGNED.value}, user_id_hash = %(user_id_hash)s, updated = %(now)s
+        UPDATE urls SET status = {URLStatus.QUEUED.value}, updated = %(now)s
         WHERE url IN (
             SELECT url FROM urls
-            WHERE status = {URLStatus.NEW.value} OR (
+            WHERE status IN ({URLStatus.NEW.value}) OR (
                 status = {URLStatus.ASSIGNED.value} AND updated < %(min_updated_date)s
             )
             ORDER BY score DESC
-            LIMIT {BATCH_SIZE}
+            LIMIT %(num_urls)s
             FOR UPDATE SKIP LOCKED
         )
         RETURNING url
@@ -137,11 +126,27 @@ class URLDatabase:
         now = datetime.utcnow()
         min_updated_date = now - timedelta(hours=REASSIGN_MIN_HOURS)
         with self.connection.cursor() as cursor:
-            cursor.execute(sql, {'user_id_hash': user_id_hash, 'min_updated_date': min_updated_date, 'now': now})
+            cursor.execute(sql, {'min_updated_date': min_updated_date, 'now': now, 'num_urls': num_urls})
+            results = cursor.fetchall()
+
+        return [result[0] for result in results]
+
+    def get_urls(self, status: URLStatus, num_urls: int):
+        sql = f"""
+        SELECT url FROM urls
+        WHERE status = %(status)s
+        ORDER BY score DESC
+        LIMIT %(num_urls)s
+        """
+
+        with self.connection.cursor() as cursor:
+            cursor.execute(sql, {'status': status.value, 'num_urls': num_urls})
             results = cursor.fetchall()
 
         return [result[0] for result in results]
 
+
+
     def get_url_scores(self, urls: list[str]) -> dict[str, float]:
         sql = f"""
         SELECT url, score FROM urls WHERE url IN %(urls)s
@@ -155,15 +160,3 @@ class URLDatabase:
             url_scores.update({result[0]: result[1] for result in results})
 
         return url_scores
-
-
-if __name__ == "__main__":
-    with Database() as db:
-        url_db = URLDatabase(db.connection)
-        url_db.create_tables()
-        # update_url_status(conn, [URLStatus("https://mwmbl.org", URLState.NEW, "test-user", datetime.now())])
-        # url_db.user_found_urls("Test user", ["a", "b", "c"], datetime.utcnow())
-        # url_db.user_found_urls("Another user", ["b", "c", "d"], datetime.utcnow())
-        # url_db.user_crawled_urls("Test user", ["c"], datetime.utcnow())
-        batch = url_db.get_new_batch_for_user('test user 4')
-        print("Batch", len(batch), batch)
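Note on the removed batch() helper: the hunk above deletes the local implementation and the new import pulls batch from mwmbl.utils instead. Assuming the helper was moved there unchanged (mwmbl/utils.py is not shown in this diff), the relocated function would be a sketch like:

    # Sketch only: assumed contents of mwmbl/utils.py, not part of this diff.
    def batch(items: list, batch_size):
        """
        Adapted from https://stackoverflow.com/a/8290508
        """
        length = len(items)
        for ndx in range(0, length, batch_size):
            yield items[ndx:min(ndx + batch_size, length)]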
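For context, the deleted __main__ block exercised the old get_new_batch_for_user() API. A rough equivalent smoke test against the renamed methods might look like the sketch below; the Database context manager comes from mwmbl.database as in the removed code, while the module path and call sites are illustrative assumptions rather than part of this change:

    # Sketch only: hypothetical smoke test, not part of this diff.
    from mwmbl.database import Database
    from mwmbl.crawler.urls import URLDatabase, URLStatus  # module path assumed

    if __name__ == "__main__":
        with Database() as db:
            url_db = URLDatabase(db.connection)
            url_db.create_tables()
            # Mark up to 100 NEW (or stale ASSIGNED) URLs as QUEUED and return them
            urls = url_db.get_urls_for_crawling(num_urls=100)
            print("URLs for crawling", len(urls), urls)
            # Peek at the highest-scoring URLs currently in the QUEUED state
            queued = url_db.get_urls(URLStatus.QUEUED, num_urls=10)
            print("Queued", len(queued), queued)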