@@ -1,10 +1,8 @@
 import gzip
 import hashlib
 import json
-from collections import defaultdict
-from datetime import datetime, timezone, timedelta, date
+from datetime import datetime, timezone, date
 from typing import Union
-from urllib.parse import urlparse
 from uuid import uuid4
 
 import boto3
@@ -12,9 +10,9 @@ import requests
 from fastapi import HTTPException, APIRouter
 
 from mwmbl.crawler.batch import Batch, NewBatchRequest, HashedBatch
-from mwmbl.crawler.urls import URLDatabase, FoundURL, URLStatus
+from mwmbl.crawler.urls import URLDatabase
 from mwmbl.database import Database
-from mwmbl.hn_top_domains_filtered import DOMAINS
+from mwmbl.indexer.batch_cache import BatchCache
 from mwmbl.indexer.indexdb import IndexDatabase, BatchInfo, BatchStatus
 from mwmbl.settings import (
     ENDPOINT_URL,
@@ -25,18 +23,11 @@ from mwmbl.settings import (
     USER_ID_LENGTH,
     VERSION,
     PUBLIC_URL_PREFIX,
-    UNKNOWN_DOMAIN_MULTIPLIER,
-    SCORE_FOR_SAME_DOMAIN,
-    SCORE_FOR_DIFFERENT_DOMAIN,
-    SCORE_FOR_ROOT_PATH,
     PUBLIC_USER_ID_LENGTH,
     FILE_NAME_SUFFIX,
     DATE_REGEX)
 from mwmbl.tinysearchengine.indexer import Document
 
-router = APIRouter(prefix="/crawler", tags=["crawler"])
-
-
 def get_bucket(name):
     s3 = boto3.resource('s3', endpoint_url=ENDPOINT_URL, aws_access_key_id=KEY_ID,
                         aws_secret_access_key=APPLICATION_KEY)
@@ -52,150 +43,111 @@ def upload(data: bytes, name: str):
 last_batch = None
 
 
-@router.on_event("startup")
-async def on_startup():
-    with Database() as db:
-        url_db = URLDatabase(db.connection)
-        return url_db.create_tables()
-
-
-@router.post('/batches/')
-def create_batch(batch: Batch):
-    if len(batch.items) > MAX_BATCH_SIZE:
-        raise HTTPException(400, f"Batch size too large (maximum {MAX_BATCH_SIZE}), got {len(batch.items)}")
-
-    if len(batch.user_id) != USER_ID_LENGTH:
-        raise HTTPException(400, f"User ID length is incorrect, should be {USER_ID_LENGTH} characters")
-
-    if len(batch.items) == 0:
-        return {
-            'status': 'ok',
-        }
-
-    user_id_hash = _get_user_id_hash(batch)
-
-    now = datetime.now(timezone.utc)
-    seconds = (now - datetime(now.year, now.month, now.day, tzinfo=timezone.utc)).seconds
-
-    # How to pad a string with zeros: https://stackoverflow.com/a/39402910
-    # Maximum seconds in a day is 60*60*24 = 86400, so 5 digits is enough
-    padded_seconds = str(seconds).zfill(5)
-
-    # See discussion here: https://stackoverflow.com/a/13484764
-    uid = str(uuid4())[:8]
-    filename = f'1/{VERSION}/{now.date()}/1/{user_id_hash}/{padded_seconds}__{uid}.json.gz'
-
-    # Using an approach from https://stackoverflow.com/a/30476450
-    epoch_time = (now - datetime(1970, 1, 1, tzinfo=timezone.utc)).total_seconds()
-    hashed_batch = HashedBatch(user_id_hash=user_id_hash, timestamp=epoch_time, items=batch.items)
-    data = gzip.compress(hashed_batch.json().encode('utf8'))
-    upload(data, filename)
-
-    record_urls_in_database(batch, user_id_hash, now)
-    queue_batch(hashed_batch)
+def get_router(batch_cache: BatchCache):
+    router = APIRouter(prefix="/crawler", tags=["crawler"])
 
-    global last_batch
-    last_batch = hashed_batch
+    @router.on_event("startup")
+    async def on_startup():
+        with Database() as db:
+            url_db = URLDatabase(db.connection)
+            return url_db.create_tables()
 
-    # Record the batch as being local so that we don't retrieve it again when the server restarts
-    batch_url = f'{PUBLIC_URL_PREFIX}{filename}'
-    infos = [BatchInfo(batch_url, user_id_hash, BatchStatus.LOCAL)]
+    @router.post('/batches/')
+    def create_batch(batch: Batch):
+        if len(batch.items) > MAX_BATCH_SIZE:
+            raise HTTPException(400, f"Batch size too large (maximum {MAX_BATCH_SIZE}), got {len(batch.items)}")
 
-    with Database() as db:
-        index_db = IndexDatabase(db.connection)
-        index_db.record_batches(infos)
-
-    return {
-        'status': 'ok',
-        'public_user_id': user_id_hash,
-        'url': batch_url,
-    }
-
-
-def _get_user_id_hash(batch: Union[Batch, NewBatchRequest]):
-    return hashlib.sha3_256(batch.user_id.encode('utf8')).hexdigest()
+        if len(batch.user_id) != USER_ID_LENGTH:
+            raise HTTPException(400, f"User ID length is incorrect, should be {USER_ID_LENGTH} characters")
 
+        if len(batch.items) == 0:
+            return {
+                'status': 'ok',
+            }
 
-@router.post('/batches/new')
-def request_new_batch(batch_request: NewBatchRequest):
-    user_id_hash = _get_user_id_hash(batch_request)
+        user_id_hash = _get_user_id_hash(batch)
 
-    with Database() as db:
-        url_db = URLDatabase(db.connection)
-        return url_db.get_new_batch_for_user(user_id_hash)
+        now = datetime.now(timezone.utc)
+        seconds = (now - datetime(now.year, now.month, now.day, tzinfo=timezone.utc)).seconds
 
+        # How to pad a string with zeros: https://stackoverflow.com/a/39402910
+        # Maximum seconds in a day is 60*60*24 = 86400, so 5 digits is enough
+        padded_seconds = str(seconds).zfill(5)
 
-@router.post('/batches/historical')
-def create_historical_batch(batch: HashedBatch):
-    """
-    Update the database state of URL crawling for old data
-    """
-    user_id_hash = batch.user_id_hash
-    batch_datetime = get_datetime_from_timestamp(batch.timestamp)
-    record_urls_in_database(batch, user_id_hash, batch_datetime)
+        # See discussion here: https://stackoverflow.com/a/13484764
+        uid = str(uuid4())[:8]
+        filename = f'1/{VERSION}/{now.date()}/1/{user_id_hash}/{padded_seconds}__{uid}.json.gz'
 
+        # Using an approach from https://stackoverflow.com/a/30476450
+        epoch_time = (now - datetime(1970, 1, 1, tzinfo=timezone.utc)).total_seconds()
+        hashed_batch = HashedBatch(user_id_hash=user_id_hash, timestamp=epoch_time, items=batch.items)
+        data = gzip.compress(hashed_batch.json().encode('utf8'))
+        upload(data, filename)
 
-def get_datetime_from_timestamp(timestamp: int) -> datetime:
-    batch_datetime = datetime(1970, 1, 1, tzinfo=timezone.utc) + timedelta(seconds=timestamp)
-    return batch_datetime
+        global last_batch
+        last_batch = hashed_batch
 
+        batch_url = f'{PUBLIC_URL_PREFIX}{filename}'
+        batch_cache.store(hashed_batch, batch_url)
 
-def record_urls_in_database(batch: Union[Batch, HashedBatch], user_id_hash: str, timestamp: datetime):
-    with Database() as db:
-        url_db = URLDatabase(db.connection)
-        url_scores = defaultdict(float)
-        for item in batch.items:
-            if item.content is not None:
-                crawled_page_domain = urlparse(item.url).netloc
-                score_multiplier = 1 if crawled_page_domain in DOMAINS else UNKNOWN_DOMAIN_MULTIPLIER
-                for link in item.content.links:
-                    parsed_link = urlparse(link)
-                    score = SCORE_FOR_SAME_DOMAIN if parsed_link.netloc == crawled_page_domain else SCORE_FOR_DIFFERENT_DOMAIN
-                    url_scores[link] += score * score_multiplier
-                    domain = f'{parsed_link.scheme}://{parsed_link.netloc}/'
-                    url_scores[domain] += SCORE_FOR_ROOT_PATH * score_multiplier
+        # Record the batch as being local so that we don't retrieve it again when the server restarts
+        infos = [BatchInfo(batch_url, user_id_hash, BatchStatus.LOCAL)]
 
-        found_urls = [FoundURL(url, user_id_hash, score, URLStatus.NEW, timestamp) for url, score in url_scores.items()]
-        if len(found_urls) > 0:
-            url_db.update_found_urls(found_urls)
+        with Database() as db:
+            index_db = IndexDatabase(db.connection)
+            index_db.record_batches(infos)
 
-        crawled_urls = [FoundURL(item.url, user_id_hash, 0.0, URLStatus.CRAWLED, timestamp)
-                        for item in batch.items]
-        url_db.update_found_urls(crawled_urls)
+        return {
+            'status': 'ok',
+            'public_user_id': user_id_hash,
+            'url': batch_url,
+        }
 
+    @router.post('/batches/new')
+    def request_new_batch(batch_request: NewBatchRequest):
+        user_id_hash = _get_user_id_hash(batch_request)
+
+        with Database() as db:
+            url_db = URLDatabase(db.connection)
+            return url_db.get_new_batch_for_user(user_id_hash)
+
+    @router.get('/batches/{date_str}/users/{public_user_id}')
+    def get_batches_for_date_and_user(date_str, public_user_id):
+        check_date_str(date_str)
+        check_public_user_id(public_user_id)
+        prefix = f'1/{VERSION}/{date_str}/1/{public_user_id}/'
+        return get_batch_ids_for_prefix(prefix)
+
+    @router.get('/batches/{date_str}/users/{public_user_id}/batch/{batch_id}')
+    def get_batch_from_id(date_str, public_user_id, batch_id):
+        url = get_batch_url(batch_id, date_str, public_user_id)
+        data = json.loads(gzip.decompress(requests.get(url).content))
+        return {
+            'url': url,
+            'batch': data,
+        }
 
-def get_batches_for_date(date_str):
-    check_date_str(date_str)
-    prefix = f'1/{VERSION}/{date_str}/1/'
-    cache_filename = prefix + 'batches.json.gz'
-    cache_url = PUBLIC_URL_PREFIX + cache_filename
-    try:
-        cached_batches = json.loads(gzip.decompress(requests.get(cache_url).content))
-        print(f"Got cached batches for {date_str}")
-        return cached_batches
-    except gzip.BadGzipFile:
-        pass
+    @router.get('/latest-batch', response_model=list[HashedBatch])
+    def get_latest_batch():
+        return [] if last_batch is None else [last_batch]
 
-    batches = get_batches_for_prefix(prefix)
-    result = {'batch_urls': [f'{PUBLIC_URL_PREFIX}{batch}' for batch in sorted(batches)]}
-    if date_str != str(date.today()):
-        # Don't cache data from today since it may change
-        data = gzip.compress(json.dumps(result).encode('utf8'))
-        upload(data, cache_filename)
-        print(f"Cached batches for {date_str} in {PUBLIC_URL_PREFIX}{cache_filename}")
-    return result
+    @router.get('/batches/{date_str}/users')
+    def get_user_id_hashes_for_date(date_str: str):
+        check_date_str(date_str)
+        prefix = f'1/{VERSION}/{date_str}/1/'
+        return get_subfolders(prefix)
 
+    @router.get('/')
+    def status():
+        return {
+            'status': 'ok'
+        }
 
-def get_user_id_hash_from_url(url):
-    return url.split('/')[9]
+    return router
 
 
-@router.get('/batches/{date_str}/users/{public_user_id}')
-def get_batches_for_date_and_user(date_str, public_user_id):
-    check_date_str(date_str)
-    check_public_user_id(public_user_id)
-    prefix = f'1/{VERSION}/{date_str}/1/{public_user_id}/'
-    return get_batch_ids_for_prefix(prefix)
+def _get_user_id_hash(batch: Union[Batch, NewBatchRequest]):
+    return hashlib.sha3_256(batch.user_id.encode('utf8')).hexdigest()
 
 
 def check_public_user_id(public_user_id):
@@ -203,16 +155,6 @@ def check_public_user_id(public_user_id):
         raise HTTPException(400, f"Incorrect public user ID length, should be {PUBLIC_USER_ID_LENGTH}")
 
 
-@router.get('/batches/{date_str}/users/{public_user_id}/batch/{batch_id}')
-def get_batch_from_id(date_str, public_user_id, batch_id):
-    url = get_batch_url(batch_id, date_str, public_user_id)
-    data = json.loads(gzip.decompress(requests.get(url).content))
-    return {
-        'url': url,
-        'batch': data,
-    }
-
-
 def get_batch_url(batch_id, date_str, public_user_id):
     check_date_str(date_str)
     check_public_user_id(public_user_id)
@@ -220,11 +162,6 @@ def get_batch_url(batch_id, date_str, public_user_id):
     return url
 
 
-@router.get('/latest-batch', response_model=list[HashedBatch])
-def get_latest_batch():
-    return [] if last_batch is None else [last_batch]
-
-
 def get_batch_id_from_file_name(file_name: str):
     assert file_name.endswith(FILE_NAME_SUFFIX)
     return file_name[:-len(FILE_NAME_SUFFIX)]
@@ -246,13 +183,6 @@ def get_batches_for_prefix(prefix):
     return filenames
 
 
-@router.get('/batches/{date_str}/users')
-def get_user_id_hashes_for_date(date_str: str):
-    check_date_str(date_str)
-    prefix = f'1/{VERSION}/{date_str}/1/'
-    return get_subfolders(prefix)
-
-
 def check_date_str(date_str):
     if not DATE_REGEX.match(date_str):
         raise HTTPException(400, f"Incorrect date format, should be YYYY-MM-DD")
@@ -268,17 +198,23 @@ def get_subfolders(prefix):
     return item_keys
 
 
-@router.get('/')
-def status():
-    return {
-        'status': 'ok'
-    }
-
+def get_batches_for_date(date_str):
+    check_date_str(date_str)
+    prefix = f'1/{VERSION}/{date_str}/1/'
+    cache_filename = prefix + 'batches.json.gz'
+    cache_url = PUBLIC_URL_PREFIX + cache_filename
+    try:
+        cached_batches = json.loads(gzip.decompress(requests.get(cache_url).content))
+        print(f"Got cached batches for {date_str}")
+        return cached_batches
+    except gzip.BadGzipFile:
+        pass
 
-def queue_batch(batch: HashedBatch):
-    # TODO: get the score from the URLs database
-    documents = [Document(item.content.title, item.url, item.content.extract, 1)
-                 for item in batch.items if item.content is not None]
-    with Database() as db:
-        index_db = IndexDatabase(db.connection)
-        index_db.queue_documents(documents)
+    batches = get_batches_for_prefix(prefix)
+    result = {'batch_urls': [f'{PUBLIC_URL_PREFIX}{batch}' for batch in sorted(batches)]}
+    if date_str != str(date.today()):
+        # Don't cache data from today since it may change
+        data = gzip.compress(json.dumps(result).encode('utf8'))
+        upload(data, cache_filename)
+        print(f"Cached batches for {date_str} in {PUBLIC_URL_PREFIX}{cache_filename}")
+    return result
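Below is a minimal sketch of how the new get_router factory might be wired into the application. It is illustrative only: the module path mwmbl.crawler.app, the create_app helper and the BatchCache construction shown in the comments are assumptions, since application setup is not part of this diff; only FastAPI's include_router is a known API.

# Hypothetical wiring sketch, not part of this diff.
from fastapi import FastAPI

from mwmbl.crawler.app import get_router          # module path assumed
from mwmbl.indexer.batch_cache import BatchCache


def create_app(batch_cache: BatchCache) -> FastAPI:
    """Build the API with the crawler routes bound to a specific BatchCache."""
    app = FastAPI()
    # The crawler endpoints are now registered through the factory rather than
    # via a module-level router, so the cache is injected explicitly.
    app.include_router(get_router(batch_cache))
    return app


# Usage (BatchCache construction is assumed; check its real signature):
# app = create_app(BatchCache("./batches"))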