Use different scores for same domain links

Daoud Clarke 2022-06-27 22:46:06 +01:00
parent 36b168a8f6
commit ff2312a5ca
6 changed files with 39 additions and 17 deletions

View file

@@ -1,7 +1,7 @@
 import json
 from mwmbl.indexer.paths import TOP_DOMAINS_JSON_PATH
-from mwmbl.tinysearchengine.hn_top_domains_filtered import DOMAINS
+from mwmbl.hn_top_domains_filtered import DOMAINS
 def export_top_domains_to_json():

View file

@@ -3,6 +3,7 @@ import hashlib
 import json
 import os
 import re
+from collections import defaultdict
 from datetime import datetime, timezone, timedelta
 from typing import Union
 from urllib.parse import urlparse
@@ -13,8 +14,9 @@ import requests
 from fastapi import HTTPException, APIRouter
 from mwmbl.crawler.batch import Batch, NewBatchRequest, HashedBatch
-from mwmbl.crawler.urls import URLDatabase
+from mwmbl.crawler.urls import URLDatabase, FoundURL
 from mwmbl.database import Database
+from mwmbl.hn_top_domains_filtered import DOMAINS
 APPLICATION_KEY = os.environ['MWMBL_APPLICATION_KEY']
 KEY_ID = os.environ['MWMBL_KEY_ID']
@@ -28,6 +30,10 @@ DATE_REGEX = re.compile(r'\d{4}-\d{2}-\d{2}')
 PUBLIC_URL_PREFIX = f'https://f004.backblazeb2.com/file/{BUCKET_NAME}/'
 FILE_NAME_SUFFIX = '.json.gz'
+SCORE_FOR_ROOT_PATH = 0.1
+SCORE_FOR_DIFFERENT_DOMAIN = 1.0
+SCORE_FOR_SAME_DOMAIN = 0.01
 router = APIRouter(prefix="/crawler", tags=["crawler"])
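The new constants set the relative weight of each discovered link: a cross-domain link is worth a full point, a link back into the crawled page's own domain only a hundredth of that, and every link also credits its target domain's root URL. A rough sketch of the contributions recorded for one crawled page with two outgoing links; all URLs here are invented for illustration:

# Hypothetical score contributions from a single crawled page:
expected_contributions = {
    'https://same-site.example/about': 0.01,    # SCORE_FOR_SAME_DOMAIN
    'https://same-site.example/': 0.1,          # SCORE_FOR_ROOT_PATH, credited to the domain root
    'https://other-site.example/page': 1.0,     # SCORE_FOR_DIFFERENT_DOMAIN
    'https://other-site.example/': 0.1,         # SCORE_FOR_ROOT_PATH
}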
@@ -124,21 +130,32 @@ def create_historical_batch(batch: HashedBatch):
 def _record_urls_in_database(batch: Union[Batch, HashedBatch], user_id_hash: str, timestamp: datetime):
     with Database() as db:
         url_db = URLDatabase(db.connection)
-        found_urls = set()
+        url_scores = defaultdict(float)
         for item in batch.items:
             if item.content is not None:
-                found_urls |= set(item.content.links)
+                crawled_page_domain = urlparse(item.url).netloc
+                if crawled_page_domain not in DOMAINS:
+                    continue
-        parsed_urls = [urlparse(url) for url in found_urls]
-        domains = {f'{p.scheme}://{p.netloc}' for p in parsed_urls}
-        found_urls |= domains
+                for link in item.content.links:
+                    parsed_link = urlparse(link)
+                    score = SCORE_FOR_SAME_DOMAIN if parsed_link.netloc == crawled_page_domain else SCORE_FOR_DIFFERENT_DOMAIN
+                    url_scores[link] += score
+                    domain = f'{parsed_link.scheme}://{parsed_link.netloc}/'
+                    url_scores[domain] += SCORE_FOR_ROOT_PATH
+        found_urls = [FoundURL(url, user_id_hash, score, item.timestamp) for url, score in url_scores.items()]
         if len(found_urls) > 0:
-            url_db.user_found_urls(user_id_hash, list(found_urls), timestamp)
+            url_db.update_found_urls(found_urls)
         crawled_urls = [item.url for item in batch.items]
         url_db.user_crawled_urls(user_id_hash, crawled_urls, timestamp)
+# TODO:
+# - test this code
+# - delete existing crawl data for change from INT to FLOAT
+# - load some historical data as a starting point
 @router.get('/batches/{date_str}/users/{public_user_id}')
 def get_batches_for_date_and_user(date_str, public_user_id):
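Pulled out of the diff, the new aggregation amounts to the following self-contained sketch. The crawler's Batch and item objects are replaced by plain values and the URLs are invented; only the scoring arithmetic mirrors the code above:

from collections import defaultdict
from urllib.parse import urlparse

SCORE_FOR_ROOT_PATH = 0.1
SCORE_FOR_DIFFERENT_DOMAIN = 1.0
SCORE_FOR_SAME_DOMAIN = 0.01

# One crawled page and the links found on it (illustrative URLs).
crawled_url = 'https://blog.example.com/post'
links = [
    'https://blog.example.com/other-post',            # same domain as the crawled page
    'https://docs.python.org/3/library/urllib.html',  # different domain
]

url_scores = defaultdict(float)
crawled_page_domain = urlparse(crawled_url).netloc
for link in links:
    parsed_link = urlparse(link)
    score = SCORE_FOR_SAME_DOMAIN if parsed_link.netloc == crawled_page_domain else SCORE_FOR_DIFFERENT_DOMAIN
    url_scores[link] += score
    # Every link also credits its domain's root URL, so heavily linked domains accumulate score.
    url_scores[f'{parsed_link.scheme}://{parsed_link.netloc}/'] += SCORE_FOR_ROOT_PATH

# Four entries result: each link plus each domain root, with the same-domain
# link at 0.01 and the cross-domain link at 1.0.

In the commit itself the loop only runs for pages whose own domain appears in DOMAINS (the filtered Hacker News top-domain list), and the aggregated (url, score) pairs are then wrapped in FoundURL records for update_found_urls.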

View file

@@ -27,6 +27,14 @@ class URLStatus(Enum):
     CRAWLED = 3  # At least one user has crawled the URL
+@dataclass
+class FoundURL:
+    url: str
+    user_id_hash: str
+    score: float
+    timestamp: datetime
 class URLDatabase:
     def __init__(self, connection):
         self.connection = connection
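FoundURL bundles everything update_found_urls needs for one row, with score now a float so fractional same-domain contributions survive. A quick sketch of constructing one record; the values are invented:

from datetime import datetime, timezone

found = FoundURL(
    url='https://example.org/article',                     # hypothetical discovered URL
    user_id_hash='1a2b3c',                                  # hashed id of the reporting user
    score=1.0,                                              # float now, e.g. a single cross-domain link
    timestamp=datetime(2022, 6, 27, tzinfo=timezone.utc),
)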
@@ -45,7 +53,7 @@
         with self.connection.cursor() as cursor:
             cursor.execute(sql)
-    def user_found_urls(self, user_id_hash: str, urls: list[str], timestamp: datetime):
+    def update_found_urls(self, found_urls: list[FoundURL]):
         sql = f"""
         INSERT INTO urls (url, status, user_id_hash, score, updated) values %s
         ON CONFLICT (url) DO UPDATE SET
@@ -63,14 +71,15 @@
             user_id_hash = CASE
                 WHEN urls.status={URLStatus.ASSIGNED.value} THEN urls.user_id_hash ELSE excluded.user_id_hash
             END,
-            score=urls.score + 1,
+            score=urls.score + excluded.score,
-            updated=excluded.updated
+            updated = CASE
+                WHEN urls.status={URLStatus.ASSIGNED.value} THEN urls.updated ELSE excluded.updated
+            END
         """
-        data = [(url, URLStatus.NEW.value, user_id_hash, 1, timestamp) for url in urls]
+        data = [(found_url.url, URLStatus.NEW.value, found_url.user_id_hash, found_url.score, found_url.timestamp)
+                for found_url in found_urls]
         with self.connection.cursor() as cursor:
             execute_values(cursor, sql, data)
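The reworked upsert accumulates scores instead of resetting them: on conflict the incoming row's score (excluded.score) is added to the stored value, and updated is only refreshed when the URL is not currently assigned to a crawler. A pared-down sketch of the same pattern, assuming a reachable Postgres database and a simplified urls table with just url and score columns; the DSN and URLs are placeholders:

import psycopg2
from psycopg2.extras import execute_values

connection = psycopg2.connect('dbname=mwmbl')      # placeholder DSN, not from the commit

sql = """
    INSERT INTO urls (url, score) VALUES %s
    ON CONFLICT (url) DO UPDATE SET score = urls.score + excluded.score
"""

first_batch = [('https://example.org/a', 1.0)]      # first crawl reports the URL
second_batch = [('https://example.org/a', 0.01)]    # a later crawl reports it again

with connection.cursor() as cursor:
    execute_values(cursor, sql, first_batch)
    execute_values(cursor, sql, second_batch)
# The stored score for https://example.org/a is now 1.01.

Because _record_urls_in_database already aggregates scores per URL into url_scores before building FoundURL records, a single execute_values call never carries the same URL twice, which ON CONFLICT DO UPDATE would otherwise reject.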

View file

@@ -11,7 +11,7 @@ import pandas as pd
 DATA_DIR = Path(os.environ['HOME']) / 'data' / 'tinysearch'
 ALL_DOMAINS_PATH = DATA_DIR / 'hn-top-domains.csv'
-TOP_DOMAINS_PATH = '../tinysearchengine/hn_top_domains_filtered.py'
+TOP_DOMAINS_PATH = '../hn_top_domains_filtered.py'
 MIN_COUNT = 10
 PROBABILITY_THRESHOLD = 0.8

View file

@@ -2,14 +2,10 @@ import re
 from abc import abstractmethod
 from logging import getLogger
 from operator import itemgetter
-from pathlib import Path
-from urllib.parse import urlparse
 from fastapi import FastAPI
 from starlette.middleware.cors import CORSMiddleware
 from mwmbl.tinysearchengine.completer import Completer
-from mwmbl.tinysearchengine.hn_top_domains_filtered import DOMAINS
+from mwmbl.hn_top_domains_filtered import DOMAINS
 from mwmbl.tinysearchengine.indexer import TinyIndex, Document
 logger = getLogger(__name__)