Use different scores for same domain links

parent 36b168a8f6
commit ff2312a5ca

6 changed files with 39 additions and 17 deletions
@@ -1,7 +1,7 @@
 import json

 from mwmbl.indexer.paths import TOP_DOMAINS_JSON_PATH
-from mwmbl.tinysearchengine.hn_top_domains_filtered import DOMAINS
+from mwmbl.hn_top_domains_filtered import DOMAINS


 def export_top_domains_to_json():
@@ -3,6 +3,7 @@ import hashlib
 import json
 import os
 import re
+from collections import defaultdict
 from datetime import datetime, timezone, timedelta
 from typing import Union
 from urllib.parse import urlparse
@@ -13,8 +14,9 @@ import requests
 from fastapi import HTTPException, APIRouter

 from mwmbl.crawler.batch import Batch, NewBatchRequest, HashedBatch
-from mwmbl.crawler.urls import URLDatabase
+from mwmbl.crawler.urls import URLDatabase, FoundURL
 from mwmbl.database import Database
+from mwmbl.hn_top_domains_filtered import DOMAINS

 APPLICATION_KEY = os.environ['MWMBL_APPLICATION_KEY']
 KEY_ID = os.environ['MWMBL_KEY_ID']
@@ -28,6 +30,10 @@ DATE_REGEX = re.compile(r'\d{4}-\d{2}-\d{2}')
 PUBLIC_URL_PREFIX = f'https://f004.backblazeb2.com/file/{BUCKET_NAME}/'
 FILE_NAME_SUFFIX = '.json.gz'

+SCORE_FOR_ROOT_PATH = 0.1
+SCORE_FOR_DIFFERENT_DOMAIN = 1.0
+SCORE_FOR_SAME_DOMAIN = 0.01
+

 router = APIRouter(prefix="/crawler", tags=["crawler"])

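Read together, the new constants say that a link to a different domain counts one hundred times as much as a link to another page on the same domain (1.0 vs 0.01), and every link also adds 0.1 to the root URL of the domain it points to. A standalone walk-through of the loop that applies these weights follows the next hunk.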
@@ -124,21 +130,32 @@ def create_historical_batch(batch: HashedBatch):
 def _record_urls_in_database(batch: Union[Batch, HashedBatch], user_id_hash: str, timestamp: datetime):
     with Database() as db:
         url_db = URLDatabase(db.connection)
-        found_urls = set()
+        url_scores = defaultdict(float)
         for item in batch.items:
             if item.content is not None:
-                found_urls |= set(item.content.links)
+                crawled_page_domain = urlparse(item.url).netloc
+                if crawled_page_domain not in DOMAINS:
+                    continue

-        parsed_urls = [urlparse(url) for url in found_urls]
-        domains = {f'{p.scheme}://{p.netloc}' for p in parsed_urls}
-        found_urls |= domains
+                for link in item.content.links:
+                    parsed_link = urlparse(link)
+                    score = SCORE_FOR_SAME_DOMAIN if parsed_link.netloc == crawled_page_domain else SCORE_FOR_DIFFERENT_DOMAIN
+                    url_scores[link] += score
+                    domain = f'{parsed_link.scheme}://{parsed_link.netloc}/'
+                    url_scores[domain] += SCORE_FOR_ROOT_PATH
+
+        found_urls = [FoundURL(url, user_id_hash, score, item.timestamp) for url, score in url_scores.items()]
         if len(found_urls) > 0:
-            url_db.user_found_urls(user_id_hash, list(found_urls), timestamp)
+            url_db.update_found_urls(found_urls)

         crawled_urls = [item.url for item in batch.items]
         url_db.user_crawled_urls(user_id_hash, crawled_urls, timestamp)

+    # TODO:
+    #  - test this code
+    #  - delete existing crawl data for change from INT to FLOAT
+    #  - load some historical data as a starting point


 @router.get('/batches/{date_str}/users/{public_user_id}')
 def get_batches_for_date_and_user(date_str, public_user_id):
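As an aside for review, here is a minimal standalone sketch of how the rewritten loop above weights links. The page URL and links are made up and the DOMAINS filter is skipped; it only illustrates the scoring and is not code from the repository.

from collections import defaultdict
from urllib.parse import urlparse

SCORE_FOR_ROOT_PATH = 0.1
SCORE_FOR_DIFFERENT_DOMAIN = 1.0
SCORE_FOR_SAME_DOMAIN = 0.01

crawled_url = 'https://example.com/post'    # hypothetical crawled page
links = ['https://example.com/about',       # same domain as the crawled page
         'https://other.org/article']       # different domain

url_scores = defaultdict(float)
crawled_page_domain = urlparse(crawled_url).netloc
for link in links:
    parsed_link = urlparse(link)
    same_domain = parsed_link.netloc == crawled_page_domain
    url_scores[link] += SCORE_FOR_SAME_DOMAIN if same_domain else SCORE_FOR_DIFFERENT_DOMAIN
    # each link also nudges the root URL of the domain it points to
    url_scores[f'{parsed_link.scheme}://{parsed_link.netloc}/'] += SCORE_FOR_ROOT_PATH

print(dict(url_scores))
# {'https://example.com/about': 0.01, 'https://example.com/': 0.1,
#  'https://other.org/article': 1.0, 'https://other.org/': 0.1}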
@@ -27,6 +27,14 @@ class URLStatus(Enum):
     CRAWLED = 3  # At least one user has crawled the URL


+@dataclass
+class FoundURL:
+    url: str
+    user_id_hash: str
+    score: float
+    timestamp: datetime
+
+
 class URLDatabase:
     def __init__(self, connection):
         self.connection = connection
@@ -45,7 +53,7 @@ class URLDatabase:
         with self.connection.cursor() as cursor:
             cursor.execute(sql)

-    def user_found_urls(self, user_id_hash: str, urls: list[str], timestamp: datetime):
+    def update_found_urls(self, found_urls: list[FoundURL]):
         sql = f"""
         INSERT INTO urls (url, status, user_id_hash, score, updated) values %s
         ON CONFLICT (url) DO UPDATE SET
@@ -63,14 +71,15 @@ class URLDatabase:
             user_id_hash = CASE
                 WHEN urls.status={URLStatus.ASSIGNED.value} THEN urls.user_id_hash ELSE excluded.user_id_hash
             END,
-            score=urls.score + 1,
-            updated=excluded.updated
+            score=urls.score + excluded.score,
+            updated = CASE
+                WHEN urls.status={URLStatus.ASSIGNED.value} THEN urls.updated ELSE excluded.updated
+            END
         """

-        data = [(url, URLStatus.NEW.value, user_id_hash, 1, timestamp) for url in urls]
+        data = [(found_url.url, URLStatus.NEW.value, found_url.user_id_hash, found_url.score, found_url.timestamp)
+                for found_url in found_urls]

         with self.connection.cursor() as cursor:
             execute_values(cursor, sql, data)
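The behavioural change in update_found_urls is in the upsert: instead of bumping an existing row's score by a fixed 1, the conflict clause now adds the incoming excluded.score, and both user_id_hash and updated are left untouched while the row is ASSIGNED. Below is a small illustration of the row shape handed to psycopg2's execute_values, with made-up values and FoundURL redeclared locally so the snippet stands alone; it is not code from the repository.

from dataclasses import dataclass
from datetime import datetime

@dataclass
class FoundURL:              # redeclared here so the snippet is self-contained
    url: str
    user_id_hash: str
    score: float
    timestamp: datetime

NEW_STATUS = 0               # stands in for URLStatus.NEW.value
found_urls = [FoundURL('https://other.org/article', 'abc123', 1.0, datetime(2022, 7, 1)),
              FoundURL('https://other.org/', 'abc123', 0.1, datetime(2022, 7, 1))]

# One tuple per URL, matching INSERT INTO urls (url, status, user_id_hash, score, updated).
data = [(f.url, NEW_STATUS, f.user_id_hash, f.score, f.timestamp) for f in found_urls]
for row in data:
    print(row)

# If 'https://other.org/' were already stored with score 0.1 and a later batch
# reported it again with score 0.1, the ON CONFLICT clause would keep the row
# and accumulate its score to 0.2.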
@@ -11,7 +11,7 @@ import pandas as pd

 DATA_DIR = Path(os.environ['HOME']) / 'data' / 'tinysearch'
 ALL_DOMAINS_PATH = DATA_DIR / 'hn-top-domains.csv'
-TOP_DOMAINS_PATH = '../tinysearchengine/hn_top_domains_filtered.py'
+TOP_DOMAINS_PATH = '../hn_top_domains_filtered.py'

 MIN_COUNT = 10
 PROBABILITY_THRESHOLD = 0.8
@@ -2,14 +2,10 @@ import re
 from abc import abstractmethod
 from logging import getLogger
-from operator import itemgetter
-from pathlib import Path
-from urllib.parse import urlparse

 from fastapi import FastAPI
-from starlette.middleware.cors import CORSMiddleware

 from mwmbl.tinysearchengine.completer import Completer
-from mwmbl.tinysearchengine.hn_top_domains_filtered import DOMAINS
+from mwmbl.hn_top_domains_filtered import DOMAINS
 from mwmbl.tinysearchengine.indexer import TinyIndex, Document

 logger = getLogger(__name__)