Split out URL updating from indexing

commit cf253ae524
parent f4fb9f831a
6 changed files with 119 additions and 77 deletions

@@ -6,7 +6,7 @@ from multiprocessing import Queue
 from pathlib import Path
 from time import sleep
 
-from mwmbl.indexer import index_batches, historical
+from mwmbl.indexer import index_batches, historical, update_urls
 from mwmbl.indexer.batch_cache import BatchCache
 from mwmbl.indexer.paths import BATCH_DIR_NAME, INDEX_NAME
 from mwmbl.url_queue import update_url_queue, initialize_url_queue
@@ -28,6 +28,10 @@ def run(data_path: str, url_queue: Queue):
             batch_cache.retrieve_batches(num_batches=10000)
         except Exception:
             logger.exception("Error retrieving batches")
+        try:
+            update_urls.run(batch_cache)
+        except Exception:
+            logger.exception("Error updating URLs")
         try:
             index_batches.run(batch_cache, index_path)
         except Exception:
mwmbl/indexer/index_batches.py
@@ -2,32 +2,25 @@
 Index batches that are stored locally.
 """
 from collections import defaultdict
-from datetime import datetime, timezone, timedelta
 from logging import getLogger
-from typing import Iterable
-from urllib.parse import urlparse
+from typing import Collection, Iterable
 
 import spacy
+from mwmbl.indexer import process_batch
 from spacy import Language
 
 from mwmbl.crawler.batch import HashedBatch, Item
-from mwmbl.crawler.urls import URLDatabase, URLStatus, FoundURL
+from mwmbl.crawler.urls import URLDatabase, URLStatus
 from mwmbl.database import Database
-from mwmbl.hn_top_domains_filtered import DOMAINS
 from mwmbl.indexer.batch_cache import BatchCache
 from mwmbl.indexer.index import tokenize_document
-from mwmbl.indexer.indexdb import BatchStatus, IndexDatabase
-from mwmbl.settings import UNKNOWN_DOMAIN_MULTIPLIER, SCORE_FOR_SAME_DOMAIN, SCORE_FOR_DIFFERENT_DOMAIN, \
-    SCORE_FOR_ROOT_PATH
+from mwmbl.indexer.indexdb import BatchStatus
 from mwmbl.tinysearchengine.indexer import Document, TinyIndex
 
 logger = getLogger(__name__)
 
 
-EXCLUDED_DOMAINS = {'web.archive.org'}
-
-
-def get_documents_from_batches(batches: Iterable[HashedBatch]) -> Iterable[tuple[str, str, str]]:
+def get_documents_from_batches(batches: Collection[HashedBatch]) -> Iterable[tuple[str, str, str]]:
     for batch in batches:
         for item in batch.items:
             if item.content is not None:
@@ -35,28 +28,18 @@ def get_documents_from_batches(batches: Iterable[HashedBatch]) -> Iterable[tuple
 
 
 def run(batch_cache: BatchCache, index_path: str):
-    nlp = spacy.load("en_core_web_sm")
-    with Database() as db:
-        index_db = IndexDatabase(db.connection)
-
-        logger.info("Getting local batches")
-        batches = index_db.get_batches_by_status(BatchStatus.LOCAL, 10000)
-        logger.info(f"Got {len(batches)} batch urls")
-        if len(batches) == 0:
-            return
-
-        batch_data = batch_cache.get_cached([batch.url for batch in batches])
-        logger.info(f"Got {len(batch_data)} cached batches")
-
-        record_urls_in_database(batch_data.values())
-
-        url_db = URLDatabase(db.connection)
-        index_batches(batch_data.values(), index_path, nlp, url_db)
-        logger.info("Indexed pages")
-        index_db.update_batch_status([batch.url for batch in batches], BatchStatus.INDEXED)
+
+    def process(batches: Collection[HashedBatch]):
+        with Database() as db:
+            nlp = spacy.load("en_core_web_sm")
+            url_db = URLDatabase(db.connection)
+            index_batches(batches, index_path, nlp, url_db)
+            logger.info("Indexed pages")
+
+    process_batch.run(batch_cache, BatchStatus.URLS_UPDATED, BatchStatus.INDEXED, process)
 
 
-def index_batches(batch_data: Iterable[HashedBatch], index_path: str, nlp: Language, url_db: URLDatabase):
+def index_batches(batch_data: Collection[HashedBatch], index_path: str, nlp: Language, url_db: URLDatabase):
     document_tuples = list(get_documents_from_batches(batch_data))
     urls = [url for title, url, extract in document_tuples]
     logger.info(f"Got {len(urls)} document tuples")
@@ -108,49 +91,6 @@ def get_url_error_status(item: Item):
     return URLStatus.ERROR_OTHER
 
 
-def record_urls_in_database(batches: Iterable[HashedBatch]):
-    with Database() as db:
-        url_db = URLDatabase(db.connection)
-        url_scores = defaultdict(float)
-        url_users = {}
-        url_timestamps = {}
-        url_statuses = defaultdict(lambda: URLStatus.NEW)
-        for batch in batches:
-            for item in batch.items:
-                timestamp = get_datetime_from_timestamp(item.timestamp / 1000.0)
-                url_timestamps[item.url] = timestamp
-                url_users[item.url] = batch.user_id_hash
-                if item.content is None:
-                    url_statuses[item.url] = get_url_error_status(item)
-                else:
-                    url_statuses[item.url] = URLStatus.CRAWLED
-                    crawled_page_domain = urlparse(item.url).netloc
-                    score_multiplier = 1 if crawled_page_domain in DOMAINS else UNKNOWN_DOMAIN_MULTIPLIER
-                    for link in item.content.links:
-                        if parsed_link.netloc in EXCLUDED_DOMAINS:
-                            continue
-
-                        parsed_link = urlparse(link)
-                        score = SCORE_FOR_SAME_DOMAIN if parsed_link.netloc == crawled_page_domain else SCORE_FOR_DIFFERENT_DOMAIN
-                        url_scores[link] += score * score_multiplier
-                        url_users[link] = batch.user_id_hash
-                        url_timestamps[link] = timestamp
-                        domain = f'{parsed_link.scheme}://{parsed_link.netloc}/'
-                        url_scores[domain] += SCORE_FOR_ROOT_PATH * score_multiplier
-                        url_users[domain] = batch.user_id_hash
-                        url_timestamps[domain] = timestamp
-
-        found_urls = [FoundURL(url, url_users[url], url_scores[url], url_statuses[url], url_timestamps[url])
-                      for url in url_scores.keys() | url_statuses.keys()]
-
-        url_db.update_found_urls(found_urls)
-
-
-def get_datetime_from_timestamp(timestamp: float) -> datetime:
-    batch_datetime = datetime(1970, 1, 1, tzinfo=timezone.utc) + timedelta(seconds=timestamp)
-    return batch_datetime
-
-
 # TODO: clean unicode at some point
 def clean_unicode(s: str) -> str:
     return s.encode('utf-8', 'ignore').decode('utf-8')
mwmbl/indexer/indexdb.py
@@ -8,9 +8,10 @@ from psycopg2.extras import execute_values
 
 
 class BatchStatus(Enum):
     REMOTE = 0         # The batch only exists in long term storage
-    LOCAL = 1          # We have a copy of the batch locally in Postgresql
-    INDEXED = 2
+    LOCAL = 10         # We have a copy of the batch locally in Postgresql
+    URLS_UPDATED = 20  # We've updated URLs from the batch
+    INDEXED = 30       # The batch has been indexed
 
 
 @dataclass
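
The renumbered statuses describe the batch pipeline that the rest of this commit wires up. A minimal sketch of the lifecycle implied by the callers in this diff; the REMOTE -> LOCAL step is presumed to be batch_cache.retrieve_batches (unchanged here), and the LIFECYCLE list is just for illustration:

    # Not part of the commit: the batch lifecycle implied by the new enum values.
    #   REMOTE        -> LOCAL         (batch_cache.retrieve_batches, presumably)
    #   LOCAL         -> URLS_UPDATED  (update_urls.run, added below)
    #   URLS_UPDATED  -> INDEXED       (index_batches.run, rewritten above)
    from mwmbl.indexer.indexdb import BatchStatus

    LIFECYCLE = [BatchStatus.REMOTE, BatchStatus.LOCAL, BatchStatus.URLS_UPDATED, BatchStatus.INDEXED]
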
mwmbl/indexer/process_batch.py (new file)
@@ -0,0 +1,29 @@
+from logging import getLogger
+from typing import Callable, Collection
+
+from mwmbl.crawler.batch import HashedBatch
+from mwmbl.database import Database
+from mwmbl.indexer.batch_cache import BatchCache
+from mwmbl.indexer.indexdb import BatchStatus, IndexDatabase
+
+logger = getLogger(__name__)
+
+
+def run(batch_cache: BatchCache, start_status: BatchStatus, end_status: BatchStatus,
+        process: Callable[[Collection[HashedBatch]], None]):
+
+    with Database() as db:
+        index_db = IndexDatabase(db.connection)
+
+        logger.info(f"Getting batches with status {start_status}")
+        batches = index_db.get_batches_by_status(start_status, 10000)
+        logger.info(f"Got {len(batches)} batch urls")
+        if len(batches) == 0:
+            return
+
+        batch_data = batch_cache.get_cached([batch.url for batch in batches])
+        logger.info(f"Got {len(batch_data)} cached batches")
+
+        process(batch_data.values())
+
+        index_db.update_batch_status([batch.url for batch in batches], end_status)
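
process_batch.run is the shared template for both stages: fetch up to 10000 batches at start_status, hand the cached HashedBatch objects to a callback, then advance those batch URLs to end_status. A minimal usage sketch under those assumptions; the log_batch_sizes callback and run_example_stage wrapper are made up for illustration, and the status pair shown here belongs in the commit to update_urls.run below:

    from typing import Collection

    from mwmbl.crawler.batch import HashedBatch
    from mwmbl.indexer import process_batch
    from mwmbl.indexer.batch_cache import BatchCache
    from mwmbl.indexer.indexdb import BatchStatus


    def log_batch_sizes(batches: Collection[HashedBatch]) -> None:
        # Illustrative stage: just report how many items each cached batch holds.
        for batch in batches:
            print(batch.user_id_hash, len(batch.items))


    def run_example_stage(batch_cache: BatchCache) -> None:
        # Fetch batches at LOCAL, run the callback on their cached contents,
        # then mark those batch URLs URLS_UPDATED.
        process_batch.run(batch_cache, BatchStatus.LOCAL, BatchStatus.URLS_UPDATED, log_batch_sizes)
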
mwmbl/indexer/update_urls.py (new file)
@@ -0,0 +1,67 @@
+from collections import defaultdict
+from datetime import datetime, timezone, timedelta
+from logging import getLogger
+from typing import Iterable, Collection
+from urllib.parse import urlparse
+
+from mwmbl.crawler.batch import HashedBatch
+from mwmbl.crawler.urls import URLDatabase, URLStatus, FoundURL
+from mwmbl.database import Database
+from mwmbl.hn_top_domains_filtered import DOMAINS
+from mwmbl.indexer import process_batch
+from mwmbl.indexer.batch_cache import BatchCache
+from mwmbl.indexer.index_batches import get_url_error_status
+from mwmbl.indexer.indexdb import BatchStatus
+from mwmbl.settings import UNKNOWN_DOMAIN_MULTIPLIER, EXCLUDED_DOMAINS, SCORE_FOR_SAME_DOMAIN, \
+    SCORE_FOR_DIFFERENT_DOMAIN, SCORE_FOR_ROOT_PATH
+
+
+logger = getLogger(__name__)
+
+
+def run(batch_cache: BatchCache):
+    process_batch.run(batch_cache, BatchStatus.LOCAL, BatchStatus.URLS_UPDATED, process=record_urls_in_database)
+
+
+def record_urls_in_database(batches: Collection[HashedBatch]):
+    logger.info(f"Recording URLs in database for {len(batches)} batches")
+    with Database() as db:
+        url_db = URLDatabase(db.connection)
+        url_scores = defaultdict(float)
+        url_users = {}
+        url_timestamps = {}
+        url_statuses = defaultdict(lambda: URLStatus.NEW)
+        for batch in batches:
+            for item in batch.items:
+                timestamp = get_datetime_from_timestamp(item.timestamp / 1000.0)
+                url_timestamps[item.url] = timestamp
+                url_users[item.url] = batch.user_id_hash
+                if item.content is None:
+                    url_statuses[item.url] = get_url_error_status(item)
+                else:
+                    url_statuses[item.url] = URLStatus.CRAWLED
+                    crawled_page_domain = urlparse(item.url).netloc
+                    score_multiplier = 1 if crawled_page_domain in DOMAINS else UNKNOWN_DOMAIN_MULTIPLIER
+                    for link in item.content.links:
+                        parsed_link = urlparse(link)
+                        if parsed_link.netloc in EXCLUDED_DOMAINS:
+                            continue
+
+                        score = SCORE_FOR_SAME_DOMAIN if parsed_link.netloc == crawled_page_domain else SCORE_FOR_DIFFERENT_DOMAIN
+                        url_scores[link] += score * score_multiplier
+                        url_users[link] = batch.user_id_hash
+                        url_timestamps[link] = timestamp
+                        domain = f'{parsed_link.scheme}://{parsed_link.netloc}/'
+                        url_scores[domain] += SCORE_FOR_ROOT_PATH * score_multiplier
+                        url_users[domain] = batch.user_id_hash
+                        url_timestamps[domain] = timestamp
+
+        found_urls = [FoundURL(url, url_users[url], url_scores[url], url_statuses[url], url_timestamps[url])
+                      for url in url_scores.keys() | url_statuses.keys()]
+
+        url_db.update_found_urls(found_urls)
+
+
+def get_datetime_from_timestamp(timestamp: float) -> datetime:
+    batch_datetime = datetime(1970, 1, 1, tzinfo=timezone.utc) + timedelta(seconds=timestamp)
+    return batch_datetime
mwmbl/settings.py
@@ -27,3 +27,4 @@ SCORE_FOR_ROOT_PATH = 0.1
 SCORE_FOR_DIFFERENT_DOMAIN = 1.0
 SCORE_FOR_SAME_DOMAIN = 0.01
 UNKNOWN_DOMAIN_MULTIPLIER = 0.001
+EXCLUDED_DOMAINS = {'web.archive.org'}
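
As a worked example of the link scoring in update_urls.record_urls_in_database, using the constants above (the URLs below are made up): a page crawled on a domain that is not in DOMAINS gets score_multiplier = 0.001, so a link it makes to a different domain adds 1.0 * 0.001 = 0.001 to that link and 0.1 * 0.001 = 0.0001 to the target site's root URL; from a domain in DOMAINS the multiplier is 1, giving 1.0 and 0.1 respectively.

    # Re-derivation of the per-link scores, with the constants from mwmbl/settings.py above.
    from urllib.parse import urlparse

    SCORE_FOR_DIFFERENT_DOMAIN, SCORE_FOR_SAME_DOMAIN = 1.0, 0.01
    SCORE_FOR_ROOT_PATH, UNKNOWN_DOMAIN_MULTIPLIER = 0.1, 0.001

    crawled_url = "https://example-blog.net/post"           # assume its domain is not in DOMAINS
    link = "https://en.wikipedia.org/wiki/Search_engine"    # points at a different domain

    crawled_domain = urlparse(crawled_url).netloc
    parsed_link = urlparse(link)

    multiplier = UNKNOWN_DOMAIN_MULTIPLIER  # would be 1 if crawled_domain were in DOMAINS
    score = SCORE_FOR_SAME_DOMAIN if parsed_link.netloc == crawled_domain else SCORE_FOR_DIFFERENT_DOMAIN

    print(link, score * multiplier)                         # 0.001 added to the link
    print(f"{parsed_link.scheme}://{parsed_link.netloc}/",
          SCORE_FOR_ROOT_PATH * multiplier)                 # 0.0001 added to the root URL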