Investigate duplication of URLs in batches

Daoud Clarke 2022-06-26 21:11:51 +01:00
parent eb571fc5fe
commit e27d749e18
10 changed files with 75 additions and 63 deletions

View file

@@ -7,15 +7,23 @@ import json
 from collections import defaultdict, Counter
 from urllib.parse import urlparse
 
-from mwmbl.indexer.paths import CRAWL_GLOB
+from mwmbl.crawler.batch import HashedBatch
+from mwmbl.indexer.paths import CRAWL_GLOB, MWMBL_DATA_DIR
+
+# TODO: remove this line - temporary override
+CRAWL_GLOB = str(MWMBL_DATA_DIR / "b2") + "/*/*/2022-06-23/*/*/*.json.gz"
+
 
 
 def get_urls():
     for path in glob.glob(CRAWL_GLOB):
         data = json.load(gzip.open(path))
-        user = data['user_id_hash']
-        for item in data['items']:
-            yield user, item['url']
+        batch = HashedBatch.parse_obj(data)
+        user = batch.user_id_hash
+        for item in batch.items:
+            if item.content is not None:
+                for url in item.content.links:
+                    yield user, url
 
 
 def analyse_urls(urls):
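Note that the rewritten get_urls() now yields each item's outgoing links rather than the crawled URL itself, so the same link can appear many times across batches. As a minimal sketch of how the yielded (user, url) pairs could be aggregated with the already-imported Counter to surface those duplicates (count_duplicate_urls is a hypothetical helper, not part of this commit, and the body of analyse_urls is not shown in this diff):

from collections import Counter


def count_duplicate_urls(url_pairs):
    # url_pairs is the (user, url) stream produced by get_urls()
    counts = Counter(url for _, url in url_pairs)
    # Keep only links that occur more than once across all batches
    duplicates = {url: count for url, count in counts.items() if count > 1}
    print(f"{len(duplicates)} duplicated URLs out of {len(counts)} distinct links")
    return duplicates


duplicates = count_duplicate_urls(get_urls())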

View file

@@ -0,0 +1,4 @@
+"""
+Analyse recent batches looking for duplicates.
+"""
+

View file

@@ -4,14 +4,14 @@ import json
 import os
 import re
 from datetime import datetime, timezone, timedelta
-from typing import Optional, Union
+from typing import Union
 from uuid import uuid4
 
 import boto3
 import requests
 from fastapi import HTTPException, APIRouter
-from pydantic import BaseModel
 
+from mwmbl.crawler.batch import Batch, NewBatchRequest, HashedBatch
 from mwmbl.crawler.urls import URLDatabase
 from mwmbl.database import Database
@@ -43,40 +43,6 @@ def upload(data: bytes, name: str):
     return result
 
 
-class ItemContent(BaseModel):
-    title: str
-    extract: str
-    links: list[str]
-
-
-class ItemError(BaseModel):
-    name: str
-    message: Optional[str]
-
-
-class Item(BaseModel):
-    url: str
-    status: Optional[int]
-    timestamp: int
-    content: Optional[ItemContent]
-    error: Optional[ItemError]
-
-
-class Batch(BaseModel):
-    user_id: str
-    items: list[Item]
-
-
-class NewBatchRequest(BaseModel):
-    user_id: str
-
-
-class HashedBatch(BaseModel):
-    user_id_hash: str
-    timestamp: int
-    items: list[Item]
-
-
 last_batch = None
@@ -232,7 +198,6 @@ def get_subfolders(prefix):
     items = client.list_objects(Bucket=BUCKET_NAME,
                                 Prefix=prefix,
                                 Delimiter='/')
-    print("Got items", items)
    item_keys = [item['Prefix'][len(prefix):].strip('/') for item in items['CommonPrefixes']]
     return item_keys

mwmbl/crawler/batch.py (new file, 37 additions)
View file

@@ -0,0 +1,37 @@
+from typing import Optional
+
+from pydantic import BaseModel
+
+
+class ItemContent(BaseModel):
+    title: str
+    extract: str
+    links: list[str]
+
+
+class ItemError(BaseModel):
+    name: str
+    message: Optional[str]
+
+
+class Item(BaseModel):
+    url: str
+    status: Optional[int]
+    timestamp: int
+    content: Optional[ItemContent]
+    error: Optional[ItemError]
+
+
+class Batch(BaseModel):
+    user_id: str
+    items: list[Item]
+
+
+class NewBatchRequest(BaseModel):
+    user_id: str
+
+
+class HashedBatch(BaseModel):
+    user_id_hash: str
+    timestamp: int
+    items: list[Item]
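These models are what batches get parsed into elsewhere in this commit via HashedBatch.parse_obj (in the analyse script and in retrieve.py). A minimal usage sketch, assuming a locally downloaded gzipped batch file; the path here is only a placeholder:

import gzip
import json

from mwmbl.crawler.batch import HashedBatch

# Placeholder path for illustration; real batches are gzipped JSON files
data = json.load(gzip.open("batch.json.gz"))

batch = HashedBatch.parse_obj(data)
print(batch.user_id_hash, batch.timestamp, len(batch.items))
for item in batch.items:
    if item.content is not None:
        print(item.url, len(item.content.links))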

View file

@@ -3,7 +3,8 @@ from datetime import date, datetime, timedelta
 import spacy
 
 from mwmbl.crawler.app import get_user_id_hashes_for_date, get_batches_for_date_and_user, get_batch_from_id, \
-    create_historical_batch, HashedBatch, get_batch_url
+    create_historical_batch, get_batch_url
+from mwmbl.crawler.batch import HashedBatch
 from mwmbl.database import Database
 from mwmbl.indexer.indexdb import BatchInfo, BatchStatus, IndexDatabase
 from mwmbl.indexer.index import tokenize_document
@@ -17,13 +18,13 @@ def run():
     for day in range(DAYS):
         date_str = str(date.today() - timedelta(days=day))
         users = get_user_id_hashes_for_date(date_str)
-        print("Users", users)
+        print(f"Got {len(users)} for day {date_str}")
         with Database() as db:
             index_db = IndexDatabase(db.connection)
             index_db.create_tables()
             for user in users:
                 batches = get_batches_for_date_and_user(date_str, user)
-                print("Batches", batches)
+                print("Historical batches for user", user, len(batches))
                 batch_urls = [get_batch_url(batch_id, date_str, user) for batch_id in batches["batch_ids"]]
                 infos = [BatchInfo(url, user, BatchStatus.REMOTE) for url in batch_urls]
                 index_db.record_batches(infos)

View file

@@ -12,7 +12,6 @@ from mwmbl.tinysearchengine.indexer import Document
 class BatchStatus(Enum):
     REMOTE = 0    # The batch only exists in long term storage
     LOCAL = 1     # We have a copy of the batch locally in Postgresql
-    INDEXED = 2   # The batch has been indexed and the local data has been deleted
 
 
 class DocumentStatus(Enum):
@@ -108,6 +107,7 @@ class IndexDatabase:
         data = [(document.url, document.title, document.extract, document.score, DocumentStatus.NEW.value)
                 for document in sorted_documents]
 
+        print("Queueing documents", len(data))
         with self.connection.cursor() as cursor:
             execute_values(cursor, sql, data)
@@ -126,7 +126,6 @@ class IndexDatabase:
         with self.connection.cursor() as cursor:
             cursor.execute(sql)
             results = cursor.fetchall()
-            print("Results", results)
             return [Document(title, url, extract, score) for url, title, extract, score in results]
 
     def queue_documents_for_page(self, urls_and_page_indexes: list[tuple[str, int]]):
@@ -134,7 +133,7 @@ class IndexDatabase:
             INSERT INTO document_pages (url, page) values %s
         """
 
-        print("Queuing", urls_and_page_indexes)
+        print(f"Queuing {len(urls_and_page_indexes)} urls and page indexes")
         with self.connection.cursor() as cursor:
             execute_values(cursor, sql, urls_and_page_indexes)

View file

@@ -27,7 +27,7 @@ def run_preprocessing(index_path):
         with Database() as db:
             index_db = IndexDatabase(db.connection)
             documents = index_db.get_documents_for_preprocessing()
-            print(f"Got {len(documents)} documents")
+            print(f"Got {len(documents)} documents for preprocessing")
             if len(documents) == 0:
                 sleep(10)
         with TinyIndex(Document, index_path, 'w') as indexer:

View file

@@ -9,7 +9,7 @@ from time import sleep
 import requests
 
-from mwmbl.crawler.app import HashedBatch
+from mwmbl.crawler.batch import HashedBatch
 from mwmbl.database import Database
 from mwmbl.indexer.indexdb import IndexDatabase, BatchStatus
 from mwmbl.retry import retry_requests
@@ -26,21 +26,19 @@ def retrieve_batches():
     with Database() as db:
         index_db = IndexDatabase(db.connection)
         batches = index_db.get_batches_by_status(BatchStatus.REMOTE)
-        print("Batches", batches)
+        print(f"Found {len(batches)} remote batches")
         urls = [batch.url for batch in batches]
         pool = ThreadPool(NUM_THREADS)
         results = pool.imap_unordered(retrieve_batch, urls)
         for result in results:
             print("Processed batch with items:", result)
-
-    with Database() as db:
-        index_db = IndexDatabase(db.connection)
-        index_db.update_batch_status(urls, BatchStatus.LOCAL)
+        index_db.update_batch_status(urls, BatchStatus.LOCAL)
 
 
 def retrieve_batch(url):
     data = json.loads(gzip.decompress(retry_requests.get(url).content))
     batch = HashedBatch.parse_obj(data)
+    print(f"Retrieved batch with {len(batch.items)} items")
     queue_batch(batch)
     return len(batch.items)

View file

@@ -18,21 +18,20 @@ def run_update(index_path):
         for i in range(indexer.num_pages):
             with Database() as db:
                 index_db = IndexDatabase(db.connection)
-                pages = index_db.get_queued_documents_for_page(i)
-                if len(pages) > 0:
-                    print("Pages", len(pages))
-                else:
+                documents = index_db.get_queued_documents_for_page(i)
+                print(f"Documents queued for page {i}: {len(documents)}")
+                if len(documents) == 0:
                     continue
 
                 for j in range(3):
                     try:
-                        indexer.add_to_page(i, pages)
+                        indexer.add_to_page(i, documents)
                         break
                     except ValueError:
-                        pages = pages[:len(pages)//2]
-                        if len(pages) == 0:
+                        documents = documents[:len(documents)//2]
+                        if len(documents) == 0:
                             break
-                        print(f"Not enough space, adding {len(pages)}")
+                        print(f"Not enough space, adding {len(documents)}")
                 index_db.clear_queued_documents_for_page(i)

View file

@@ -6,7 +6,7 @@ from multiprocessing import Process
 import uvicorn
 from fastapi import FastAPI
 
-from mwmbl.indexer import historical, retrieve, preprocess
+from mwmbl.indexer import historical, retrieve, preprocess, update_pages
 from mwmbl.crawler.app import router as crawler_router
 from mwmbl.tinysearchengine import search
 from mwmbl.tinysearchengine.completer import Completer
@@ -43,6 +43,7 @@ def run():
     Process(target=historical.run).start()
     Process(target=retrieve.run).start()
     Process(target=preprocess.run, args=(args.index,)).start()
+    Process(target=update_pages.run, args=(args.index,)).start()
 
     completer = Completer()