Investigate duplication of URLs in batches

Daoud Clarke 2022-06-26 21:11:51 +01:00
parent eb571fc5fe
commit e27d749e18
10 changed files with 75 additions and 63 deletions

View file

@@ -7,15 +7,23 @@ import json
from collections import defaultdict, Counter
from urllib.parse import urlparse
from mwmbl.indexer.paths import CRAWL_GLOB
from mwmbl.crawler.batch import HashedBatch
from mwmbl.indexer.paths import CRAWL_GLOB, MWMBL_DATA_DIR
# TODO: remove this line - temporary override
CRAWL_GLOB = str(MWMBL_DATA_DIR / "b2") + "/*/*/2022-06-23/*/*/*.json.gz"
def get_urls():
for path in glob.glob(CRAWL_GLOB):
data = json.load(gzip.open(path))
user = data['user_id_hash']
for item in data['items']:
yield user, item['url']
batch = HashedBatch.parse_obj(data)
user = batch.user_id_hash
for item in batch.items:
if item.content is not None:
for url in item.content.links:
yield user, url
def analyse_urls(urls):

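The hunk context above cuts off at the def analyse_urls(urls): line, so its body is not part of this diff. As a rough, hypothetical sketch only (the real implementation may differ), the Counter and defaultdict already imported could be used to count duplicate links per user from the (user, url) pairs that get_urls() yields:

from collections import Counter, defaultdict


def analyse_urls(urls):
    """Hypothetical sketch: count how often each (user, url) pair appears.

    `urls` is the (user, url) generator produced by get_urls() above;
    the actual analyse_urls in the repository is not shown in this diff.
    """
    counts = Counter(urls)
    duplicates = defaultdict(list)
    for (user, url), count in counts.items():
        if count > 1:
            duplicates[user].append((url, count))
    for user, dup_urls in duplicates.items():
        print(f"User {user} has {len(dup_urls)} duplicated URLs")
    return duplicates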
View file

@@ -0,0 +1,4 @@
"""
Analyse recent batches looking for duplicates.
"""

View file

@@ -4,14 +4,14 @@ import json
import os
import re
from datetime import datetime, timezone, timedelta
from typing import Optional, Union
from typing import Union
from uuid import uuid4
import boto3
import requests
from fastapi import HTTPException, APIRouter
from pydantic import BaseModel
from mwmbl.crawler.batch import Batch, NewBatchRequest, HashedBatch
from mwmbl.crawler.urls import URLDatabase
from mwmbl.database import Database
@@ -43,40 +43,6 @@ def upload(data: bytes, name: str):
return result
class ItemContent(BaseModel):
title: str
extract: str
links: list[str]
class ItemError(BaseModel):
name: str
message: Optional[str]
class Item(BaseModel):
url: str
status: Optional[int]
timestamp: int
content: Optional[ItemContent]
error: Optional[ItemError]
class Batch(BaseModel):
user_id: str
items: list[Item]
class NewBatchRequest(BaseModel):
user_id: str
class HashedBatch(BaseModel):
user_id_hash: str
timestamp: int
items: list[Item]
last_batch = None
@@ -232,7 +198,6 @@ def get_subfolders(prefix):
items = client.list_objects(Bucket=BUCKET_NAME,
Prefix=prefix,
Delimiter='/')
print("Got items", items)
item_keys = [item['Prefix'][len(prefix):].strip('/') for item in items['CommonPrefixes']]
return item_keys

mwmbl/crawler/batch.py Normal file (37 additions)
View file

@@ -0,0 +1,37 @@
from typing import Optional
from pydantic import BaseModel
class ItemContent(BaseModel):
title: str
extract: str
links: list[str]
class ItemError(BaseModel):
name: str
message: Optional[str]
class Item(BaseModel):
url: str
status: Optional[int]
timestamp: int
content: Optional[ItemContent]
error: Optional[ItemError]
class Batch(BaseModel):
user_id: str
items: list[Item]
class NewBatchRequest(BaseModel):
user_id: str
class HashedBatch(BaseModel):
user_id_hash: str
timestamp: int
items: list[Item]

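These pydantic models now live in mwmbl/crawler/batch.py so that both the crawler app and the indexer modules can import them. A minimal usage sketch, mirroring the parse_obj calls elsewhere in this diff (the file path and data here are made up for illustration):

import gzip
import json

from mwmbl.crawler.batch import HashedBatch

# Hypothetical local path; stored batches are gzipped JSON documents.
with gzip.open("example-batch.json.gz") as f:
    data = json.load(f)

batch = HashedBatch.parse_obj(data)   # pydantic validates the raw dict
print(batch.user_id_hash, len(batch.items))
for item in batch.items:
    if item.content is not None:      # content is Optional[ItemContent]
        print(item.url, len(item.content.links))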
View file

@@ -3,7 +3,8 @@ from datetime import date, datetime, timedelta
import spacy
from mwmbl.crawler.app import get_user_id_hashes_for_date, get_batches_for_date_and_user, get_batch_from_id, \
create_historical_batch, HashedBatch, get_batch_url
create_historical_batch, get_batch_url
from mwmbl.crawler.batch import HashedBatch
from mwmbl.database import Database
from mwmbl.indexer.indexdb import BatchInfo, BatchStatus, IndexDatabase
from mwmbl.indexer.index import tokenize_document
@@ -17,13 +18,13 @@ def run():
for day in range(DAYS):
date_str = str(date.today() - timedelta(days=day))
users = get_user_id_hashes_for_date(date_str)
print("Users", users)
print(f"Got {len(users)} for day {date_str}")
with Database() as db:
index_db = IndexDatabase(db.connection)
index_db.create_tables()
for user in users:
batches = get_batches_for_date_and_user(date_str, user)
print("Batches", batches)
print("Historical batches for user", user, len(batches))
batch_urls = [get_batch_url(batch_id, date_str, user) for batch_id in batches["batch_ids"]]
infos = [BatchInfo(url, user, BatchStatus.REMOTE) for url in batch_urls]
index_db.record_batches(infos)

View file

@@ -12,7 +12,6 @@ from mwmbl.tinysearchengine.indexer import Document
class BatchStatus(Enum):
REMOTE = 0 # The batch only exists in long term storage
LOCAL = 1 # We have a copy of the batch locally in Postgresql
INDEXED = 2 # The batch has been indexed and the local data has been deleted
class DocumentStatus(Enum):
@@ -108,6 +107,7 @@ class IndexDatabase:
data = [(document.url, document.title, document.extract, document.score, DocumentStatus.NEW.value)
for document in sorted_documents]
print("Queueing documents", len(data))
with self.connection.cursor() as cursor:
execute_values(cursor, sql, data)
@@ -126,7 +126,6 @@ class IndexDatabase:
with self.connection.cursor() as cursor:
cursor.execute(sql)
results = cursor.fetchall()
print("Results", results)
return [Document(title, url, extract, score) for url, title, extract, score in results]
def queue_documents_for_page(self, urls_and_page_indexes: list[tuple[str, int]]):
@@ -134,7 +133,7 @@ class IndexDatabase:
INSERT INTO document_pages (url, page) values %s
"""
print("Queuing", urls_and_page_indexes)
print(f"Queuing {len(urls_and_page_indexes)} urls and page indexes")
with self.connection.cursor() as cursor:
execute_values(cursor, sql, urls_and_page_indexes)

View file

@@ -27,7 +27,7 @@ def run_preprocessing(index_path):
with Database() as db:
index_db = IndexDatabase(db.connection)
documents = index_db.get_documents_for_preprocessing()
print(f"Got {len(documents)} documents")
print(f"Got {len(documents)} documents for preprocessing")
if len(documents) == 0:
sleep(10)
with TinyIndex(Document, index_path, 'w') as indexer:

View file

@@ -9,7 +9,7 @@ from time import sleep
import requests
from mwmbl.crawler.app import HashedBatch
from mwmbl.crawler.batch import HashedBatch
from mwmbl.database import Database
from mwmbl.indexer.indexdb import IndexDatabase, BatchStatus
from mwmbl.retry import retry_requests
@@ -26,21 +26,19 @@ def retrieve_batches():
with Database() as db:
index_db = IndexDatabase(db.connection)
batches = index_db.get_batches_by_status(BatchStatus.REMOTE)
print("Batches", batches)
print(f"Found {len(batches)} remote batches")
urls = [batch.url for batch in batches]
pool = ThreadPool(NUM_THREADS)
results = pool.imap_unordered(retrieve_batch, urls)
for result in results:
print("Processed batch with items:", result)
with Database() as db:
index_db = IndexDatabase(db.connection)
index_db.update_batch_status(urls, BatchStatus.LOCAL)
index_db.update_batch_status(urls, BatchStatus.LOCAL)
def retrieve_batch(url):
data = json.loads(gzip.decompress(retry_requests.get(url).content))
batch = HashedBatch.parse_obj(data)
print(f"Retrieved batch with {len(batch.items)} items")
queue_batch(batch)
return len(batch.items)

View file

@@ -18,21 +18,20 @@ def run_update(index_path):
for i in range(indexer.num_pages):
with Database() as db:
index_db = IndexDatabase(db.connection)
pages = index_db.get_queued_documents_for_page(i)
if len(pages) > 0:
print("Pages", len(pages))
else:
documents = index_db.get_queued_documents_for_page(i)
print(f"Documents queued for page {i}: {len(documents)}")
if len(documents) == 0:
continue
for j in range(3):
try:
indexer.add_to_page(i, pages)
indexer.add_to_page(i, documents)
break
except ValueError:
pages = pages[:len(pages)//2]
if len(pages) == 0:
documents = documents[:len(documents)//2]
if len(documents) == 0:
break
print(f"Not enough space, adding {len(pages)}")
print(f"Not enough space, adding {len(documents)}")
index_db.clear_queued_documents_for_page(i)
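For readers skimming the diff: the loop above retries indexer.add_to_page up to three times, halving the queued documents whenever a ValueError signals that they do not fit on the fixed-size page. A standalone sketch of that pattern (the helper name and max_attempts parameter are illustrative, not part of the codebase):

def add_with_halving(indexer, page_index, documents, max_attempts=3):
    """Illustrative helper: keep halving the batch until it fits on the page.

    Assumes, as in the hunk above, that indexer.add_to_page raises ValueError
    when the documents do not fit in the page.
    """
    for _ in range(max_attempts):
        try:
            indexer.add_to_page(page_index, documents)
            return documents          # the documents that actually fit
        except ValueError:
            documents = documents[:len(documents) // 2]
            if not documents:
                break
    return []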

View file

@@ -6,7 +6,7 @@ from multiprocessing import Process
import uvicorn
from fastapi import FastAPI
from mwmbl.indexer import historical, retrieve, preprocess
from mwmbl.indexer import historical, retrieve, preprocess, update_pages
from mwmbl.crawler.app import router as crawler_router
from mwmbl.tinysearchengine import search
from mwmbl.tinysearchengine.completer import Completer
@@ -43,6 +43,7 @@ def run():
Process(target=historical.run).start()
Process(target=retrieve.run).start()
Process(target=preprocess.run, args=(args.index,)).start()
Process(target=update_pages.run, args=(args.index,)).start()
completer = Completer()