Investigate duplication of URLs in batches

commit e27d749e18 (parent eb571fc5fe)
10 changed files with 75 additions and 63 deletions
@@ -7,15 +7,23 @@ import json
 from collections import defaultdict, Counter
 from urllib.parse import urlparse

-from mwmbl.indexer.paths import CRAWL_GLOB
+from mwmbl.crawler.batch import HashedBatch
+from mwmbl.indexer.paths import CRAWL_GLOB, MWMBL_DATA_DIR


+# TODO: remove this line - temporary override
+CRAWL_GLOB = str(MWMBL_DATA_DIR / "b2") + "/*/*/2022-06-23/*/*/*.json.gz"
+
+
 def get_urls():
     for path in glob.glob(CRAWL_GLOB):
         data = json.load(gzip.open(path))
-        user = data['user_id_hash']
-        for item in data['items']:
-            yield user, item['url']
+        batch = HashedBatch.parse_obj(data)
+        user = batch.user_id_hash
+        for item in batch.items:
+            if item.content is not None:
+                for url in item.content.links:
+                    yield user, url


 def analyse_urls(urls):
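(The body of analyse_urls is not part of this hunk. Purely as an illustration, and not the code in this commit, duplicate links could be tallied with the Counter and defaultdict imports seen above, roughly as in the sketch below; calling analyse_urls(get_urls()) would then print the most frequently repeated links.)

def analyse_urls(urls):
    # Illustrative sketch only: count how often each URL appears and how many
    # distinct users submitted it, then report the biggest duplicates.
    url_counts = Counter()
    users_per_url = defaultdict(set)
    for user, url in urls:
        url_counts[url] += 1
        users_per_url[url].add(user)
    for url, count in url_counts.most_common(20):
        print(f"{count} occurrences from {len(users_per_url[url])} users: {url}")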
4  analyse/recent_batches.py  Normal file
@@ -0,0 +1,4 @@
+"""
+Analyse recent batches looking for duplicates.
+"""
+
@@ -4,14 +4,14 @@ import json
 import os
 import re
 from datetime import datetime, timezone, timedelta
-from typing import Optional, Union
+from typing import Union
 from uuid import uuid4

 import boto3
 import requests
 from fastapi import HTTPException, APIRouter
-from pydantic import BaseModel

+from mwmbl.crawler.batch import Batch, NewBatchRequest, HashedBatch
 from mwmbl.crawler.urls import URLDatabase
 from mwmbl.database import Database

@@ -43,40 +43,6 @@ def upload(data: bytes, name: str):
     return result


-class ItemContent(BaseModel):
-    title: str
-    extract: str
-    links: list[str]
-
-
-class ItemError(BaseModel):
-    name: str
-    message: Optional[str]
-
-
-class Item(BaseModel):
-    url: str
-    status: Optional[int]
-    timestamp: int
-    content: Optional[ItemContent]
-    error: Optional[ItemError]
-
-
-class Batch(BaseModel):
-    user_id: str
-    items: list[Item]
-
-
-class NewBatchRequest(BaseModel):
-    user_id: str
-
-
-class HashedBatch(BaseModel):
-    user_id_hash: str
-    timestamp: int
-    items: list[Item]
-
-
 last_batch = None


@@ -232,7 +198,6 @@ def get_subfolders(prefix):
     items = client.list_objects(Bucket=BUCKET_NAME,
                                 Prefix=prefix,
                                 Delimiter='/')
-    print("Got items", items)
     item_keys = [item['Prefix'][len(prefix):].strip('/') for item in items['CommonPrefixes']]
     return item_keys

37  mwmbl/crawler/batch.py  Normal file
@@ -0,0 +1,37 @@
+from typing import Optional
+
+from pydantic import BaseModel
+
+
+class ItemContent(BaseModel):
+    title: str
+    extract: str
+    links: list[str]
+
+
+class ItemError(BaseModel):
+    name: str
+    message: Optional[str]
+
+
+class Item(BaseModel):
+    url: str
+    status: Optional[int]
+    timestamp: int
+    content: Optional[ItemContent]
+    error: Optional[ItemError]
+
+
+class Batch(BaseModel):
+    user_id: str
+    items: list[Item]
+
+
+class NewBatchRequest(BaseModel):
+    user_id: str
+
+
+class HashedBatch(BaseModel):
+    user_id_hash: str
+    timestamp: int
+    items: list[Item]
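(For illustration only: a batch in the shape these models describe can be validated with pydantic's parse_obj, as the crawler code elsewhere in this commit does. The sample payload below is invented for the example.)

from mwmbl.crawler.batch import HashedBatch

sample = {
    "user_id_hash": "abc123",          # invented example values
    "timestamp": 1655942400,
    "items": [{
        "url": "https://example.com/",
        "status": 200,
        "timestamp": 1655942400,
        "content": {
            "title": "Example",
            "extract": "An example page",
            "links": ["https://example.com/a", "https://example.com/b"],
        },
        "error": None,
    }],
}

batch = HashedBatch.parse_obj(sample)  # raises pydantic.ValidationError on bad input
links = [url for item in batch.items if item.content is not None
         for url in item.content.links]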
@@ -3,7 +3,8 @@ from datetime import date, datetime, timedelta
 import spacy

 from mwmbl.crawler.app import get_user_id_hashes_for_date, get_batches_for_date_and_user, get_batch_from_id, \
-    create_historical_batch, HashedBatch, get_batch_url
+    create_historical_batch, get_batch_url
+from mwmbl.crawler.batch import HashedBatch
 from mwmbl.database import Database
 from mwmbl.indexer.indexdb import BatchInfo, BatchStatus, IndexDatabase
 from mwmbl.indexer.index import tokenize_document
@@ -17,13 +18,13 @@ def run():
     for day in range(DAYS):
         date_str = str(date.today() - timedelta(days=day))
         users = get_user_id_hashes_for_date(date_str)
-        print("Users", users)
+        print(f"Got {len(users)} for day {date_str}")
         with Database() as db:
             index_db = IndexDatabase(db.connection)
             index_db.create_tables()
             for user in users:
                 batches = get_batches_for_date_and_user(date_str, user)
-                print("Batches", batches)
+                print("Historical batches for user", user, len(batches))
                 batch_urls = [get_batch_url(batch_id, date_str, user) for batch_id in batches["batch_ids"]]
                 infos = [BatchInfo(url, user, BatchStatus.REMOTE) for url in batch_urls]
                 index_db.record_batches(infos)

@@ -12,7 +12,6 @@ from mwmbl.tinysearchengine.indexer import Document
 class BatchStatus(Enum):
     REMOTE = 0   # The batch only exists in long term storage
     LOCAL = 1    # We have a copy of the batch locally in Postgresql
-    INDEXED = 2  # The batch has been indexed and the local data has been deleted


 class DocumentStatus(Enum):
@@ -108,6 +107,7 @@ class IndexDatabase:
         data = [(document.url, document.title, document.extract, document.score, DocumentStatus.NEW.value)
                 for document in sorted_documents]

+        print("Queueing documents", len(data))
         with self.connection.cursor() as cursor:
             execute_values(cursor, sql, data)

@@ -126,7 +126,6 @@ class IndexDatabase:
         with self.connection.cursor() as cursor:
             cursor.execute(sql)
             results = cursor.fetchall()
-            print("Results", results)
             return [Document(title, url, extract, score) for url, title, extract, score in results]

     def queue_documents_for_page(self, urls_and_page_indexes: list[tuple[str, int]]):
@@ -134,7 +133,7 @@ class IndexDatabase:
         INSERT INTO document_pages (url, page) values %s
         """

-        print("Queuing", urls_and_page_indexes)
+        print(f"Queuing {len(urls_and_page_indexes)} urls and page indexes")
         with self.connection.cursor() as cursor:
             execute_values(cursor, sql, urls_and_page_indexes)

@@ -27,7 +27,7 @@ def run_preprocessing(index_path):
     with Database() as db:
         index_db = IndexDatabase(db.connection)
         documents = index_db.get_documents_for_preprocessing()
-        print(f"Got {len(documents)} documents")
+        print(f"Got {len(documents)} documents for preprocessing")
         if len(documents) == 0:
             sleep(10)
         with TinyIndex(Document, index_path, 'w') as indexer:
@@ -9,7 +9,7 @@ from time import sleep

 import requests

-from mwmbl.crawler.app import HashedBatch
+from mwmbl.crawler.batch import HashedBatch
 from mwmbl.database import Database
 from mwmbl.indexer.indexdb import IndexDatabase, BatchStatus
 from mwmbl.retry import retry_requests
@@ -26,21 +26,19 @@ def retrieve_batches():
    with Database() as db:
        index_db = IndexDatabase(db.connection)
        batches = index_db.get_batches_by_status(BatchStatus.REMOTE)
        print("Batches", batches)
        print(f"Found {len(batches)} remote batches")
    urls = [batch.url for batch in batches]
    pool = ThreadPool(NUM_THREADS)
    results = pool.imap_unordered(retrieve_batch, urls)
    for result in results:
        print("Processed batch with items:", result)

    with Database() as db:
        index_db = IndexDatabase(db.connection)
        index_db.update_batch_status(urls, BatchStatus.LOCAL)


def retrieve_batch(url):
    data = json.loads(gzip.decompress(retry_requests.get(url).content))
    batch = HashedBatch.parse_obj(data)
    print(f"Retrieved batch with {len(batch.items)} items")
    queue_batch(batch)
    return len(batch.items)

@@ -18,21 +18,20 @@ def run_update(index_path):
     for i in range(indexer.num_pages):
         with Database() as db:
             index_db = IndexDatabase(db.connection)
-            pages = index_db.get_queued_documents_for_page(i)
-            if len(pages) > 0:
-                print("Pages", len(pages))
-            else:
+            documents = index_db.get_queued_documents_for_page(i)
+            print(f"Documents queued for page {i}: {len(documents)}")
+            if len(documents) == 0:
                 continue

             for j in range(3):
                 try:
-                    indexer.add_to_page(i, pages)
+                    indexer.add_to_page(i, documents)
                     break
                 except ValueError:
-                    pages = pages[:len(pages)//2]
-                    if len(pages) == 0:
+                    documents = documents[:len(documents)//2]
+                    if len(documents) == 0:
                         break
-                    print(f"Not enough space, adding {len(pages)}")
+                    print(f"Not enough space, adding {len(documents)}")
             index_db.clear_queued_documents_for_page(i)


@@ -6,7 +6,7 @@ from multiprocessing import Process
 import uvicorn
 from fastapi import FastAPI

-from mwmbl.indexer import historical, retrieve, preprocess
+from mwmbl.indexer import historical, retrieve, preprocess, update_pages
 from mwmbl.crawler.app import router as crawler_router
 from mwmbl.tinysearchengine import search
 from mwmbl.tinysearchengine.completer import Completer
@@ -43,6 +43,7 @@ def run():
     Process(target=historical.run).start()
     Process(target=retrieve.run).start()
     Process(target=preprocess.run, args=(args.index,)).start()
+    Process(target=update_pages.run, args=(args.index,)).start()

     completer = Completer()
