Investigate duplication of URLs in batches
This commit is contained in:
parent eb571fc5fe
commit e27d749e18
10 changed files with 75 additions and 63 deletions
@@ -7,15 +7,23 @@ import json
 from collections import defaultdict, Counter
 from urllib.parse import urlparse
 
-from mwmbl.indexer.paths import CRAWL_GLOB
+from mwmbl.crawler.batch import HashedBatch
+from mwmbl.indexer.paths import CRAWL_GLOB, MWMBL_DATA_DIR
 
+# TODO: remove this line - temporary override
+CRAWL_GLOB = str(MWMBL_DATA_DIR / "b2") + "/*/*/2022-06-23/*/*/*.json.gz"
+
 
 def get_urls():
     for path in glob.glob(CRAWL_GLOB):
         data = json.load(gzip.open(path))
-        user = data['user_id_hash']
-        for item in data['items']:
-            yield user, item['url']
+        batch = HashedBatch.parse_obj(data)
+        user = batch.user_id_hash
+        for item in batch.items:
+            if item.content is not None:
+                for url in item.content.links:
+                    yield user, url
 
 
 def analyse_urls(urls):
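The body of analyse_urls is outside this hunk. As a hedged sketch only, reusing the Counter and urlparse imports already in this module, a duplicate count over the (user, url) pairs produced by get_urls might look like the helper below; the helper name, the threshold and the per-domain summary are illustrative assumptions, not code from the repository.

def count_duplicate_urls(urls):
    # Hypothetical helper, not the analyse_urls body from the repository:
    # count how often each crawled link appears across all batches.
    url_counts = Counter(url for _user, url in urls)
    domain_counts = Counter(urlparse(url).netloc for url in url_counts)
    duplicates = [(url, n) for url, n in url_counts.most_common() if n > 1]
    print(f"{len(duplicates)} URLs appear more than once")
    print("Most duplicated URLs:", duplicates[:10])
    print("Top domains among distinct URLs:", domain_counts.most_common(10))

# Example use: count_duplicate_urls(get_urls())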
analyse/recent_batches.py (new file, 4 lines)
@@ -0,0 +1,4 @@
+"""
+Analyse recent batches looking for duplicates.
+"""
+
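analyse/recent_batches.py is only a docstring at this point. A speculative sketch of how it might be filled in, reusing functions that appear elsewhere in this commit (get_user_id_hashes_for_date, get_batches_for_date_and_user and get_batch_url from the historical.py hunk below, plus the gzip/retry_requests download pattern from retrieve.py); the module layout and the Counter-based report are assumptions.

import gzip
import json
from collections import Counter
from datetime import date

from mwmbl.crawler.app import get_user_id_hashes_for_date, get_batches_for_date_and_user, get_batch_url
from mwmbl.crawler.batch import HashedBatch
from mwmbl.retry import retry_requests


def run():
    # Hypothetical sketch: walk today's batches and count repeated item URLs.
    url_counts = Counter()
    date_str = str(date.today())
    for user in get_user_id_hashes_for_date(date_str):
        batches = get_batches_for_date_and_user(date_str, user)
        for batch_id in batches["batch_ids"]:
            url = get_batch_url(batch_id, date_str, user)
            data = json.loads(gzip.decompress(retry_requests.get(url).content))
            batch = HashedBatch.parse_obj(data)
            url_counts.update(item.url for item in batch.items)
    print("Duplicated URLs:", [(u, n) for u, n in url_counts.most_common(20) if n > 1])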
@@ -4,14 +4,14 @@ import json
 import os
 import re
 from datetime import datetime, timezone, timedelta
-from typing import Optional, Union
+from typing import Union
 from uuid import uuid4
 
 import boto3
 import requests
 from fastapi import HTTPException, APIRouter
-from pydantic import BaseModel
 
 
+from mwmbl.crawler.batch import Batch, NewBatchRequest, HashedBatch
 from mwmbl.crawler.urls import URLDatabase
 from mwmbl.database import Database
@@ -43,40 +43,6 @@ def upload(data: bytes, name: str):
     return result
 
 
-class ItemContent(BaseModel):
-    title: str
-    extract: str
-    links: list[str]
-
-
-class ItemError(BaseModel):
-    name: str
-    message: Optional[str]
-
-
-class Item(BaseModel):
-    url: str
-    status: Optional[int]
-    timestamp: int
-    content: Optional[ItemContent]
-    error: Optional[ItemError]
-
-
-class Batch(BaseModel):
-    user_id: str
-    items: list[Item]
-
-
-class NewBatchRequest(BaseModel):
-    user_id: str
-
-
-class HashedBatch(BaseModel):
-    user_id_hash: str
-    timestamp: int
-    items: list[Item]
-
-
 last_batch = None
 
 
@@ -232,7 +198,6 @@ def get_subfolders(prefix):
     items = client.list_objects(Bucket=BUCKET_NAME,
                                 Prefix=prefix,
                                 Delimiter='/')
-    print("Got items", items)
     item_keys = [item['Prefix'][len(prefix):].strip('/') for item in items['CommonPrefixes']]
     return item_keys
 
mwmbl/crawler/batch.py (new file, 37 lines)
@@ -0,0 +1,37 @@
+from typing import Optional
+
+from pydantic import BaseModel
+
+
+class ItemContent(BaseModel):
+    title: str
+    extract: str
+    links: list[str]
+
+
+class ItemError(BaseModel):
+    name: str
+    message: Optional[str]
+
+
+class Item(BaseModel):
+    url: str
+    status: Optional[int]
+    timestamp: int
+    content: Optional[ItemContent]
+    error: Optional[ItemError]
+
+
+class Batch(BaseModel):
+    user_id: str
+    items: list[Item]
+
+
+class NewBatchRequest(BaseModel):
+    user_id: str
+
+
+class HashedBatch(BaseModel):
+    user_id_hash: str
+    timestamp: int
+    items: list[Item]
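For reference, a minimal usage sketch of the new shared models, mirroring how they are parsed elsewhere in this commit (retrieve.py below decompresses the same structure from a downloaded gzip body; loading from a local *.json.gz path, as the analysis script above does, is an assumption here).

import gzip
import json

from mwmbl.crawler.batch import HashedBatch


def load_hashed_batch(path: str) -> HashedBatch:
    # Crawl batches are stored as gzipped JSON; parse_obj validates the data
    # against the shared pydantic model (pydantic v1 style, as used in this commit).
    data = json.load(gzip.open(path))
    return HashedBatch.parse_obj(data)


# batch = load_hashed_batch("batch.json.gz")   # hypothetical file name
# links = [url for item in batch.items if item.content is not None for url in item.content.links]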
@@ -3,7 +3,8 @@ from datetime import date, datetime, timedelta
 import spacy
 
 from mwmbl.crawler.app import get_user_id_hashes_for_date, get_batches_for_date_and_user, get_batch_from_id, \
-    create_historical_batch, HashedBatch, get_batch_url
+    create_historical_batch, get_batch_url
+from mwmbl.crawler.batch import HashedBatch
 from mwmbl.database import Database
 from mwmbl.indexer.indexdb import BatchInfo, BatchStatus, IndexDatabase
 from mwmbl.indexer.index import tokenize_document
@@ -17,13 +18,13 @@ def run():
     for day in range(DAYS):
         date_str = str(date.today() - timedelta(days=day))
         users = get_user_id_hashes_for_date(date_str)
-        print("Users", users)
+        print(f"Got {len(users)} for day {date_str}")
         with Database() as db:
             index_db = IndexDatabase(db.connection)
             index_db.create_tables()
             for user in users:
                 batches = get_batches_for_date_and_user(date_str, user)
-                print("Batches", batches)
+                print("Historical batches for user", user, len(batches))
                 batch_urls = [get_batch_url(batch_id, date_str, user) for batch_id in batches["batch_ids"]]
                 infos = [BatchInfo(url, user, BatchStatus.REMOTE) for url in batch_urls]
                 index_db.record_batches(infos)
@@ -12,7 +12,6 @@ from mwmbl.tinysearchengine.indexer import Document
 class BatchStatus(Enum):
     REMOTE = 0    # The batch only exists in long term storage
     LOCAL = 1     # We have a copy of the batch locally in Postgresql
-    INDEXED = 2   # The batch has been indexed and the local data has been deleted
 
 
 class DocumentStatus(Enum):
@@ -108,6 +107,7 @@ class IndexDatabase:
         data = [(document.url, document.title, document.extract, document.score, DocumentStatus.NEW.value)
                 for document in sorted_documents]
 
+        print("Queueing documents", len(data))
         with self.connection.cursor() as cursor:
             execute_values(cursor, sql, data)
 
@@ -126,7 +126,6 @@ class IndexDatabase:
         with self.connection.cursor() as cursor:
             cursor.execute(sql)
             results = cursor.fetchall()
-            print("Results", results)
             return [Document(title, url, extract, score) for url, title, extract, score in results]
 
     def queue_documents_for_page(self, urls_and_page_indexes: list[tuple[str, int]]):
@@ -134,7 +133,7 @@ class IndexDatabase:
            INSERT INTO document_pages (url, page) values %s
         """
 
-        print("Queuing", urls_and_page_indexes)
+        print(f"Queuing {len(urls_and_page_indexes)} urls and page indexes")
         with self.connection.cursor() as cursor:
             execute_values(cursor, sql, urls_and_page_indexes)
 
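execute_values comes from psycopg2.extras; it expands the single %s placeholder into a multi-row VALUES list, so all queued (url, page) pairs are inserted in one statement. A standalone illustration under stated assumptions (the DSN is a placeholder; inside IndexDatabase the cursor comes from self.connection):

import psycopg2
from psycopg2.extras import execute_values

connection = psycopg2.connect("dbname=mwmbl")  # hypothetical DSN for illustration
with connection.cursor() as cursor:
    # One round trip inserts every row in the list.
    execute_values(
        cursor,
        "INSERT INTO document_pages (url, page) values %s",
        [("https://example.com/a", 3), ("https://example.com/b", 7)],
    )
connection.commit()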
@@ -27,7 +27,7 @@ def run_preprocessing(index_path):
         with Database() as db:
             index_db = IndexDatabase(db.connection)
             documents = index_db.get_documents_for_preprocessing()
-            print(f"Got {len(documents)} documents")
+            print(f"Got {len(documents)} documents for preprocessing")
             if len(documents) == 0:
                 sleep(10)
             with TinyIndex(Document, index_path, 'w') as indexer:
@@ -9,7 +9,7 @@ from time import sleep
 
 import requests
 
-from mwmbl.crawler.app import HashedBatch
+from mwmbl.crawler.batch import HashedBatch
 from mwmbl.database import Database
 from mwmbl.indexer.indexdb import IndexDatabase, BatchStatus
 from mwmbl.retry import retry_requests
@@ -26,21 +26,19 @@ def retrieve_batches():
     with Database() as db:
         index_db = IndexDatabase(db.connection)
         batches = index_db.get_batches_by_status(BatchStatus.REMOTE)
-        print("Batches", batches)
+        print(f"Found {len(batches)} remote batches")
         urls = [batch.url for batch in batches]
         pool = ThreadPool(NUM_THREADS)
         results = pool.imap_unordered(retrieve_batch, urls)
         for result in results:
             print("Processed batch with items:", result)
-
-    with Database() as db:
-        index_db = IndexDatabase(db.connection)
         index_db.update_batch_status(urls, BatchStatus.LOCAL)
 
 
 def retrieve_batch(url):
     data = json.loads(gzip.decompress(retry_requests.get(url).content))
     batch = HashedBatch.parse_obj(data)
+    print(f"Retrieved batch with {len(batch.items)} items")
     queue_batch(batch)
     return len(batch.items)
 
@@ -18,21 +18,20 @@ def run_update(index_path):
         for i in range(indexer.num_pages):
             with Database() as db:
                 index_db = IndexDatabase(db.connection)
-                pages = index_db.get_queued_documents_for_page(i)
-                if len(pages) > 0:
-                    print("Pages", len(pages))
-                else:
+                documents = index_db.get_queued_documents_for_page(i)
+                print(f"Documents queued for page {i}: {len(documents)}")
+                if len(documents) == 0:
                     continue
 
                 for j in range(3):
                     try:
-                        indexer.add_to_page(i, pages)
+                        indexer.add_to_page(i, documents)
                         break
                     except ValueError:
-                        pages = pages[:len(pages)//2]
-                        if len(pages) == 0:
+                        documents = documents[:len(documents)//2]
+                        if len(documents) == 0:
                             break
-                        print(f"Not enough space, adding {len(pages)}")
+                        print(f"Not enough space, adding {len(documents)}")
                 index_db.clear_queued_documents_for_page(i)
 
 
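The retry loop above halves the queued documents whenever indexer.add_to_page raises ValueError, which appears to signal that the page is full (an inference, not stated in the diff). The same back-off pattern in isolation, with a stand-in add function:

def add_with_halving(add_to_page, page_index, documents, max_attempts=3):
    # Generic sketch of the pattern used in run_update: on failure, retry with
    # half as many documents until they fit or nothing is left to add.
    for _ in range(max_attempts):
        try:
            add_to_page(page_index, documents)
            return len(documents)
        except ValueError:
            documents = documents[:len(documents) // 2]
            if not documents:
                return 0
    return 0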
@@ -6,7 +6,7 @@ from multiprocessing import Process
 import uvicorn
 from fastapi import FastAPI
 
-from mwmbl.indexer import historical, retrieve, preprocess
+from mwmbl.indexer import historical, retrieve, preprocess, update_pages
 from mwmbl.crawler.app import router as crawler_router
 from mwmbl.tinysearchengine import search
 from mwmbl.tinysearchengine.completer import Completer
@@ -43,6 +43,7 @@ def run():
     Process(target=historical.run).start()
     Process(target=retrieve.run).start()
     Process(target=preprocess.run, args=(args.index,)).start()
+    Process(target=update_pages.run, args=(args.index,)).start()
 
     completer = Completer()
 