Investigate duplication of URLs in batches

Daoud Clarke 2022-06-26 21:11:51 +01:00
parent eb571fc5fe
commit e27d749e18
10 changed files with 75 additions and 63 deletions

View file

@@ -7,15 +7,23 @@ import json
 from collections import defaultdict, Counter
 from urllib.parse import urlparse
 
-from mwmbl.indexer.paths import CRAWL_GLOB
+from mwmbl.crawler.batch import HashedBatch
+from mwmbl.indexer.paths import CRAWL_GLOB, MWMBL_DATA_DIR
+
+# TODO: remove this line - temporary override
+CRAWL_GLOB = str(MWMBL_DATA_DIR / "b2") + "/*/*/2022-06-23/*/*/*.json.gz"
+
 
 
 def get_urls():
     for path in glob.glob(CRAWL_GLOB):
         data = json.load(gzip.open(path))
-        user = data['user_id_hash']
-        for item in data['items']:
-            yield user, item['url']
+        batch = HashedBatch.parse_obj(data)
+        user = batch.user_id_hash
+        for item in batch.items:
+            if item.content is not None:
+                for url in item.content.links:
+                    yield user, url
 
 
 def analyse_urls(urls):
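Note that the rewritten get_urls() now yields each item's outgoing links rather than the crawled URL itself, so the same link can appear many times across batches. As a minimal sketch of how the yielded (user, url) pairs could be aggregated with the already-imported Counter to surface those duplicates (count_duplicate_urls is a hypothetical helper, not part of this commit, and the body of analyse_urls is not shown in this diff):

from collections import Counter


def count_duplicate_urls(url_pairs):
    # url_pairs is the (user, url) stream produced by get_urls()
    counts = Counter(url for _, url in url_pairs)
    # Keep only links that occur more than once across all batches
    duplicates = {url: count for url, count in counts.items() if count > 1}
    print(f"{len(duplicates)} duplicated URLs out of {len(counts)} distinct links")
    return duplicates


duplicates = count_duplicate_urls(get_urls())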

View file

@@ -0,0 +1,4 @@
+"""
+Analyse recent batches looking for duplicates.
+"""
+

View file

@@ -4,14 +4,14 @@ import json
 import os
 import re
 from datetime import datetime, timezone, timedelta
-from typing import Optional, Union
+from typing import Union
 from uuid import uuid4
 
 import boto3
 import requests
 from fastapi import HTTPException, APIRouter
-from pydantic import BaseModel
 
+from mwmbl.crawler.batch import Batch, NewBatchRequest, HashedBatch
 from mwmbl.crawler.urls import URLDatabase
 from mwmbl.database import Database
@@ -43,40 +43,6 @@ def upload(data: bytes, name: str):
     return result
 
 
-class ItemContent(BaseModel):
-    title: str
-    extract: str
-    links: list[str]
-
-
-class ItemError(BaseModel):
-    name: str
-    message: Optional[str]
-
-
-class Item(BaseModel):
-    url: str
-    status: Optional[int]
-    timestamp: int
-    content: Optional[ItemContent]
-    error: Optional[ItemError]
-
-
-class Batch(BaseModel):
-    user_id: str
-    items: list[Item]
-
-
-class NewBatchRequest(BaseModel):
-    user_id: str
-
-
-class HashedBatch(BaseModel):
-    user_id_hash: str
-    timestamp: int
-    items: list[Item]
-
-
 last_batch = None
@@ -232,7 +198,6 @@ def get_subfolders(prefix):
     items = client.list_objects(Bucket=BUCKET_NAME,
                                 Prefix=prefix,
                                 Delimiter='/')
-    print("Got items", items)
    item_keys = [item['Prefix'][len(prefix):].strip('/') for item in items['CommonPrefixes']]
     return item_keys

mwmbl/crawler/batch.py (new file, 37 additions)
View file

@@ -0,0 +1,37 @@
+from typing import Optional
+
+from pydantic import BaseModel
+
+
+class ItemContent(BaseModel):
+    title: str
+    extract: str
+    links: list[str]
+
+
+class ItemError(BaseModel):
+    name: str
+    message: Optional[str]
+
+
+class Item(BaseModel):
+    url: str
+    status: Optional[int]
+    timestamp: int
+    content: Optional[ItemContent]
+    error: Optional[ItemError]
+
+
+class Batch(BaseModel):
+    user_id: str
+    items: list[Item]
+
+
+class NewBatchRequest(BaseModel):
+    user_id: str
+
+
+class HashedBatch(BaseModel):
+    user_id_hash: str
+    timestamp: int
+    items: list[Item]
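These models are what batches get parsed into elsewhere in this commit via HashedBatch.parse_obj (in the analyse script and in retrieve.py). A minimal usage sketch, assuming a locally downloaded gzipped batch file; the path here is only a placeholder:

import gzip
import json

from mwmbl.crawler.batch import HashedBatch

# Placeholder path for illustration; real batches are gzipped JSON files
data = json.load(gzip.open("batch.json.gz"))

batch = HashedBatch.parse_obj(data)
print(batch.user_id_hash, batch.timestamp, len(batch.items))
for item in batch.items:
    if item.content is not None:
        print(item.url, len(item.content.links))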

View file

@@ -3,7 +3,8 @@ from datetime import date, datetime, timedelta
 import spacy
 
 from mwmbl.crawler.app import get_user_id_hashes_for_date, get_batches_for_date_and_user, get_batch_from_id, \
-    create_historical_batch, HashedBatch, get_batch_url
+    create_historical_batch, get_batch_url
+from mwmbl.crawler.batch import HashedBatch
 from mwmbl.database import Database
 from mwmbl.indexer.indexdb import BatchInfo, BatchStatus, IndexDatabase
 from mwmbl.indexer.index import tokenize_document
@@ -17,13 +18,13 @@ def run():
     for day in range(DAYS):
         date_str = str(date.today() - timedelta(days=day))
         users = get_user_id_hashes_for_date(date_str)
-        print("Users", users)
+        print(f"Got {len(users)} for day {date_str}")
         with Database() as db:
             index_db = IndexDatabase(db.connection)
             index_db.create_tables()
             for user in users:
                 batches = get_batches_for_date_and_user(date_str, user)
-                print("Batches", batches)
+                print("Historical batches for user", user, len(batches))
                 batch_urls = [get_batch_url(batch_id, date_str, user) for batch_id in batches["batch_ids"]]
                 infos = [BatchInfo(url, user, BatchStatus.REMOTE) for url in batch_urls]
                 index_db.record_batches(infos)

View file

@@ -12,7 +12,6 @@ from mwmbl.tinysearchengine.indexer import Document
 class BatchStatus(Enum):
     REMOTE = 0    # The batch only exists in long term storage
     LOCAL = 1     # We have a copy of the batch locally in Postgresql
-    INDEXED = 2   # The batch has been indexed and the local data has been deleted
 
 
 class DocumentStatus(Enum):
@@ -108,6 +107,7 @@ class IndexDatabase:
         data = [(document.url, document.title, document.extract, document.score, DocumentStatus.NEW.value)
                 for document in sorted_documents]
 
+        print("Queueing documents", len(data))
         with self.connection.cursor() as cursor:
             execute_values(cursor, sql, data)
@@ -126,7 +126,6 @@ class IndexDatabase:
         with self.connection.cursor() as cursor:
             cursor.execute(sql)
             results = cursor.fetchall()
-            print("Results", results)
             return [Document(title, url, extract, score) for url, title, extract, score in results]
 
     def queue_documents_for_page(self, urls_and_page_indexes: list[tuple[str, int]]):
@@ -134,7 +133,7 @@ class IndexDatabase:
             INSERT INTO document_pages (url, page) values %s
         """
 
-        print("Queuing", urls_and_page_indexes)
+        print(f"Queuing {len(urls_and_page_indexes)} urls and page indexes")
         with self.connection.cursor() as cursor:
             execute_values(cursor, sql, urls_and_page_indexes)

View file

@@ -27,7 +27,7 @@ def run_preprocessing(index_path):
         with Database() as db:
             index_db = IndexDatabase(db.connection)
             documents = index_db.get_documents_for_preprocessing()
-            print(f"Got {len(documents)} documents")
+            print(f"Got {len(documents)} documents for preprocessing")
             if len(documents) == 0:
                 sleep(10)
         with TinyIndex(Document, index_path, 'w') as indexer:

View file

@@ -9,7 +9,7 @@ from time import sleep
 import requests
 
-from mwmbl.crawler.app import HashedBatch
+from mwmbl.crawler.batch import HashedBatch
 from mwmbl.database import Database
 from mwmbl.indexer.indexdb import IndexDatabase, BatchStatus
 from mwmbl.retry import retry_requests
@@ -26,21 +26,19 @@ def retrieve_batches():
     with Database() as db:
         index_db = IndexDatabase(db.connection)
         batches = index_db.get_batches_by_status(BatchStatus.REMOTE)
-        print("Batches", batches)
+        print(f"Found {len(batches)} remote batches")
         urls = [batch.url for batch in batches]
         pool = ThreadPool(NUM_THREADS)
         results = pool.imap_unordered(retrieve_batch, urls)
         for result in results:
             print("Processed batch with items:", result)
-
-    with Database() as db:
-        index_db = IndexDatabase(db.connection)
-        index_db.update_batch_status(urls, BatchStatus.LOCAL)
+        index_db.update_batch_status(urls, BatchStatus.LOCAL)
 
 
 def retrieve_batch(url):
     data = json.loads(gzip.decompress(retry_requests.get(url).content))
     batch = HashedBatch.parse_obj(data)
+    print(f"Retrieved batch with {len(batch.items)} items")
     queue_batch(batch)
     return len(batch.items)

View file

@@ -18,21 +18,20 @@ def run_update(index_path):
         for i in range(indexer.num_pages):
             with Database() as db:
                 index_db = IndexDatabase(db.connection)
-                pages = index_db.get_queued_documents_for_page(i)
-                if len(pages) > 0:
-                    print("Pages", len(pages))
-                else:
+                documents = index_db.get_queued_documents_for_page(i)
+                print(f"Documents queued for page {i}: {len(documents)}")
+                if len(documents) == 0:
                     continue
 
                 for j in range(3):
                     try:
-                        indexer.add_to_page(i, pages)
+                        indexer.add_to_page(i, documents)
                         break
                     except ValueError:
-                        pages = pages[:len(pages)//2]
-                        if len(pages) == 0:
+                        documents = documents[:len(documents)//2]
+                        if len(documents) == 0:
                             break
-                        print(f"Not enough space, adding {len(pages)}")
+                        print(f"Not enough space, adding {len(documents)}")
                 index_db.clear_queued_documents_for_page(i)

View file

@@ -6,7 +6,7 @@ from multiprocessing import Process
 import uvicorn
 from fastapi import FastAPI
 
-from mwmbl.indexer import historical, retrieve, preprocess
+from mwmbl.indexer import historical, retrieve, preprocess, update_pages
 from mwmbl.crawler.app import router as crawler_router
 from mwmbl.tinysearchengine import search
 from mwmbl.tinysearchengine.completer import Completer
@@ -43,6 +43,7 @@ def run():
     Process(target=historical.run).start()
     Process(target=retrieve.run).start()
     Process(target=preprocess.run, args=(args.index,)).start()
+    Process(target=update_pages.run, args=(args.index,)).start()
 
     completer = Completer()