commit 213bdaa365
35 changed files with 346 additions and 167 deletions
@@ -46,5 +46,8 @@ VOLUME ["/data"]
 
 EXPOSE 5000
 
-# Using the mwmbl-tinysearchengine binary/entrypoint which comes packaged with mwmbl
-CMD ["/venv/bin/mwmbl-tinysearchengine", "--num-pages", "10240000", "--background", "--data", "/app/storage"]
+ENV DJANGO_SETTINGS_MODULE=mwmbl.settings_dev
+# WORKDIR "/venv/lib/python3.10/site-packages/mwmbl/"
+# CMD ["/venv/bin/python", "-m", "uvicorn", "app.asgi:application"]
+CMD ["/venv/bin/mwmbl-tinysearchengine"]
+
@@ -7,8 +7,8 @@ import json
 from collections import defaultdict, Counter
 from urllib.parse import urlparse
 
-from mwmbl.crawler.batch import HashedBatch
-from mwmbl.indexer.paths import CRAWL_GLOB, MWMBL_DATA_DIR
+from mwmbl.crawler import HashedBatch
+from mwmbl.indexer import CRAWL_GLOB, MWMBL_DATA_DIR
 
 
 # TODO: remove this line - temporary override
@@ -1,6 +1,6 @@
 import json
 
-from mwmbl.indexer.paths import TOP_DOMAINS_JSON_PATH
+from mwmbl.indexer import TOP_DOMAINS_JSON_PATH
 from mwmbl.hn_top_domains_filtered import DOMAINS
 
 
@@ -3,7 +3,7 @@ Export the list of unique URLs to a SQLite file for analysis/evaluation.
 """
 import sqlite3
 
-from mwmbl.indexer.paths import URLS_PATH
+from mwmbl.indexer import URLS_PATH
 from mwmbl.app import get_config_and_index
 
 
@@ -7,16 +7,15 @@ import json
 import logging
 import os
 import sys
-from pathlib import Path
 from datetime import datetime
 
 import spacy
 
-from mwmbl.crawler.batch import HashedBatch
+from mwmbl.crawler import HashedBatch
 from mwmbl.crawler.urls import URLDatabase
 from mwmbl.database import Database
-from mwmbl.indexer.index_batches import index_batches
-from mwmbl.tinysearchengine.indexer import TinyIndex, Document
+from mwmbl.indexer import index_batches
+from mwmbl.tinysearchengine import TinyIndex, Document
 
 LOCAL_BATCHES_PATH = f'{os.environ["HOME"]}/data/mwmbl/file/**/*.json.gz'
 NUM_BATCHES = 10000
@@ -1,7 +1,7 @@
 """
 Count unique URLs in the index.
 """
-from mwmbl.tinysearchengine.indexer import TinyIndex, Document
+from mwmbl.tinysearchengine import TinyIndex, Document
 
 
 def run():
@@ -5,9 +5,9 @@ import numpy as np
 import spacy
 
 from analyse.index_local import EVALUATE_INDEX_PATH
-from mwmbl.indexer.index import tokenize_document
-from mwmbl.indexer.paths import INDEX_PATH
-from mwmbl.tinysearchengine.indexer import TinyIndex, Document
+from mwmbl.indexer import tokenize_document
+from mwmbl.indexer import INDEX_PATH
+from mwmbl.tinysearchengine import TinyIndex, Document
 
 
 logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
@@ -4,12 +4,10 @@ See how many unique URLs and root domains we have crawled.
 import glob
 import gzip
 import json
-from collections import defaultdict, Counter
-from urllib.parse import urlparse
 
 import requests
 
-from mwmbl.indexer.paths import CRAWL_GLOB
+from mwmbl.indexer import CRAWL_GLOB
 
 
 API_ENDPOINT = "http://95.216.215.29/batches/historical"
@@ -2,9 +2,9 @@ import logging
 import sys
 from itertools import islice
 
-from mwmbl.indexer.paths import INDEX_PATH
+from mwmbl.indexer import INDEX_PATH
 from mwmbl.tinysearchengine.completer import Completer
-from mwmbl.tinysearchengine.indexer import TinyIndex, Document
+from mwmbl.tinysearchengine import TinyIndex, Document
 from mwmbl.tinysearchengine.rank import HeuristicRanker
 
 logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
@@ -3,7 +3,7 @@ Send a batch to a running instance.
 """
 import requests
 
-from mwmbl.crawler.batch import Batch, Item, ItemContent
+from mwmbl.crawler import Batch, Item, ItemContent
 
 
 URL = 'http://localhost:5000/crawler/batches/'
@@ -4,7 +4,7 @@ from datetime import datetime
 from pathlib import Path
 from queue import Queue
 
-from mwmbl.indexer.update_urls import record_urls_in_database
+from mwmbl.indexer import record_urls_in_database
 
 
 def run_update_urls_on_fixed_batches():
manage.py (new executable file, 22 lines)
@@ -0,0 +1,22 @@
+#!/usr/bin/env python
+"""Django's command-line utility for administrative tasks."""
+import os
+import sys
+
+
+def main():
+    """Run administrative tasks."""
+    os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'app.settings')
+    try:
+        from django.core.management import execute_from_command_line
+    except ImportError as exc:
+        raise ImportError(
+            "Couldn't import Django. Are you sure it's installed and "
+            "available on your PYTHONPATH environment variable? Did you "
+            "forget to activate a virtual environment?"
+        ) from exc
+    execute_from_command_line(sys.argv)
+
+
+if __name__ == '__main__':
+    main()
mwmbl/api.py (new file, 31 lines)
@@ -0,0 +1,31 @@
+from multiprocessing import Queue
+from pathlib import Path
+
+from django.conf import settings
+from ninja import NinjaAPI
+
+import mwmbl.crawler.app as crawler
+from mwmbl.indexer.batch_cache import BatchCache
+from mwmbl.indexer.paths import INDEX_NAME, BATCH_DIR_NAME
+from mwmbl.tinysearchengine import search
+from mwmbl.tinysearchengine.completer import Completer
+from mwmbl.tinysearchengine.indexer import TinyIndex, Document
+from mwmbl.tinysearchengine.rank import HeuristicRanker
+
+api = NinjaAPI(version="1.0.0")
+
+index_path = Path(settings.DATA_PATH) / INDEX_NAME
+tiny_index = TinyIndex(item_factory=Document, index_path=index_path)
+tiny_index.__enter__()
+
+completer = Completer()
+ranker = HeuristicRanker(tiny_index, completer)
+
+search_router = search.create_router(ranker)
+api.add_router("/search/", search_router)
+
+batch_cache = BatchCache(Path(settings.DATA_PATH) / BATCH_DIR_NAME)
+
+queued_batches = Queue()
+crawler_router = crawler.create_router(batch_cache=batch_cache, queued_batches=queued_batches)
+api.add_router("/crawler/", crawler_router)
mwmbl/apps.py (new file, 35 lines)
@@ -0,0 +1,35 @@
+from multiprocessing import Process, Queue
+from pathlib import Path
+
+from django.apps import AppConfig
+from django.conf import settings
+
+from mwmbl.api import queued_batches
+from mwmbl import background
+from mwmbl.indexer.paths import INDEX_NAME
+from mwmbl.indexer.update_urls import update_urls_continuously
+from mwmbl.tinysearchengine.indexer import TinyIndex, Document, PAGE_SIZE
+from mwmbl.url_queue import update_queue_continuously
+
+
+class MwmblConfig(AppConfig):
+    name = "mwmbl"
+    verbose_name = "Mwmbl Application"
+
+    def ready(self):
+        index_path = Path(settings.DATA_PATH) / INDEX_NAME
+        try:
+            existing_index = TinyIndex(item_factory=Document, index_path=index_path)
+            if existing_index.page_size != PAGE_SIZE or existing_index.num_pages != settings.NUM_PAGES:
+                raise ValueError(f"Existing index page sizes ({existing_index.page_size}) or number of pages "
+                                 f"({existing_index.num_pages}) do not match")
+        except FileNotFoundError:
+            print("Creating a new index")
+            TinyIndex.create(item_factory=Document, index_path=index_path, num_pages=settings.NUM_PAGES,
+                             page_size=PAGE_SIZE)
+
+        if settings.RUN_BACKGROUND_PROCESSES:
+            new_item_queue = Queue()
+            Process(target=background.run, args=(settings.DATA_PATH,)).start()
+            Process(target=update_queue_continuously, args=(new_item_queue, queued_batches,)).start()
+            Process(target=update_urls_continuously, args=(settings.DATA_PATH, new_item_queue)).start()
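Note: Django picks up MwmblConfig automatically because mwmbl/apps.py defines a single AppConfig subclass and 'mwmbl' is listed in INSTALLED_APPS; ready() runs once the app registry is fully loaded, which is why the index setup and background processes moved here from main.py. A minimal sketch of the hook (hypothetical app name, not part of this commit):

    from django.apps import AppConfig

    class ExampleConfig(AppConfig):
        name = "example"

        def ready(self):
            # Runs once per process after all apps are loaded: a natural
            # place for one-off startup work. Beware that the development
            # server's autoreloader can import apps in more than one
            # process, so guard anything that must not start twice.
            pass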
mwmbl/asgi.py (new file, 16 lines)
@@ -0,0 +1,16 @@
+"""
+ASGI config for app project.
+
+It exposes the ASGI callable as a module-level variable named ``application``.
+
+For more information on this file, see
+https://docs.djangoproject.com/en/4.2/howto/deployment/asgi/
+"""
+
+import os
+
+from django.core.asgi import get_asgi_application
+
+os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'mwmbl.settings_dev')
+
+application = get_asgi_application()
@@ -10,10 +10,11 @@ from uuid import uuid4
 import boto3
 import justext
 import requests
-from fastapi import HTTPException, APIRouter
+from fastapi import HTTPException
 from justext.core import html_to_dom, ParagraphMaker, classify_paragraphs, revise_paragraph_classification, \
     LENGTH_LOW_DEFAULT, STOPWORDS_LOW_DEFAULT, MAX_LINK_DENSITY_DEFAULT, NO_HEADINGS_DEFAULT, LENGTH_HIGH_DEFAULT, \
     STOPWORDS_HIGH_DEFAULT, MAX_HEADING_DISTANCE_DEFAULT, DEFAULT_ENCODING, DEFAULT_ENC_ERRORS, preprocessor
+from ninja import Router
 from redis import Redis
 
 from mwmbl.crawler.batch import Batch, NewBatchRequest, HashedBatch
@@ -82,17 +83,15 @@ def justext_with_dom(html_text, stoplist, length_low=LENGTH_LOW_DEFAULT,
     return paragraphs, title
 
 
-def get_router(batch_cache: BatchCache, queued_batches: Queue):
-    router = APIRouter(prefix="/crawler", tags=["crawler"])
+def create_router(batch_cache: BatchCache, queued_batches: Queue) -> Router:
+    router = Router(tags=["crawler"])
 
-    @router.on_event("startup")
-    async def on_startup():
-        with Database() as db:
-            url_db = URLDatabase(db.connection)
-            return url_db.create_tables()
+    # TODO: # ensure tables are created before crawler code is used:
+    # #
+    # # url_db.create_tables()
 
     @router.get('/fetch')
-    def fetch_url(url: str, query: str):
+    def fetch_url(request, url: str, query: str):
         response = requests.get(url)
         paragraphs, title = justext_with_dom(response.content, justext.get_stoplist("English"))
         good_paragraphs = [p for p in paragraphs if p.class_type == 'good']
@@ -105,7 +104,7 @@ def get_router(batch_cache: BatchCache, queued_batches: Queue):
         return format_result(result, query)
 
     @router.post('/batches/')
-    def post_batch(batch: Batch):
+    def post_batch(request, batch: Batch):
         if len(batch.items) > MAX_BATCH_SIZE:
             raise HTTPException(400, f"Batch size too large (maximum {MAX_BATCH_SIZE}), got {len(batch.items)}")
 
@@ -159,7 +158,7 @@ def get_router(batch_cache: BatchCache, queued_batches: Queue):
         }
 
     @router.post('/batches/new')
-    def request_new_batch(batch_request: NewBatchRequest) -> list[str]:
+    def request_new_batch(request, batch_request: NewBatchRequest) -> list[str]:
         user_id_hash = _get_user_id_hash(batch_request)
         try:
             urls = queued_batches.get(block=False)
@@ -174,14 +173,14 @@ def get_router(batch_cache: BatchCache, queued_batches: Queue):
         return urls
 
     @router.get('/batches/{date_str}/users/{public_user_id}')
-    def get_batches_for_date_and_user(date_str, public_user_id):
+    def get_batches_for_date_and_user(request, date_str, public_user_id):
         check_date_str(date_str)
         check_public_user_id(public_user_id)
         prefix = f'1/{VERSION}/{date_str}/1/{public_user_id}/'
         return get_batch_ids_for_prefix(prefix)
 
     @router.get('/batches/{date_str}/users/{public_user_id}/batch/{batch_id}')
-    def get_batch_from_id(date_str, public_user_id, batch_id):
+    def get_batch_from_id(request, date_str, public_user_id, batch_id):
         url = get_batch_url(batch_id, date_str, public_user_id)
         data = json.loads(gzip.decompress(requests.get(url).content))
         return {
@@ -189,22 +188,22 @@ def get_router(batch_cache: BatchCache, queued_batches: Queue):
             'batch': data,
         }
 
-    @router.get('/latest-batch', response_model=list[HashedBatch])
-    def get_latest_batch():
+    @router.get('/latest-batch')
+    def get_latest_batch(request) -> list[HashedBatch]:
         return [] if last_batch is None else [last_batch]
 
     @router.get('/batches/{date_str}/users')
-    def get_user_id_hashes_for_date(date_str: str):
+    def get_user_id_hashes_for_date(request, date_str: str):
         check_date_str(date_str)
         prefix = f'1/{VERSION}/{date_str}/1/'
         return get_subfolders(prefix)
 
     @router.get('/stats')
-    def get_stats() -> MwmblStats:
+    def get_stats(request) -> MwmblStats:
         return stats_manager.get_stats()
 
     @router.get('/')
-    def status():
+    def status(request):
         return {
             'status': 'ok'
         }
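Note: every handler above gained a leading `request` argument because django-ninja, unlike FastAPI, passes the Django HttpRequest explicitly as the first parameter of each operation. A minimal sketch of the convention (hypothetical router and path, not from this commit):

    from ninja import Router

    router = Router(tags=["example"])

    @router.get("/ping")
    def ping(request):
        # django-ninja injects django.http.HttpRequest here; the remaining
        # parameters are parsed from the path, query string or body.
        return {"status": "ok"}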
@@ -1,21 +1,21 @@
 from typing import Optional
 
-from pydantic import BaseModel
+from ninja import Schema
 
 
-class ItemContent(BaseModel):
+class ItemContent(Schema):
     title: str
     extract: str
     links: list[str]
     extra_links: Optional[list[str]]
 
 
-class ItemError(BaseModel):
+class ItemError(Schema):
     name: str
     message: Optional[str]
 
 
-class Item(BaseModel):
+class Item(Schema):
     url: str
     status: Optional[int]
     timestamp: int
@@ -23,16 +23,16 @@ class Item(BaseModel):
     error: Optional[ItemError]
 
 
-class Batch(BaseModel):
+class Batch(Schema):
     user_id: str
     items: list[Item]
 
 
-class NewBatchRequest(BaseModel):
+class NewBatchRequest(Schema):
     user_id: str
 
 
-class HashedBatch(BaseModel):
+class HashedBatch(Schema):
     user_id_hash: str
     timestamp: int
     items: list[Item]
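Note: swapping pydantic's BaseModel for ninja's Schema is close to a drop-in change: ninja.Schema is itself built on a pydantic model, so the annotated-field declarations above keep working, and the schemas now plug into django-ninja's request parsing and OpenAPI docs. A sketch of the pattern (hypothetical schema, not from this commit):

    from typing import Optional

    from ninja import Schema


    class ExampleItem(Schema):
        # Fields are declared exactly as with pydantic.
        url: str
        status: Optional[int] = None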
@@ -1,16 +1,13 @@
 """
 Database storing info on URLs
 """
-import random
 from dataclasses import dataclass
-from datetime import datetime, timedelta
+from datetime import datetime
 from enum import Enum
 from logging import getLogger
 
 from psycopg2.extras import execute_values
 
-from mwmbl.hn_top_domains_filtered import DOMAINS
-from mwmbl.settings import CORE_DOMAINS
 # Client has one hour to crawl a URL that has been assigned to them, or it will be reassigned
 from mwmbl.utils import batch
 
@@ -9,7 +9,6 @@ import os
 from logging import getLogger
 from multiprocessing.pool import ThreadPool
 from pathlib import Path
-from tempfile import NamedTemporaryFile
 from urllib.parse import urlparse
 
 from pydantic import ValidationError
@@ -1,13 +1,10 @@
 """
 Create a search index
 """
-from collections import Counter
 from typing import Iterable
 from urllib.parse import unquote
 
-import pandas as pd
-
-from mwmbl.tinysearchengine.indexer import Document, TokenizedDocument, TinyIndex
+from mwmbl.tinysearchengine.indexer import TokenizedDocument
 from mwmbl.tokenizer import tokenize, get_bigrams
 
 DEFAULT_SCORE = 0
@@ -1,13 +1,10 @@
-import os
-import pickle
-import re
 from collections import defaultdict
 from datetime import datetime, timezone, timedelta
 from logging import getLogger
 from multiprocessing import Queue
 from pathlib import Path
 from time import sleep
-from typing import Iterable, Collection
+from typing import Collection
 from urllib.parse import urlparse
 
 from requests_cache import CachedSession
@@ -1,96 +1,8 @@
-import argparse
-import logging
-import sys
-from multiprocessing import Process, Queue
-from pathlib import Path
-
 import uvicorn
-from fastapi import FastAPI
-from starlette.middleware.cors import CORSMiddleware
-
-from mwmbl import background
-from mwmbl.crawler import app as crawler
-from mwmbl.indexer.batch_cache import BatchCache
-from mwmbl.indexer.paths import INDEX_NAME, BATCH_DIR_NAME
-from mwmbl.platform import user
-from mwmbl.indexer.update_urls import update_urls_continuously
-from mwmbl.tinysearchengine import search
-from mwmbl.tinysearchengine.completer import Completer
-from mwmbl.tinysearchengine.indexer import TinyIndex, Document, PAGE_SIZE
-from mwmbl.tinysearchengine.rank import HeuristicRanker
-from mwmbl.url_queue import update_queue_continuously
-
-FORMAT = '%(levelname)s %(name)s %(asctime)s %(message)s'
-logging.basicConfig(stream=sys.stdout, level=logging.INFO, format=FORMAT)
-
-
-MODEL_PATH = Path(__file__).parent / 'resources' / 'model.pickle'
-
-
-def setup_args():
-    parser = argparse.ArgumentParser(description="Mwmbl API server and background task processor")
-    parser.add_argument("--num-pages", type=int, help="Number of pages of memory (4096 bytes) to use for the index", default=2560)
-    parser.add_argument("--data", help="Path to the data folder for storing index and cached batches", default="./devdata")
-    parser.add_argument("--port", type=int, help="Port for the server to listen at", default=5000)
-    parser.add_argument("--background", help="Enable running the background tasks to process batches",
-                        action='store_true')
-    args = parser.parse_args()
-    return args
 
 
 def run():
-    args = setup_args()
-
-    index_path = Path(args.data) / INDEX_NAME
-    try:
-        existing_index = TinyIndex(item_factory=Document, index_path=index_path)
-        if existing_index.page_size != PAGE_SIZE or existing_index.num_pages != args.num_pages:
-            raise ValueError(f"Existing index page sizes ({existing_index.page_size}) or number of pages "
-                             f"({existing_index.num_pages}) do not match")
-    except FileNotFoundError:
-        print("Creating a new index")
-        TinyIndex.create(item_factory=Document, index_path=index_path, num_pages=args.num_pages, page_size=PAGE_SIZE)
-
-    new_item_queue = Queue()
-    queued_batches = Queue()
-    # curation_queue = Queue()
-
-    if args.background:
-        Process(target=background.run, args=(args.data,)).start()
-        Process(target=update_queue_continuously, args=(new_item_queue, queued_batches,)).start()
-        Process(target=update_urls_continuously, args=(args.data, new_item_queue)).start()
-
-    completer = Completer()
-
-    with TinyIndex(item_factory=Document, index_path=index_path) as tiny_index:
-        ranker = HeuristicRanker(tiny_index, completer)
-        # model = pickle.load(open(MODEL_PATH, 'rb'))
-        # ranker = LTRRanker(model, tiny_index, completer)
-
-        # Initialize FastApi instance
-        app = FastAPI()
-
-        # Try disabling since this is handled by nginx
-        # app.add_middleware(
-        #     CORSMiddleware,
-        #     allow_origins=["*"],
-        #     allow_credentials=True,
-        #     allow_methods=["*"],
-        #     allow_headers=["*"],
-        # )
-
-        search_router = search.create_router(ranker)
-        app.include_router(search_router)
-
-        batch_cache = BatchCache(Path(args.data) / BATCH_DIR_NAME)
-        crawler_router = crawler.get_router(batch_cache, queued_batches)
-        app.include_router(crawler_router)
-
-        user_router = user.create_router(index_path)
-        app.include_router(user_router)
-
-        # Initialize uvicorn server using global app instance and server config params
-        uvicorn.run(app, host="0.0.0.0", port=args.port)
+    uvicorn.run("mwmbl.asgi:application", host="0.0.0.0", port=8000)
 
 
 if __name__ == "__main__":
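Note: the entry point is now a thin wrapper. run() just starts uvicorn against the Django ASGI application, while the work that used to happen here (index creation, background processes, router wiring) moved into MwmblConfig.ready() in mwmbl/apps.py and into mwmbl/api.py. A minimal sketch of what it does, assuming DJANGO_SETTINGS_MODULE is set (mwmbl/asgi.py defaults it to mwmbl.settings_dev):

    import uvicorn

    # Passing the application as an import string lets uvicorn import it in
    # the server process, after Django settings have been configured.
    uvicorn.run("mwmbl.asgi:application", host="0.0.0.0", port=8000)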
@@ -7,7 +7,7 @@ import requests
 from fastapi import APIRouter, Response
 from pydantic import BaseModel
 
-from mwmbl.tinysearchengine.indexer import TinyIndex, Document, DocumentState
+from mwmbl.tinysearchengine.indexer import TinyIndex, Document
 from mwmbl.tokenizer import tokenize
 
 
mwmbl/settings_common.py (new file, 125 lines)
@@ -0,0 +1,125 @@
+"""
+Django settings for mwmbl project.
+
+Generated by 'django-admin startproject' using Django 4.2.4.
+
+For more information on this file, see
+https://docs.djangoproject.com/en/4.2/topics/settings/
+
+For the full list of settings and their values, see
+https://docs.djangoproject.com/en/4.2/ref/settings/
+"""
+
+from pathlib import Path
+
+# Build paths inside the project like this: BASE_DIR / 'subdir'.
+BASE_DIR = Path(__file__).resolve().parent.parent
+
+
+# Quick-start development settings - unsuitable for production
+# See https://docs.djangoproject.com/en/4.2/howto/deployment/checklist/
+
+# SECURITY WARNING: keep the secret key used in production secret!
+SECRET_KEY = 'django-insecure-qqr#f(i3uf%m8%8u35vn=ov-uk(*8!a&1t-hxa%ev2^t1%j&sm'
+
+# SECURITY WARNING: don't run with debug turned on in production!
+DEBUG = True
+
+ALLOWED_HOSTS = []
+
+
+# Application definition
+
+INSTALLED_APPS = [
+    'django.contrib.admin',
+    'django.contrib.auth',
+    'django.contrib.contenttypes',
+    'django.contrib.sessions',
+    'django.contrib.messages',
+    'django.contrib.staticfiles',
+    'mwmbl',
+]
+
+MIDDLEWARE = [
+    'django.middleware.security.SecurityMiddleware',
+    'django.contrib.sessions.middleware.SessionMiddleware',
+    'django.middleware.common.CommonMiddleware',
+    'django.middleware.csrf.CsrfViewMiddleware',
+    'django.contrib.auth.middleware.AuthenticationMiddleware',
+    'django.contrib.messages.middleware.MessageMiddleware',
+    'django.middleware.clickjacking.XFrameOptionsMiddleware',
+]
+
+ROOT_URLCONF = 'mwmbl.urls'
+
+TEMPLATES = [
+    {
+        'BACKEND': 'django.template.backends.django.DjangoTemplates',
+        'DIRS': [],
+        'APP_DIRS': True,
+        'OPTIONS': {
+            'context_processors': [
+                'django.template.context_processors.debug',
+                'django.template.context_processors.request',
+                'django.contrib.auth.context_processors.auth',
+                'django.contrib.messages.context_processors.messages',
+            ],
+        },
+    },
+]
+
+WSGI_APPLICATION = 'mwmbl.wsgi.application'
+
+
+# Database
+# https://docs.djangoproject.com/en/4.2/ref/settings/#databases
+
+DATABASES = {
+    'default': {
+        'ENGINE': 'django.db.backends.sqlite3',
+        'NAME': BASE_DIR / 'db.sqlite3',
+    }
+}
+
+
+# Password validation
+# https://docs.djangoproject.com/en/4.2/ref/settings/#auth-password-validators
+
+AUTH_PASSWORD_VALIDATORS = [
+    {
+        'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
+    },
+    {
+        'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
+    },
+    {
+        'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
+    },
+    {
+        'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
+    },
+]
+
+
+# Internationalization
+# https://docs.djangoproject.com/en/4.2/topics/i18n/
+
+LANGUAGE_CODE = 'en-us'
+
+TIME_ZONE = 'UTC'
+
+USE_I18N = True
+
+USE_TZ = True
+
+
+# Static files (CSS, JavaScript, Images)
+# https://docs.djangoproject.com/en/4.2/howto/static-files/
+
+STATIC_URL = 'static/'
+
+# Default primary key field type
+# https://docs.djangoproject.com/en/4.2/ref/settings/#default-auto-field
+
+DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'
+
mwmbl/settings_dev.py (new file, 5 lines)
@@ -0,0 +1,5 @@
+from mwmbl.settings_common import *
+
+DATA_PATH = "./devdata"
+RUN_BACKGROUND_PROCESSES = False
+NUM_PAGES = 2560
mwmbl/settings_prod.py (new file, 5 lines)
@@ -0,0 +1,5 @@
+from mwmbl.settings_common import *
+
+DATA_PATH = "/app/storage"
+RUN_BACKGROUND_PROCESSES = True
+NUM_PAGES = 10240000
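Note: the flags the Dockerfile used to pass on the command line now live in the settings modules: --data /app/storage becomes DATA_PATH, --background becomes RUN_BACKGROUND_PROCESSES, and --num-pages 10240000 becomes NUM_PAGES, with DJANGO_SETTINGS_MODULE selecting mwmbl.settings_dev or mwmbl.settings_prod. A sketch of how code reads them (standard Django settings access; the values mirror settings_prod above):

    # Assumes DJANGO_SETTINGS_MODULE=mwmbl.settings_prod in the environment.
    from django.conf import settings

    assert settings.DATA_PATH == "/app/storage"
    assert settings.RUN_BACKGROUND_PROCESSES is True
    assert settings.NUM_PAGES == 10240000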
@@ -6,7 +6,6 @@ from operator import itemgetter
 from urllib.parse import urlparse
 
 from mwmbl.format import format_result_with_pattern, get_query_regex
-from mwmbl.platform.user import MAX_CURATED_SCORE
 from mwmbl.tokenizer import tokenize, get_bigrams
 from mwmbl.tinysearchengine.completer import Completer
 from mwmbl.hn_top_domains_filtered import DOMAINS
@@ -1,6 +1,6 @@
 from logging import getLogger
 
-from fastapi import APIRouter
+from ninja import Router
 
 from mwmbl.tinysearchengine.rank import HeuristicRanker
 
@@ -10,15 +10,15 @@ logger = getLogger(__name__)
 SCORE_THRESHOLD = 0.25
 
 
-def create_router(ranker: HeuristicRanker) -> APIRouter:
-    router = APIRouter(prefix="/search", tags=["search"])
+def create_router(ranker: HeuristicRanker) -> Router:
+    router = Router(tags=["search"])
 
     @router.get("")
-    def search(s: str):
+    def search(request, s: str):
         return ranker.search(s)
 
     @router.get("/complete")
-    def complete(q: str):
+    def complete(request, q: str):
         return ranker.complete(q)
 
     return router
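Note: with django-ninja the mount point comes from the call to api.add_router rather than from the router itself, so APIRouter(prefix="/search") becomes a plain Router that mwmbl/api.py mounts at "/search/". A sketch of the pattern (hypothetical endpoint, not from this commit):

    from ninja import NinjaAPI, Router

    api = NinjaAPI()
    router = Router()

    @router.get("")
    def root(request):
        return {"ok": True}

    api.add_router("/search/", router)  # the router now serves GET /search/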
@@ -1,6 +1,5 @@
 import time
 from collections import defaultdict
-from dataclasses import dataclass
 from datetime import datetime, timedelta
 from logging import getLogger
 from multiprocessing import Queue
mwmbl/urls.py (new file, 25 lines)
@@ -0,0 +1,25 @@
+"""
+URL configuration for app project.
+
+The `urlpatterns` list routes URLs to views. For more information please see:
+    https://docs.djangoproject.com/en/4.2/topics/http/urls/
+Examples:
+Function views
+    1. Add an import:  from my_app import views
+    2. Add a URL to urlpatterns:  path('', views.home, name='home')
+Class-based views
+    1. Add an import:  from other_app.views import Home
+    2. Add a URL to urlpatterns:  path('', Home.as_view(), name='home')
+Including another URLconf
+    1. Import the include() function: from django.urls import include, path
+    2. Add a URL to urlpatterns:  path('blog/', include('blog.urls'))
+"""
+from django.contrib import admin
+from django.urls import path
+
+from mwmbl.api import api
+
+urlpatterns = [
+    path('admin/', admin.site.urls),
+    path('', api.urls)
+]
mwmbl/wsgi.py (new file, 16 lines)
@@ -0,0 +1,16 @@
+"""
+WSGI config for app project.
+
+It exposes the WSGI callable as a module-level variable named ``application``.
+
+For more information on this file, see
+https://docs.djangoproject.com/en/4.2/howto/deployment/wsgi/
+"""
+
+import os
+
+from django.core.wsgi import get_wsgi_application
+
+os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'mwmbl.settings_dev')
+
+application = get_wsgi_application()
@@ -33,6 +33,8 @@ langdetect = {version= "==1.0.9", optional = true}
 pyarrow = {version= "==6.0.0", optional = true}
 pyspark = {version= "==3.2.0", optional = true}
 Levenshtein = {version= "==0.16.0", optional = true}
+django = "^4.2.4"
+django-ninja = "^0.22.2"
 requests-cache = "^1.1.0"
 redis = {extras = ["hiredis"], version = "^5.0.1"}
 
@@ -1,5 +1,3 @@
-import mwmbl.tinysearchengine.completer
-import pytest
 import pandas as pd
 
 def mockCompleterData(mocker, data):
@@ -16,7 +14,7 @@ def test_correctCompletions(mocker):
                 [3, 'buildings', 1]]
     mockCompleterData(mocker, testdata)
 
-    completer = mwmbl.tinysearchengine.completer.Completer()
+    completer = app.tinysearchengine.completer.Completer()
     completion = completer.complete('build')
     assert ['build', 'builder', 'buildings'] == completion
 
@@ -29,7 +27,7 @@ def test_correctSortOrder(mocker):
                 [3, 'buildings', 3]]
     mockCompleterData(mocker, testdata)
 
-    completer = mwmbl.tinysearchengine.completer.Completer()
+    completer = app.tinysearchengine.completer.Completer()
     completion = completer.complete('build')
     assert ['build', 'buildings', 'builder'] == completion
 
@@ -42,7 +40,7 @@ def test_noCompletions(mocker):
                 [3, 'buildings', 1]]
     mockCompleterData(mocker, testdata)
 
-    completer = mwmbl.tinysearchengine.completer.Completer()
+    completer = app.tinysearchengine.completer.Completer()
     completion = completer.complete('test')
     assert [] == completion
 
@@ -55,7 +53,7 @@ def test_singleCompletions(mocker):
                 [3, 'buildings', 1]]
     mockCompleterData(mocker, testdata)
 
-    completer = mwmbl.tinysearchengine.completer.Completer()
+    completer = app.tinysearchengine.completer.Completer()
     completion = completer.complete('announce')
     assert ['announce'] == completion
 
@@ -68,7 +66,7 @@ def test_idempotencyWithSameScoreCompletions(mocker):
                 [3, 'buildings', 1]]
     mockCompleterData(mocker, testdata)
 
-    completer = mwmbl.tinysearchengine.completer.Completer()
+    completer = app.tinysearchengine.completer.Completer()
     for i in range(3):
         print(f"iteration: {i}")
         completion = completer.complete('build')
@@ -1,9 +1,9 @@
 from pathlib import Path
 from tempfile import TemporaryDirectory
 
-from mwmbl.tinysearchengine.indexer import Document, TinyIndex, _binary_search_fitting_size, astuple, _trim_items_to_page, _get_page_data, _pad_to_page_size
-from zstandard import ZstdDecompressor, ZstdCompressor, ZstdError
-import json
+from mwmbl.tinysearchengine import Document, TinyIndex, _binary_search_fitting_size, astuple, _trim_items_to_page, _get_page_data, _pad_to_page_size
+from zstandard import ZstdCompressor
 
 
 def test_create_index():
     num_pages = 10
@@ -1,4 +1,4 @@
-from mwmbl.indexer.update_urls import process_link
+from mwmbl.indexer import process_link
 
 
 def test_process_link_normal():