From 4aefc48716b1b9079625828153b6b9a77b05a957 Mon Sep 17 00:00:00 2001
From: Daoud Clarke
Date: Sun, 27 Aug 2023 07:37:15 +0100
Subject: [PATCH 1/8] Add django

---
 app/__init__.py |   0
 app/asgi.py     |  16 +++++++
 app/settings.py | 123 ++++++++++++++++++++++++++++++++++++++++++++++++
 app/urls.py     |  22 +++++++++
 app/wsgi.py     |  16 +++++++
 manage.py       |  22 +++++++++
 poetry.lock     |  54 ++++++++++++++++++++-
 pyproject.toml  |   1 +
 8 files changed, 252 insertions(+), 2 deletions(-)
 create mode 100644 app/__init__.py
 create mode 100644 app/asgi.py
 create mode 100644 app/settings.py
 create mode 100644 app/urls.py
 create mode 100644 app/wsgi.py
 create mode 100755 manage.py

diff --git a/app/__init__.py b/app/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/app/asgi.py b/app/asgi.py
new file mode 100644
index 0000000..c8d5aaa
--- /dev/null
+++ b/app/asgi.py
@@ -0,0 +1,16 @@
+"""
+ASGI config for app project.
+
+It exposes the ASGI callable as a module-level variable named ``application``.
+
+For more information on this file, see
+https://docs.djangoproject.com/en/4.2/howto/deployment/asgi/
+"""
+
+import os
+
+from django.core.asgi import get_asgi_application
+
+os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'app.settings')
+
+application = get_asgi_application()
diff --git a/app/settings.py b/app/settings.py
new file mode 100644
index 0000000..a067541
--- /dev/null
+++ b/app/settings.py
@@ -0,0 +1,123 @@
+"""
+Django settings for app project.
+
+Generated by 'django-admin startproject' using Django 4.2.4.
+
+For more information on this file, see
+https://docs.djangoproject.com/en/4.2/topics/settings/
+
+For the full list of settings and their values, see
+https://docs.djangoproject.com/en/4.2/ref/settings/
+"""
+
+from pathlib import Path
+
+# Build paths inside the project like this: BASE_DIR / 'subdir'.
+BASE_DIR = Path(__file__).resolve().parent.parent
+
+
+# Quick-start development settings - unsuitable for production
+# See https://docs.djangoproject.com/en/4.2/howto/deployment/checklist/
+
+# SECURITY WARNING: keep the secret key used in production secret!
+SECRET_KEY = 'django-insecure-qqr#f(i3uf%m8%8u35vn=ov-uk(*8!a&1t-hxa%ev2^t1%j&sm'
+
+# SECURITY WARNING: don't run with debug turned on in production!
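# A hardened variant would read both of the values above and below from the
# environment rather than hard-coding them -- a minimal sketch, not part of
# this patch; the DJANGO_SECRET_KEY and DJANGO_DEBUG variable names are
# assumptions:
#
#     import os
#     SECRET_KEY = os.environ["DJANGO_SECRET_KEY"]
#     DEBUG = os.environ.get("DJANGO_DEBUG", "") == "1"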
+DEBUG = True
+
+ALLOWED_HOSTS = []
+
+
+# Application definition
+
+INSTALLED_APPS = [
+    'django.contrib.admin',
+    'django.contrib.auth',
+    'django.contrib.contenttypes',
+    'django.contrib.sessions',
+    'django.contrib.messages',
+    'django.contrib.staticfiles',
+]
+
+MIDDLEWARE = [
+    'django.middleware.security.SecurityMiddleware',
+    'django.contrib.sessions.middleware.SessionMiddleware',
+    'django.middleware.common.CommonMiddleware',
+    'django.middleware.csrf.CsrfViewMiddleware',
+    'django.contrib.auth.middleware.AuthenticationMiddleware',
+    'django.contrib.messages.middleware.MessageMiddleware',
+    'django.middleware.clickjacking.XFrameOptionsMiddleware',
+]
+
+ROOT_URLCONF = 'app.urls'
+
+TEMPLATES = [
+    {
+        'BACKEND': 'django.template.backends.django.DjangoTemplates',
+        'DIRS': [],
+        'APP_DIRS': True,
+        'OPTIONS': {
+            'context_processors': [
+                'django.template.context_processors.debug',
+                'django.template.context_processors.request',
+                'django.contrib.auth.context_processors.auth',
+                'django.contrib.messages.context_processors.messages',
+            ],
+        },
+    },
+]
+
+WSGI_APPLICATION = 'app.wsgi.application'
+
+
+# Database
+# https://docs.djangoproject.com/en/4.2/ref/settings/#databases
+
+DATABASES = {
+    'default': {
+        'ENGINE': 'django.db.backends.sqlite3',
+        'NAME': BASE_DIR / 'db.sqlite3',
+    }
+}
+
+
+# Password validation
+# https://docs.djangoproject.com/en/4.2/ref/settings/#auth-password-validators
+
+AUTH_PASSWORD_VALIDATORS = [
+    {
+        'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
+    },
+    {
+        'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
+    },
+    {
+        'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
+    },
+    {
+        'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
+    },
+]
+
+
+# Internationalization
+# https://docs.djangoproject.com/en/4.2/topics/i18n/
+
+LANGUAGE_CODE = 'en-us'
+
+TIME_ZONE = 'UTC'
+
+USE_I18N = True
+
+USE_TZ = True
+
+
+# Static files (CSS, JavaScript, Images)
+# https://docs.djangoproject.com/en/4.2/howto/static-files/
+
+STATIC_URL = 'static/'
+
+# Default primary key field type
+# https://docs.djangoproject.com/en/4.2/ref/settings/#default-auto-field
+
+DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'
diff --git a/app/urls.py b/app/urls.py
new file mode 100644
index 0000000..84b3189
--- /dev/null
+++ b/app/urls.py
@@ -0,0 +1,22 @@
+"""
+URL configuration for app project.
+
+The `urlpatterns` list routes URLs to views. For more information please see:
+    https://docs.djangoproject.com/en/4.2/topics/http/urls/
+Examples:
+Function views
+    1. Add an import:  from my_app import views
+    2. Add a URL to urlpatterns:  path('', views.home, name='home')
+Class-based views
+    1. Add an import:  from other_app.views import Home
+    2. Add a URL to urlpatterns:  path('', Home.as_view(), name='home')
+Including another URLconf
+    1. Import the include() function: from django.urls import include, path
+    2. Add a URL to urlpatterns:  path('blog/', include('blog.urls'))
+"""
+from django.contrib import admin
+from django.urls import path
+
+urlpatterns = [
+    path('admin/', admin.site.urls),
+]
diff --git a/app/wsgi.py b/app/wsgi.py
new file mode 100644
index 0000000..ef30895
--- /dev/null
+++ b/app/wsgi.py
@@ -0,0 +1,16 @@
+"""
+WSGI config for app project.
+
+It exposes the WSGI callable as a module-level variable named ``application``.
+ +For more information on this file, see +https://docs.djangoproject.com/en/4.2/howto/deployment/wsgi/ +""" + +import os + +from django.core.wsgi import get_wsgi_application + +os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'app.settings') + +application = get_wsgi_application() diff --git a/manage.py b/manage.py new file mode 100755 index 0000000..4931389 --- /dev/null +++ b/manage.py @@ -0,0 +1,22 @@ +#!/usr/bin/env python +"""Django's command-line utility for administrative tasks.""" +import os +import sys + + +def main(): + """Run administrative tasks.""" + os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'app.settings') + try: + from django.core.management import execute_from_command_line + except ImportError as exc: + raise ImportError( + "Couldn't import Django. Are you sure it's installed and " + "available on your PYTHONPATH environment variable? Did you " + "forget to activate a virtual environment?" + ) from exc + execute_from_command_line(sys.argv) + + +if __name__ == '__main__': + main() diff --git a/poetry.lock b/poetry.lock index b364b25..f76f627 100644 --- a/poetry.lock +++ b/poetry.lock @@ -387,6 +387,27 @@ files = [ {file = "cymem-2.0.7.tar.gz", hash = "sha256:e6034badb5dd4e10344211c81f16505a55553a7164adc314c75bd80cf07e57a8"}, ] +[[package]] +name = "django" +version = "4.2.4" +description = "A high-level Python web framework that encourages rapid development and clean, pragmatic design." +category = "main" +optional = false +python-versions = ">=3.8" +files = [ + {file = "Django-4.2.4-py3-none-any.whl", hash = "sha256:860ae6a138a238fc4f22c99b52f3ead982bb4b1aad8c0122bcd8c8a3a02e409d"}, + {file = "Django-4.2.4.tar.gz", hash = "sha256:7e4225ec065e0f354ccf7349a22d209de09cc1c074832be9eb84c51c1799c432"}, +] + +[package.dependencies] +asgiref = ">=3.6.0,<4" +sqlparse = ">=0.3.1" +tzdata = {version = "*", markers = "sys_platform == \"win32\""} + +[package.extras] +argon2 = ["argon2-cffi (>=19.1.0)"] +bcrypt = ["bcrypt"] + [[package]] name = "exceptiongroup" version = "1.1.1" @@ -1687,6 +1708,23 @@ files = [ {file = "spacy_loggers-1.0.4-py3-none-any.whl", hash = "sha256:e050bf2e63208b2f096b777e494971c962ad7c1dc997641c8f95c622550044ae"}, ] +[[package]] +name = "sqlparse" +version = "0.4.4" +description = "A non-validating SQL parser." 
+category = "main" +optional = false +python-versions = ">=3.5" +files = [ + {file = "sqlparse-0.4.4-py3-none-any.whl", hash = "sha256:5430a4fe2ac7d0f93e66f1efc6e1338a41884b7ddf2a350cedd20ccc4d9d28f3"}, + {file = "sqlparse-0.4.4.tar.gz", hash = "sha256:d446183e84b8349fa3061f0fe7f06ca94ba65b426946ffebe6e3e8295332420c"}, +] + +[package.extras] +dev = ["build", "flake8"] +doc = ["sphinx"] +test = ["pytest", "pytest-cov"] + [[package]] name = "srsly" version = "2.4.6" @@ -1890,6 +1928,18 @@ files = [ {file = "typing_extensions-4.5.0.tar.gz", hash = "sha256:5cb5f4a79139d699607b3ef622a1dedafa84e115ab0024e0d9c044a9479ca7cb"}, ] +[[package]] +name = "tzdata" +version = "2023.3" +description = "Provider of IANA time zone data" +category = "main" +optional = false +python-versions = ">=2" +files = [ + {file = "tzdata-2023.3-py2.py3-none-any.whl", hash = "sha256:7e65763eef3120314099b6939b5546db7adce1e7d6f2e179e3df563c70511eda"}, + {file = "tzdata-2023.3.tar.gz", hash = "sha256:11ef1e08e54acb0d4f95bdb1be05da659673de4acbd21bf9c69e94cc5e907a3a"}, +] + [[package]] name = "ujson" version = "4.3.0" @@ -2069,9 +2119,9 @@ cffi = {version = ">=1.11", markers = "platform_python_implementation == \"PyPy\ cffi = ["cffi (>=1.11)"] [extras] -indexer = ["ujson", "warcio", "idna", "beautifulsoup4", "lxml", "jusText", "langdetect", "pyarrow", "pyspark", "Levenshtein"] +indexer = ["ujson", "warcio", "idna", "beautifulsoup4", "lxml", "langdetect", "pyarrow", "pyspark", "Levenshtein"] [metadata] lock-version = "2.0" python-versions = ">=3.10,<3.11" -content-hash = "65eb26c5a9dda0e504632fa433334157fe5c1cbe19135ab18d59c74f47249a0b" +content-hash = "b40fcb8fc85427fe91b9ba334eaa21bc58c746471650121fbf0846c859c6999e" diff --git a/pyproject.toml b/pyproject.toml index 2def9aa..ac3b8af 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,6 +33,7 @@ langdetect = {version= "==1.0.9", optional = true} pyarrow = {version= "==6.0.0", optional = true} pyspark = {version= "==3.2.0", optional = true} Levenshtein = {version= "==0.16.0", optional = true} +django = "^4.2.4" [tool.poetry.extras] indexer = [ From 19cc196e346bfdc5255dd999552f8ae5e492c83e Mon Sep 17 00:00:00 2001 From: Daoud Clarke Date: Fri, 22 Sep 2023 19:56:42 +0100 Subject: [PATCH 2/8] Add django ninja --- app/api.py | 8 ++++++++ app/urls.py | 3 +++ poetry.lock | 23 ++++++++++++++++++++++- pyproject.toml | 1 + 4 files changed, 34 insertions(+), 1 deletion(-) create mode 100644 app/api.py diff --git a/app/api.py b/app/api.py new file mode 100644 index 0000000..539f200 --- /dev/null +++ b/app/api.py @@ -0,0 +1,8 @@ +from ninja import NinjaAPI + +api = NinjaAPI(version="1.0.0") + + +@api.get("/hello") +def hello(request): + return {"response": "Hello world"} diff --git a/app/urls.py b/app/urls.py index 84b3189..6338358 100644 --- a/app/urls.py +++ b/app/urls.py @@ -17,6 +17,9 @@ Including another URLconf from django.contrib import admin from django.urls import path +from app.api import api + urlpatterns = [ path('admin/', admin.site.urls), + path('api/v1/', api.urls) ] diff --git a/poetry.lock b/poetry.lock index f76f627..d9f91b0 100644 --- a/poetry.lock +++ b/poetry.lock @@ -408,6 +408,27 @@ tzdata = {version = "*", markers = "sys_platform == \"win32\""} argon2 = ["argon2-cffi (>=19.1.0)"] bcrypt = ["bcrypt"] +[[package]] +name = "django-ninja" +version = "0.22.2" +description = "Django Ninja - Fast Django REST framework" +category = "main" +optional = false +python-versions = ">=3.7" +files = [ + {file = "django_ninja-0.22.2-py3-none-any.whl", hash = 
"sha256:c53b098a8190f373ea2605c276a6061d48b2626500a9c6b9916c503e4b4a20eb"}, + {file = "django_ninja-0.22.2.tar.gz", hash = "sha256:913ebde7571d6a6968c9ac0b9e8a24680c46444d44fdd552f8831dbeede1292c"}, +] + +[package.dependencies] +Django = ">=2.2" +pydantic = ">=1.6,<2.0.0" + +[package.extras] +dev = ["pre-commit"] +doc = ["markdown-include", "mkdocs", "mkdocs-material", "mkdocstrings"] +test = ["black", "django-stubs", "flake8", "isort", "mypy (==0.931)", "psycopg2-binary", "pytest", "pytest-asyncio", "pytest-cov", "pytest-django"] + [[package]] name = "exceptiongroup" version = "1.1.1" @@ -2124,4 +2145,4 @@ indexer = ["ujson", "warcio", "idna", "beautifulsoup4", "lxml", "langdetect", "p [metadata] lock-version = "2.0" python-versions = ">=3.10,<3.11" -content-hash = "b40fcb8fc85427fe91b9ba334eaa21bc58c746471650121fbf0846c859c6999e" +content-hash = "9e8a04eb23e361a1493232289085251695eaeb08e165a36ad2ceeed03ad825cb" diff --git a/pyproject.toml b/pyproject.toml index ac3b8af..455d5b0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,6 +34,7 @@ pyarrow = {version= "==6.0.0", optional = true} pyspark = {version= "==3.2.0", optional = true} Levenshtein = {version= "==0.16.0", optional = true} django = "^4.2.4" +django-ninja = "^0.22.2" [tool.poetry.extras] indexer = [ From 86a6524f0ab69b4ca4020b5e12f7fb3d07e6dfd5 Mon Sep 17 00:00:00 2001 From: Daoud Clarke Date: Sun, 24 Sep 2023 08:09:18 +0100 Subject: [PATCH 3/8] WIP add search API to Django --- app/api.py | 20 ++++++++++++++++++++ app/settings.py | 6 ++++++ mwmbl/tinysearchengine/search.py | 6 +++--- 3 files changed, 29 insertions(+), 3 deletions(-) diff --git a/app/api.py b/app/api.py index 539f200..2c1f4e5 100644 --- a/app/api.py +++ b/app/api.py @@ -1,7 +1,27 @@ +from pathlib import Path + from ninja import NinjaAPI +from app import settings +from mwmbl.indexer.paths import INDEX_NAME +from mwmbl.tinysearchengine import search +from mwmbl.tinysearchengine.completer import Completer +from mwmbl.tinysearchengine.indexer import TinyIndex, Document +from mwmbl.tinysearchengine.rank import HeuristicRanker + api = NinjaAPI(version="1.0.0") +index_path = Path(settings.DATA_PATH) / INDEX_NAME +tiny_index = TinyIndex(item_factory=Document, index_path=index_path) +tiny_index.__enter__() + +completer = Completer() +ranker = HeuristicRanker(tiny_index, completer) + +search_router = search.create_router(ranker) + +api.add_router("/search/", search_router) + @api.get("/hello") def hello(request): diff --git a/app/settings.py b/app/settings.py index a067541..f483d35 100644 --- a/app/settings.py +++ b/app/settings.py @@ -121,3 +121,9 @@ STATIC_URL = 'static/' # https://docs.djangoproject.com/en/4.2/ref/settings/#default-auto-field DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField' + +# ===================== Custom Settings ========================= + +DATA_PATH = "./devdata" + + diff --git a/mwmbl/tinysearchengine/search.py b/mwmbl/tinysearchengine/search.py index f9ecace..bd8e54a 100644 --- a/mwmbl/tinysearchengine/search.py +++ b/mwmbl/tinysearchengine/search.py @@ -1,6 +1,6 @@ from logging import getLogger -from fastapi import APIRouter +from ninja import Router from mwmbl.tinysearchengine.rank import HeuristicRanker @@ -10,8 +10,8 @@ logger = getLogger(__name__) SCORE_THRESHOLD = 0.25 -def create_router(ranker: HeuristicRanker) -> APIRouter: - router = APIRouter(prefix="/search", tags=["search"]) +def create_router(ranker: HeuristicRanker) -> Router: + router = Router(tags=["search"]) @router.get("") def search(s: str): From 
db658daa883479b0d62025d493823b179eae3637 Mon Sep 17 00:00:00 2001
From: Daoud Clarke
Date: Thu, 28 Sep 2023 17:48:29 +0100
Subject: [PATCH 4/8] Store stats in redis

---
 mwmbl/crawler/stats.py |  77 ++++++++++++++++++++++++
 poetry.lock            | 133 ++++++++++++++++++++++++++++++++++++++++++++++++-
 pyproject.toml         |   1 +
 3 files changed, 210 insertions(+), 1 deletion(-)
 create mode 100644 mwmbl/crawler/stats.py

diff --git a/mwmbl/crawler/stats.py b/mwmbl/crawler/stats.py
new file mode 100644
index 0000000..c71973c
--- /dev/null
+++ b/mwmbl/crawler/stats.py
@@ -0,0 +1,77 @@
+import gzip
+import json
+from datetime import datetime
+from glob import glob
+from itertools import islice
+from logging import getLogger
+from urllib.parse import urlparse
+
+from redis import Redis
+
+from mwmbl.crawler.batch import HashedBatch
+from mwmbl.indexer.update_urls import get_datetime_from_timestamp
+
+logger = getLogger(__name__)
+
+URL_DATE_COUNT_KEY = "url-count-{date}"
+URL_HOUR_COUNT_KEY = "url-count-hour-{hour}"
+USER_COUNT_KEY = "user-count-{date}"
+HOST_COUNT_KEY = "host-count-{date}"
+EXPIRE_SECONDS = 60*60*24
+
+
+class StatsManager:
+    def __init__(self, redis: Redis):
+        self.redis = redis
+
+    def record_batch(self, hashed_batch: HashedBatch):
+        date_time = get_datetime_from_timestamp(hashed_batch.timestamp)
+
+        num_crawled_urls = sum(1 for item in hashed_batch.items if item.content is not None)
+
+        url_count_key = URL_DATE_COUNT_KEY.format(date=date_time.date)
+        self.redis.incrby(url_count_key, num_crawled_urls)
+        self.redis.expire(url_count_key, EXPIRE_SECONDS)
+
+        hour = datetime(date_time.year, date_time.month, date_time.day, date_time.hour)
+        hour_key = URL_HOUR_COUNT_KEY.format(hour=hour)
+        self.redis.incrby(hour_key, num_crawled_urls)
+        self.redis.expire(hour_key, EXPIRE_SECONDS)
+
+        user_count_key = USER_COUNT_KEY.format(date=date_time.date)
+        self.redis.zincrby(user_count_key, num_crawled_urls, hashed_batch.user_id_hash)
+        self.redis.expire(user_count_key, EXPIRE_SECONDS)
+
+        host_key = HOST_COUNT_KEY.format(date=date_time.date)
+        for item in hashed_batch.items:
+            if item.content is None:
+                continue
+
+            host = urlparse(item.url).netloc
+            self.redis.zincrby(host_key, 1, host)
+            self.redis.expire(host_key, EXPIRE_SECONDS)
+
+    def get_stats(self):
+        pass
+
+
+def get_test_batches():
+    for path in glob("./devdata/batches/**/*.json.gz", recursive=True):
+        print("Processing path", path)
+        with gzip.open(path) as gzip_file:
+            yield HashedBatch.parse_raw(gzip_file.read())
+
+
+if __name__ == '__main__':
+    redis = Redis(host='localhost', port=6379, decode_responses=True)
+    stats = StatsManager(redis)
+    batches = get_test_batches()
+    start = datetime.now()
+    processed = 0
+    for batch in islice(batches, 100):
+        stats.record_batch(batch)
+        processed += 1
+    total_time = (datetime.now() - start).total_seconds()
+    print("Processed", processed)
+    print("Total time", total_time)
+    print("Time per batch", total_time/processed)
diff --git a/poetry.lock b/poetry.lock
index 6c15500..f2e3102 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -40,6 +40,18 @@ typing-extensions = {version = ">=4", markers = "python_version < \"3.11\""}
 
 [package.extras]
 tests = ["mypy (>=0.800)", "pytest", "pytest-asyncio"]
 
+[[package]]
+name = "async-timeout"
+version = "4.0.3"
+description = "Timeout context manager for asyncio programs"
+category = "main"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "async-timeout-4.0.3.tar.gz", hash = "sha256:4640d96be84d82d02ed59ea2b7105a0f7b33abe8703703cd0ab0bf87c427522f"},
+    {file =
"async_timeout-4.0.3-py3-none-any.whl", hash = "sha256:7405140ff1230c310e51dc27b3145b9092d659ce68ff733fb0cefe3ee42be028"}, +] + [[package]] name = "attrs" version = "23.1.0" @@ -538,6 +550,105 @@ files = [ {file = "h11-0.14.0.tar.gz", hash = "sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d"}, ] +[[package]] +name = "hiredis" +version = "2.2.3" +description = "Python wrapper for hiredis" +category = "main" +optional = false +python-versions = ">=3.7" +files = [ + {file = "hiredis-2.2.3-cp310-cp310-macosx_10_12_universal2.whl", hash = "sha256:9a1a80a8fa767f2fdc3870316a54b84fe9fc09fa6ab6a2686783de6a228a4604"}, + {file = "hiredis-2.2.3-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:3f006c28c885deb99b670a5a66f367a175ab8955b0374029bad7111f5357dcd4"}, + {file = "hiredis-2.2.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ffaf841546905d90ff189de7397aa56413b1ce5e54547f17a98f0ebf3a3b0a3b"}, + {file = "hiredis-2.2.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1cadb0ac7ba3babfd804e425946bec9717b320564a1390f163a54af9365a720a"}, + {file = "hiredis-2.2.3-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:33bc4721632ef9708fa44e5df0066053fccc8e65410a2c48573192517a533b48"}, + {file = "hiredis-2.2.3-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:227c5b4bcb60f89008c275d596e4a7b6625a6b3c827b8a66ae582eace7051f71"}, + {file = "hiredis-2.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:61995eb826009d99ed8590747bc0da683a5f4fbb4faa8788166bf3810845cd5c"}, + {file = "hiredis-2.2.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6f969edc851efe23010e0f53a64269f2629a9364135e9ec81c842e8b2277d0c1"}, + {file = "hiredis-2.2.3-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:d27e560eefb57914d742a837f1da98d3b29cb22eff013c8023b7cf52ae6e051d"}, + {file = "hiredis-2.2.3-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:3759f4789ae1913b7df278dfc9e8749205b7a106f888cd2903d19461e24a7697"}, + {file = "hiredis-2.2.3-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:c6cb613148422c523945cdb8b6bed617856f2602fd8750e33773ede2616e55d5"}, + {file = "hiredis-2.2.3-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:1d274d5c511dfc03f83f997d3238eaa9b6ee3f982640979f509373cced891e98"}, + {file = "hiredis-2.2.3-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:3b7fe075e91b9d9cff40eba4fb6a8eff74964d3979a39be9a9ef58b1b4cb3604"}, + {file = "hiredis-2.2.3-cp310-cp310-win32.whl", hash = "sha256:77924b0d32fd1f493d3df15d9609ddf9d94c31a364022a6bf6b525ce9da75bea"}, + {file = "hiredis-2.2.3-cp310-cp310-win_amd64.whl", hash = "sha256:dcb0569dd5bfe6004658cd0f229efa699a3169dcb4f77bd72e188adda302063d"}, + {file = "hiredis-2.2.3-cp311-cp311-macosx_10_12_universal2.whl", hash = "sha256:d115790f18daa99b5c11a506e48923b630ef712e9e4b40482af942c3d40638b8"}, + {file = "hiredis-2.2.3-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:4c3b8be557e08b234774925622e196f0ee36fe4eab66cd19df934d3efd8f3743"}, + {file = "hiredis-2.2.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:3f5446068197b35a11ccc697720c41879c8657e2e761aaa8311783aac84cef20"}, + {file = "hiredis-2.2.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aa17a3b22b3726d54d7af20394f65d4a1735a842a4e0f557dc67a90f6965c4bc"}, + {file = "hiredis-2.2.3-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:7df645b6b7800e8b748c217fbd6a4ca8361bcb9a1ae6206cc02377833ec8a1aa"}, + {file = "hiredis-2.2.3-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2fb9300959a0048138791f3d68359d61a788574ec9556bddf1fec07f2dbc5320"}, + {file = "hiredis-2.2.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2d7e459fe7313925f395148d36d9b7f4f8dac65be06e45d7af356b187cef65fc"}, + {file = "hiredis-2.2.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8eceffca3941775b646cd585cd19b275d382de43cc3327d22f7c75d7b003d481"}, + {file = "hiredis-2.2.3-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:b17baf702c6e5b4bb66e1281a3efbb1d749c9d06cdb92b665ad81e03118f78fc"}, + {file = "hiredis-2.2.3-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:4e43e2b5acaad09cf48c032f7e4926392bb3a3f01854416cf6d82ebff94d5467"}, + {file = "hiredis-2.2.3-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:a7205497d7276a81fe92951a29616ef96562ed2f91a02066f72b6f93cb34b40e"}, + {file = "hiredis-2.2.3-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:126623b03c31cb6ac3e0d138feb6fcc36dd43dd34fc7da7b7a0c38b5d75bc896"}, + {file = "hiredis-2.2.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:071c5814b850574036506a8118034f97c3cbf2fe9947ff45a27b07a48da56240"}, + {file = "hiredis-2.2.3-cp311-cp311-win32.whl", hash = "sha256:d1be9e30e675f5bc1cb534633324578f6f0944a1bcffe53242cf632f554f83b6"}, + {file = "hiredis-2.2.3-cp311-cp311-win_amd64.whl", hash = "sha256:b9a7c987e161e3c58f992c63b7e26fea7fe0777f3b975799d23d65bbb8cb5899"}, + {file = "hiredis-2.2.3-cp37-cp37m-macosx_10_12_x86_64.whl", hash = "sha256:f2dcb8389fa3d453927b1299f46bdb38473c293c8269d5c777d33ea0e526b610"}, + {file = "hiredis-2.2.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a2df98f5e071320c7d84e8bd07c0542acdd0a7519307fc31774d60e4b842ec4f"}, + {file = "hiredis-2.2.3-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:61a72e4a523cdfc521762137559c08dfa360a3caef63620be58c699d1717dac1"}, + {file = "hiredis-2.2.3-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c9b9e5bde7030cae83aa900b5bd660decc65afd2db8c400f3c568c815a47ca2a"}, + {file = "hiredis-2.2.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cd2614f17e261f72efc2f19f5e5ff2ee19e2296570c0dcf33409e22be30710de"}, + {file = "hiredis-2.2.3-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:46525fbd84523cac75af5bf524bc74aaac848beaf31b142d2df8a787d9b4bbc4"}, + {file = "hiredis-2.2.3-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:d1a4ce40ba11da9382c14da31f4f9e88c18f7d294f523decd0fadfb81f51ad18"}, + {file = "hiredis-2.2.3-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:5cda592405bbd29d53942e0389dc3fa77b49c362640210d7e94a10c14a677d4d"}, + {file = "hiredis-2.2.3-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:5e6674a017629284ef373b50496d9fb1a89b85a20a7fa100ecd109484ec748e5"}, + {file = "hiredis-2.2.3-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:e62ec131816c6120eff40dffe43424e140264a15fa4ab88c301bd6a595913af3"}, + {file = "hiredis-2.2.3-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:17e938d9d3ee92e1adbff361706f1c36cc60eeb3e3eeca7a3a353eae344f4c91"}, + {file = "hiredis-2.2.3-cp37-cp37m-win32.whl", hash = "sha256:95d2305fd2a7b179cacb48b10f618872fc565c175f9f62b854e8d1acac3e8a9e"}, + {file = "hiredis-2.2.3-cp37-cp37m-win_amd64.whl", hash = 
"sha256:8f9dbe12f011a9b784f58faecc171d22465bb532c310bd588d769ba79a59ef5a"}, + {file = "hiredis-2.2.3-cp38-cp38-macosx_10_12_universal2.whl", hash = "sha256:5a4bcef114fc071d5f52c386c47f35aae0a5b43673197b9288a15b584da8fa3a"}, + {file = "hiredis-2.2.3-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:232d0a70519865741ba56e1dfefd160a580ae78c30a1517bad47b3cf95a3bc7d"}, + {file = "hiredis-2.2.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:9076ce8429785c85f824650735791738de7143f61f43ae9ed83e163c0ca0fa44"}, + {file = "hiredis-2.2.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ec58fb7c2062f835595c12f0f02dcda76d0eb0831423cc191d1e18c9276648de"}, + {file = "hiredis-2.2.3-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7f2b34a6444b8f9c1e9f84bd2c639388e5d14f128afd14a869dfb3d9af893aa2"}, + {file = "hiredis-2.2.3-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:818dfd310aa1020a13cd08ee48e116dd8c3bb2e23b8161f8ac4df587dd5093d7"}, + {file = "hiredis-2.2.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:96d9ea6c8d4cbdeee2e0d43379ce2881e4af0454b00570677c59f33f2531cd38"}, + {file = "hiredis-2.2.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f1eadbcd3de55ac42310ff82550d3302cb4efcd4e17d76646a17b6e7004bb42b"}, + {file = "hiredis-2.2.3-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:477c34c4489666dc73cb5e89dafe2617c3e13da1298917f73d55aac4696bd793"}, + {file = "hiredis-2.2.3-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:14824e457e4f5cda685c3345d125da13949bcf3bb1c88eb5d248c8d2c3dee08f"}, + {file = "hiredis-2.2.3-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:9cd32326dfa6ce87edf754153b0105aca64486bebe93b9600ccff74fa0b224df"}, + {file = "hiredis-2.2.3-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:51341e70b467004dcbec3a6ce8c478d2d6241e0f6b01e4c56764afd5022e1e9d"}, + {file = "hiredis-2.2.3-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:2443659c76b226267e2a04dbbb21bc2a3f91aa53bdc0c22964632753ae43a247"}, + {file = "hiredis-2.2.3-cp38-cp38-win32.whl", hash = "sha256:4e3e3e31423f888d396b1fc1f936936e52af868ac1ec17dd15e3eeba9dd4de24"}, + {file = "hiredis-2.2.3-cp38-cp38-win_amd64.whl", hash = "sha256:20f509e3a1a20d6e5f5794fc37ceb21f70f409101fcfe7a8bde783894d51b369"}, + {file = "hiredis-2.2.3-cp39-cp39-macosx_10_12_universal2.whl", hash = "sha256:d20891e3f33803b26d54c77fd5745878497091e33f4bbbdd454cf6e71aee8890"}, + {file = "hiredis-2.2.3-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:50171f985e17970f87d5a29e16603d1e5b03bdbf5c2691a37e6c912942a6b657"}, + {file = "hiredis-2.2.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9944a2cac25ffe049a7e89f306e11b900640837d1ef38d9be0eaa4a4e2b73a52"}, + {file = "hiredis-2.2.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5a5c8019ff94988d56eb49b15de76fe83f6b42536d76edeb6565dbf7fe14b973"}, + {file = "hiredis-2.2.3-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a286ded34eb16501002e3713b3130c987366eee2ba0d58c33c72f27778e31676"}, + {file = "hiredis-2.2.3-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4b3e974ad15eb32b1f537730dea70b93a4c3db7b026de3ad2b59da49c6f7454d"}, + {file = "hiredis-2.2.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:08415ea74c1c29b9d6a4ca3dd0e810dc1af343c1d1d442e15ba133b11ab5be6a"}, + {file = 
"hiredis-2.2.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7e17d04ea58ab8cf3f2dc52e875db16077c6357846006780086fff3189fb199d"}, + {file = "hiredis-2.2.3-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:6ccdcb635dae85b006592f78e32d97f4bc7541cb27829d505f9c7fefcef48298"}, + {file = "hiredis-2.2.3-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:69536b821dd1bc78058a6e7541743f8d82bf2d981b91280b14c4daa6cdc7faba"}, + {file = "hiredis-2.2.3-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:3753df5f873d473f055e1f8837bfad0bd3b277c86f3c9bf058c58f14204cd901"}, + {file = "hiredis-2.2.3-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:6f88cafe46612b6fa68e6dea49e25bebf160598bba00101caa51cc8c1f18d597"}, + {file = "hiredis-2.2.3-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:33ee3ea5cad3a8cb339352cd230b411eb437a2e75d7736c4899acab32056ccdb"}, + {file = "hiredis-2.2.3-cp39-cp39-win32.whl", hash = "sha256:b4f3d06dc16671b88a13ae85d8ca92534c0b637d59e49f0558d040a691246422"}, + {file = "hiredis-2.2.3-cp39-cp39-win_amd64.whl", hash = "sha256:4f674e309cd055ee7a48304ceb8cf43265d859faf4d7d01d270ce45e976ae9d3"}, + {file = "hiredis-2.2.3-pp37-pypy37_pp73-macosx_10_12_x86_64.whl", hash = "sha256:8f280ab4e043b089777b43b4227bdc2035f88da5072ab36588e0ccf77d45d058"}, + {file = "hiredis-2.2.3-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:15c2a551f3b8a26f7940d6ee10b837810201754b8d7e6f6b1391655370882c5a"}, + {file = "hiredis-2.2.3-pp37-pypy37_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:60c4e3c258eafaab21b174b17270a0cc093718d61cdbde8c03f85ec4bf835343"}, + {file = "hiredis-2.2.3-pp37-pypy37_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cc36a9dded458d4e37492fe3e619c6c83caae794d26ad925adbce61d592f8428"}, + {file = "hiredis-2.2.3-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:4ed68a3b1ccb4313d2a42546fd7e7439ad4745918a48b6c9bcaa61e1e3e42634"}, + {file = "hiredis-2.2.3-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:3bf4b5bae472630c229518e4a814b1b68f10a3d9b00aeaec45f1a330f03a0251"}, + {file = "hiredis-2.2.3-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:33a94d264e6e12a79d9bb8af333b01dc286b9f39c99072ab5fef94ce1f018e17"}, + {file = "hiredis-2.2.3-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3fa6811a618653164f918b891a0fa07052bd71a799defa5c44d167cac5557b26"}, + {file = "hiredis-2.2.3-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:af33f370be90b48bbaf0dab32decbdcc522b1fa95d109020a963282086518a8e"}, + {file = "hiredis-2.2.3-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:b9953d87418ac228f508d93898ab572775e4d3b0eeb886a1a7734553bcdaf291"}, + {file = "hiredis-2.2.3-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:5e7bb4dd524f50b71c20ef5a12bd61da9b463f8894b18a06130942fe31509881"}, + {file = "hiredis-2.2.3-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:89a258424158eb8b3ed9f65548d68998da334ef155d09488c5637723eb1cd697"}, + {file = "hiredis-2.2.3-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9f4a65276f6ecdebe75f2a53f578fbc40e8d2860658420d5e0611c56bbf5054c"}, + {file = "hiredis-2.2.3-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:334f2738700b20faa04a0d813366fb16ed17287430a6b50584161d5ad31ca6d7"}, + {file = "hiredis-2.2.3-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:d194decd9608f11c777946f596f31d5aacad13972a0a87829ae1e6f2d26c1885"}, + {file = "hiredis-2.2.3.tar.gz", hash = "sha256:e75163773a309e56a9b58165cf5a50e0f84b755f6ff863b2c01a38918fe92daa"}, +] + [[package]] name = "idna" version = "3.3" @@ -1536,6 +1647,26 @@ files = [ [package.extras] full = ["numpy"] +[[package]] +name = "redis" +version = "5.0.1" +description = "Python client for Redis database and key-value store" +category = "main" +optional = false +python-versions = ">=3.7" +files = [ + {file = "redis-5.0.1-py3-none-any.whl", hash = "sha256:ed4802971884ae19d640775ba3b03aa2e7bd5e8fb8dfaed2decce4d0fc48391f"}, + {file = "redis-5.0.1.tar.gz", hash = "sha256:0dab495cd5753069d3bc650a0dde8a8f9edde16fc5691b689a566eda58100d0f"}, +] + +[package.dependencies] +async-timeout = {version = ">=4.0.2", markers = "python_full_version <= \"3.11.2\""} +hiredis = {version = ">=1.0.0", optional = true, markers = "extra == \"hiredis\""} + +[package.extras] +hiredis = ["hiredis (>=1.0.0)"] +ocsp = ["cryptography (>=36.0.1)", "pyopenssl (==20.0.1)", "requests (>=2.26.0)"] + [[package]] name = "requests" version = "2.31.0" @@ -2299,4 +2430,4 @@ indexer = ["ujson", "warcio", "idna", "beautifulsoup4", "lxml", "langdetect", "p [metadata] lock-version = "2.0" python-versions = ">=3.10,<3.11" -content-hash = "0f7cb7135a2dd6ec11e37eb3bbc28071662f80a8ec40398db62c411992bd1b2a" +content-hash = "fe5f238c57ec2d09acb6bdf8f46f33c7bbe499f68a7e34ab7bca1336e0ae881c" diff --git a/pyproject.toml b/pyproject.toml index c52dd5f..4a4a725 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,6 +36,7 @@ Levenshtein = {version= "==0.16.0", optional = true} django = "^4.2.4" django-ninja = "^0.22.2" requests-cache = "^1.1.0" +redis = {extras = ["hiredis"], version = "^5.0.1"} [tool.poetry.extras] indexer = [ From b6fd27352bfd63498bca8b3eaa2cd959a587ecb9 Mon Sep 17 00:00:00 2001 From: Daoud Clarke Date: Sun, 8 Oct 2023 14:13:38 +0100 Subject: [PATCH 5/8] Add crawler router --- app/api.py | 13 ++++++++----- app/urls.py | 2 +- mwmbl/crawler/app.py | 33 ++++++++++++++++---------------- mwmbl/crawler/batch.py | 14 +++++++------- mwmbl/main.py | 2 +- mwmbl/tinysearchengine/search.py | 4 ++-- 6 files changed, 35 insertions(+), 33 deletions(-) diff --git a/app/api.py b/app/api.py index 2c1f4e5..af959e9 100644 --- a/app/api.py +++ b/app/api.py @@ -1,9 +1,12 @@ +from multiprocessing import Queue from pathlib import Path from ninja import NinjaAPI from app import settings -from mwmbl.indexer.paths import INDEX_NAME +import mwmbl.crawler.app as crawler +from mwmbl.indexer.batch_cache import BatchCache +from mwmbl.indexer.paths import INDEX_NAME, BATCH_DIR_NAME from mwmbl.tinysearchengine import search from mwmbl.tinysearchengine.completer import Completer from mwmbl.tinysearchengine.indexer import TinyIndex, Document @@ -19,10 +22,10 @@ completer = Completer() ranker = HeuristicRanker(tiny_index, completer) search_router = search.create_router(ranker) - api.add_router("/search/", search_router) +batch_cache = BatchCache(Path(settings.DATA_PATH) / BATCH_DIR_NAME) -@api.get("/hello") -def hello(request): - return {"response": "Hello world"} +queued_batches = Queue() +crawler_router = crawler.create_router(batch_cache=batch_cache, queued_batches=queued_batches) +api.add_router("/crawler/", crawler_router) diff --git a/app/urls.py b/app/urls.py index 6338358..440a2f4 100644 --- a/app/urls.py +++ 
b/app/urls.py @@ -21,5 +21,5 @@ from app.api import api urlpatterns = [ path('admin/', admin.site.urls), - path('api/v1/', api.urls) + path('', api.urls) ] diff --git a/mwmbl/crawler/app.py b/mwmbl/crawler/app.py index dbfac92..bda2dc8 100644 --- a/mwmbl/crawler/app.py +++ b/mwmbl/crawler/app.py @@ -14,6 +14,7 @@ from fastapi import HTTPException, APIRouter from justext.core import html_to_dom, ParagraphMaker, classify_paragraphs, revise_paragraph_classification, \ LENGTH_LOW_DEFAULT, STOPWORDS_LOW_DEFAULT, MAX_LINK_DENSITY_DEFAULT, NO_HEADINGS_DEFAULT, LENGTH_HIGH_DEFAULT, \ STOPWORDS_HIGH_DEFAULT, MAX_HEADING_DISTANCE_DEFAULT, DEFAULT_ENCODING, DEFAULT_ENC_ERRORS, preprocessor +from ninja import Router from redis import Redis from mwmbl.crawler.batch import Batch, NewBatchRequest, HashedBatch @@ -82,17 +83,15 @@ def justext_with_dom(html_text, stoplist, length_low=LENGTH_LOW_DEFAULT, return paragraphs, title -def get_router(batch_cache: BatchCache, queued_batches: Queue): - router = APIRouter(prefix="/crawler", tags=["crawler"]) +def create_router(batch_cache: BatchCache, queued_batches: Queue) -> Router: + router = Router(tags=["crawler"]) - @router.on_event("startup") - async def on_startup(): - with Database() as db: - url_db = URLDatabase(db.connection) - return url_db.create_tables() + # TODO: # ensure tables are created before crawler code is used: + # # + # # url_db.create_tables() @router.get('/fetch') - def fetch_url(url: str, query: str): + def fetch_url(request, url: str, query: str): response = requests.get(url) paragraphs, title = justext_with_dom(response.content, justext.get_stoplist("English")) good_paragraphs = [p for p in paragraphs if p.class_type == 'good'] @@ -105,7 +104,7 @@ def get_router(batch_cache: BatchCache, queued_batches: Queue): return format_result(result, query) @router.post('/batches/') - def post_batch(batch: Batch): + def post_batch(request, batch: Batch): if len(batch.items) > MAX_BATCH_SIZE: raise HTTPException(400, f"Batch size too large (maximum {MAX_BATCH_SIZE}), got {len(batch.items)}") @@ -159,7 +158,7 @@ def get_router(batch_cache: BatchCache, queued_batches: Queue): } @router.post('/batches/new') - def request_new_batch(batch_request: NewBatchRequest) -> list[str]: + def request_new_batch(request, batch_request: NewBatchRequest) -> list[str]: user_id_hash = _get_user_id_hash(batch_request) try: urls = queued_batches.get(block=False) @@ -174,14 +173,14 @@ def get_router(batch_cache: BatchCache, queued_batches: Queue): return urls @router.get('/batches/{date_str}/users/{public_user_id}') - def get_batches_for_date_and_user(date_str, public_user_id): + def get_batches_for_date_and_user(request, date_str, public_user_id): check_date_str(date_str) check_public_user_id(public_user_id) prefix = f'1/{VERSION}/{date_str}/1/{public_user_id}/' return get_batch_ids_for_prefix(prefix) @router.get('/batches/{date_str}/users/{public_user_id}/batch/{batch_id}') - def get_batch_from_id(date_str, public_user_id, batch_id): + def get_batch_from_id(request, date_str, public_user_id, batch_id): url = get_batch_url(batch_id, date_str, public_user_id) data = json.loads(gzip.decompress(requests.get(url).content)) return { @@ -189,22 +188,22 @@ def get_router(batch_cache: BatchCache, queued_batches: Queue): 'batch': data, } - @router.get('/latest-batch', response_model=list[HashedBatch]) - def get_latest_batch(): + @router.get('/latest-batch') + def get_latest_batch(request) -> list[HashedBatch]: return [] if last_batch is None else [last_batch] 
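# The conversion pattern applied throughout this hunk, shown on a hypothetical
# /example route (a sketch for illustration only, not part of the patch):
# Django Ninja passes the HttpRequest explicitly as the first argument to each
# operation, and FastAPI's response_model= keyword moves into the return
# annotation, as in the get_latest_batch change above:
#
#     @router.get("/example")
#     def example(request) -> list[str]:
#         return []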
@router.get('/batches/{date_str}/users') - def get_user_id_hashes_for_date(date_str: str): + def get_user_id_hashes_for_date(request, date_str: str): check_date_str(date_str) prefix = f'1/{VERSION}/{date_str}/1/' return get_subfolders(prefix) @router.get('/stats') - def get_stats() -> MwmblStats: + def get_stats(request) -> MwmblStats: return stats_manager.get_stats() @router.get('/') - def status(): + def status(request): return { 'status': 'ok' } diff --git a/mwmbl/crawler/batch.py b/mwmbl/crawler/batch.py index b6b3a35..7d7f064 100644 --- a/mwmbl/crawler/batch.py +++ b/mwmbl/crawler/batch.py @@ -1,21 +1,21 @@ from typing import Optional -from pydantic import BaseModel +from ninja import Schema -class ItemContent(BaseModel): +class ItemContent(Schema): title: str extract: str links: list[str] extra_links: Optional[list[str]] -class ItemError(BaseModel): +class ItemError(Schema): name: str message: Optional[str] -class Item(BaseModel): +class Item(Schema): url: str status: Optional[int] timestamp: int @@ -23,16 +23,16 @@ class Item(BaseModel): error: Optional[ItemError] -class Batch(BaseModel): +class Batch(Schema): user_id: str items: list[Item] -class NewBatchRequest(BaseModel): +class NewBatchRequest(Schema): user_id: str -class HashedBatch(BaseModel): +class HashedBatch(Schema): user_id_hash: str timestamp: int items: list[Item] diff --git a/mwmbl/main.py b/mwmbl/main.py index 3c25209..08518e3 100644 --- a/mwmbl/main.py +++ b/mwmbl/main.py @@ -83,7 +83,7 @@ def run(): app.include_router(search_router) batch_cache = BatchCache(Path(args.data) / BATCH_DIR_NAME) - crawler_router = crawler.get_router(batch_cache, queued_batches) + crawler_router = crawler.create_router(batch_cache, queued_batches) app.include_router(crawler_router) user_router = user.create_router(index_path) diff --git a/mwmbl/tinysearchengine/search.py b/mwmbl/tinysearchengine/search.py index bd8e54a..8dae294 100644 --- a/mwmbl/tinysearchengine/search.py +++ b/mwmbl/tinysearchengine/search.py @@ -14,11 +14,11 @@ def create_router(ranker: HeuristicRanker) -> Router: router = Router(tags=["search"]) @router.get("") - def search(s: str): + def search(request, s: str): return ranker.search(s) @router.get("/complete") - def complete(q: str): + def complete(request, q: str): return ranker.complete(q) return router From a1d6fd8bb13b43a0b07e1f245ccc9c9e84dcb8c9 Mon Sep 17 00:00:00 2001 From: Daoud Clarke Date: Sun, 8 Oct 2023 21:20:32 +0100 Subject: [PATCH 6/8] Start background processes --- app/apps.py | 22 +++++++++++ app/settings.py | 4 +- mwmbl/main.py | 97 ------------------------------------------------- 3 files changed, 24 insertions(+), 99 deletions(-) create mode 100644 app/apps.py delete mode 100644 mwmbl/main.py diff --git a/app/apps.py b/app/apps.py new file mode 100644 index 0000000..d4ad331 --- /dev/null +++ b/app/apps.py @@ -0,0 +1,22 @@ +import os +from multiprocessing import Process, Queue + +from django.apps import AppConfig + +from app import settings +from app.api import queued_batches +from mwmbl import background +from mwmbl.indexer.update_urls import update_urls_continuously +from mwmbl.url_queue import update_queue_continuously + + +class MwmblConfig(AppConfig): + name = "app" + verbose_name = "Mwmbl Application" + + def ready(self): + if os.environ.get('RUN_MAIN') and settings.RUN_BACKGROUND_PROCESSES: + new_item_queue = Queue() + Process(target=background.run, args=(settings.DATA_PATH,)).start() + Process(target=update_queue_continuously, args=(new_item_queue, queued_batches,)).start() + 
Process(target=update_urls_continuously, args=(settings.DATA_PATH, new_item_queue)).start() diff --git a/app/settings.py b/app/settings.py index f483d35..67f0d50 100644 --- a/app/settings.py +++ b/app/settings.py @@ -37,6 +37,7 @@ INSTALLED_APPS = [ 'django.contrib.sessions', 'django.contrib.messages', 'django.contrib.staticfiles', + 'app', ] MIDDLEWARE = [ @@ -125,5 +126,4 @@ DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField' # ===================== Custom Settings ========================= DATA_PATH = "./devdata" - - +RUN_BACKGROUND_PROCESSES = True diff --git a/mwmbl/main.py b/mwmbl/main.py deleted file mode 100644 index 08518e3..0000000 --- a/mwmbl/main.py +++ /dev/null @@ -1,97 +0,0 @@ -import argparse -import logging -import sys -from multiprocessing import Process, Queue -from pathlib import Path - -import uvicorn -from fastapi import FastAPI -from starlette.middleware.cors import CORSMiddleware - -from mwmbl import background -from mwmbl.crawler import app as crawler -from mwmbl.indexer.batch_cache import BatchCache -from mwmbl.indexer.paths import INDEX_NAME, BATCH_DIR_NAME -from mwmbl.platform import user -from mwmbl.indexer.update_urls import update_urls_continuously -from mwmbl.tinysearchengine import search -from mwmbl.tinysearchengine.completer import Completer -from mwmbl.tinysearchengine.indexer import TinyIndex, Document, PAGE_SIZE -from mwmbl.tinysearchengine.rank import HeuristicRanker -from mwmbl.url_queue import update_queue_continuously - -FORMAT = '%(levelname)s %(name)s %(asctime)s %(message)s' -logging.basicConfig(stream=sys.stdout, level=logging.INFO, format=FORMAT) - - -MODEL_PATH = Path(__file__).parent / 'resources' / 'model.pickle' - - -def setup_args(): - parser = argparse.ArgumentParser(description="Mwmbl API server and background task processor") - parser.add_argument("--num-pages", type=int, help="Number of pages of memory (4096 bytes) to use for the index", default=2560) - parser.add_argument("--data", help="Path to the data folder for storing index and cached batches", default="./devdata") - parser.add_argument("--port", type=int, help="Port for the server to listen at", default=5000) - parser.add_argument("--background", help="Enable running the background tasks to process batches", - action='store_true') - args = parser.parse_args() - return args - - -def run(): - args = setup_args() - - index_path = Path(args.data) / INDEX_NAME - try: - existing_index = TinyIndex(item_factory=Document, index_path=index_path) - if existing_index.page_size != PAGE_SIZE or existing_index.num_pages != args.num_pages: - raise ValueError(f"Existing index page sizes ({existing_index.page_size}) or number of pages " - f"({existing_index.num_pages}) do not match") - except FileNotFoundError: - print("Creating a new index") - TinyIndex.create(item_factory=Document, index_path=index_path, num_pages=args.num_pages, page_size=PAGE_SIZE) - - new_item_queue = Queue() - queued_batches = Queue() - # curation_queue = Queue() - - if args.background: - Process(target=background.run, args=(args.data,)).start() - Process(target=update_queue_continuously, args=(new_item_queue, queued_batches,)).start() - Process(target=update_urls_continuously, args=(args.data, new_item_queue)).start() - - completer = Completer() - - with TinyIndex(item_factory=Document, index_path=index_path) as tiny_index: - ranker = HeuristicRanker(tiny_index, completer) - # model = pickle.load(open(MODEL_PATH, 'rb')) - # ranker = LTRRanker(model, tiny_index, completer) - - # Initialize FastApi instance - app = 
FastAPI() - - # Try disabling since this is handled by nginx - # app.add_middleware( - # CORSMiddleware, - # allow_origins=["*"], - # allow_credentials=True, - # allow_methods=["*"], - # allow_headers=["*"], - # ) - - search_router = search.create_router(ranker) - app.include_router(search_router) - - batch_cache = BatchCache(Path(args.data) / BATCH_DIR_NAME) - crawler_router = crawler.create_router(batch_cache, queued_batches) - app.include_router(crawler_router) - - user_router = user.create_router(index_path) - app.include_router(user_router) - - # Initialize uvicorn server using global app instance and server config params - uvicorn.run(app, host="0.0.0.0", port=args.port) - - -if __name__ == "__main__": - run() From fab5e5c782e906dfe27d716dd5684f6777799c36 Mon Sep 17 00:00:00 2001 From: Daoud Clarke Date: Sun, 8 Oct 2023 21:42:04 +0100 Subject: [PATCH 7/8] Use different dev and prod settings --- app/api.py | 2 +- app/apps.py | 19 ++++++++++++++++--- app/{settings.py => settings_common.py} | 4 ---- app/settings_dev.py | 5 +++++ app/settings_prod.py | 5 +++++ 5 files changed, 27 insertions(+), 8 deletions(-) rename app/{settings.py => settings_common.py} (96%) create mode 100644 app/settings_dev.py create mode 100644 app/settings_prod.py diff --git a/app/api.py b/app/api.py index af959e9..713bfc3 100644 --- a/app/api.py +++ b/app/api.py @@ -1,9 +1,9 @@ from multiprocessing import Queue from pathlib import Path +from django.conf import settings from ninja import NinjaAPI -from app import settings import mwmbl.crawler.app as crawler from mwmbl.indexer.batch_cache import BatchCache from mwmbl.indexer.paths import INDEX_NAME, BATCH_DIR_NAME diff --git a/app/apps.py b/app/apps.py index d4ad331..166aaff 100644 --- a/app/apps.py +++ b/app/apps.py @@ -1,12 +1,14 @@ -import os from multiprocessing import Process, Queue +from pathlib import Path from django.apps import AppConfig +from django.conf import settings -from app import settings from app.api import queued_batches from mwmbl import background +from mwmbl.indexer.paths import INDEX_NAME from mwmbl.indexer.update_urls import update_urls_continuously +from mwmbl.tinysearchengine.indexer import TinyIndex, Document, PAGE_SIZE from mwmbl.url_queue import update_queue_continuously @@ -15,7 +17,18 @@ class MwmblConfig(AppConfig): verbose_name = "Mwmbl Application" def ready(self): - if os.environ.get('RUN_MAIN') and settings.RUN_BACKGROUND_PROCESSES: + index_path = Path(settings.DATA_PATH) / INDEX_NAME + try: + existing_index = TinyIndex(item_factory=Document, index_path=index_path) + if existing_index.page_size != PAGE_SIZE or existing_index.num_pages != settings.NUM_PAGES: + raise ValueError(f"Existing index page sizes ({existing_index.page_size}) or number of pages " + f"({existing_index.num_pages}) do not match") + except FileNotFoundError: + print("Creating a new index") + TinyIndex.create(item_factory=Document, index_path=index_path, num_pages=settings.NUM_PAGES, + page_size=PAGE_SIZE) + + if settings.RUN_BACKGROUND_PROCESSES: new_item_queue = Queue() Process(target=background.run, args=(settings.DATA_PATH,)).start() Process(target=update_queue_continuously, args=(new_item_queue, queued_batches,)).start() diff --git a/app/settings.py b/app/settings_common.py similarity index 96% rename from app/settings.py rename to app/settings_common.py index 67f0d50..2753dc6 100644 --- a/app/settings.py +++ b/app/settings_common.py @@ -123,7 +123,3 @@ STATIC_URL = 'static/' DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField' -# ===================== 
Custom Settings ========================= - -DATA_PATH = "./devdata" -RUN_BACKGROUND_PROCESSES = True diff --git a/app/settings_dev.py b/app/settings_dev.py new file mode 100644 index 0000000..bb8e33f --- /dev/null +++ b/app/settings_dev.py @@ -0,0 +1,5 @@ +from app.settings_common import * + +DATA_PATH = "./devdata" +RUN_BACKGROUND_PROCESSES = False +NUM_PAGES = 2560 diff --git a/app/settings_prod.py b/app/settings_prod.py new file mode 100644 index 0000000..37c9cf3 --- /dev/null +++ b/app/settings_prod.py @@ -0,0 +1,5 @@ +from app.settings_common import * + +DATA_PATH = "/app/storage" +RUN_BACKGROUND_PROCESSES = True +NUM_PAGES = 10240000 From 918eaa8709c0f38739474d15392d60a0482ec5a6 Mon Sep 17 00:00:00 2001 From: Daoud Clarke Date: Tue, 10 Oct 2023 13:51:06 +0100 Subject: [PATCH 8/8] Rename django app to mwmbl --- Dockerfile | 7 +++++-- analyse/analyse_crawled_domains.py | 4 ++-- analyse/export_top_domains.py | 2 +- analyse/export_urls.py | 2 +- analyse/index_local.py | 7 +++---- analyse/index_url_count.py | 2 +- analyse/inspect_index.py | 6 +++--- analyse/record_historical_batches.py | 4 +--- analyse/search.py | 4 ++-- analyse/send_batch.py | 2 +- analyse/update_urls.py | 2 +- app/__init__.py | 0 {app => mwmbl}/api.py | 0 {app => mwmbl}/apps.py | 4 ++-- {app => mwmbl}/asgi.py | 2 +- mwmbl/crawler/app.py | 2 +- mwmbl/crawler/urls.py | 5 +---- mwmbl/indexer/batch_cache.py | 1 - mwmbl/indexer/index.py | 5 +---- mwmbl/indexer/update_urls.py | 5 +---- mwmbl/main.py | 9 +++++++++ mwmbl/platform/user.py | 2 +- {app => mwmbl}/settings_common.py | 8 ++++---- {app => mwmbl}/settings_dev.py | 2 +- {app => mwmbl}/settings_prod.py | 2 +- mwmbl/tinysearchengine/rank.py | 1 - mwmbl/url_queue.py | 1 - {app => mwmbl}/urls.py | 2 +- {app => mwmbl}/wsgi.py | 2 +- test/test_completer.py | 12 +++++------- test/test_indexer.py | 6 +++--- test/test_update_urls.py | 2 +- 32 files changed, 55 insertions(+), 60 deletions(-) delete mode 100644 app/__init__.py rename {app => mwmbl}/api.py (100%) rename {app => mwmbl}/apps.py (96%) rename {app => mwmbl}/asgi.py (82%) create mode 100644 mwmbl/main.py rename {app => mwmbl}/settings_common.py (96%) rename {app => mwmbl}/settings_dev.py (67%) rename {app => mwmbl}/settings_prod.py (69%) rename {app => mwmbl}/urls.py (96%) rename {app => mwmbl}/wsgi.py (82%) diff --git a/Dockerfile b/Dockerfile index f57bce7..4283a14 100644 --- a/Dockerfile +++ b/Dockerfile @@ -46,5 +46,8 @@ VOLUME ["/data"] EXPOSE 5000 -# Using the mwmbl-tinysearchengine binary/entrypoint which comes packaged with mwmbl -CMD ["/venv/bin/mwmbl-tinysearchengine", "--num-pages", "10240000", "--background", "--data", "/app/storage"] +ENV DJANGO_SETTINGS_MODULE=mwmbl.settings_dev + +# WORKDIR "/venv/lib/python3.10/site-packages/mwmbl/" +# CMD ["/venv/bin/python", "-m", "uvicorn", "app.asgi:application"] +CMD ["/venv/bin/mwmbl-tinysearchengine"] diff --git a/analyse/analyse_crawled_domains.py b/analyse/analyse_crawled_domains.py index 371cbb6..5e87abb 100644 --- a/analyse/analyse_crawled_domains.py +++ b/analyse/analyse_crawled_domains.py @@ -7,8 +7,8 @@ import json from collections import defaultdict, Counter from urllib.parse import urlparse -from mwmbl.crawler.batch import HashedBatch -from mwmbl.indexer.paths import CRAWL_GLOB, MWMBL_DATA_DIR +from mwmbl.crawler import HashedBatch +from mwmbl.indexer import CRAWL_GLOB, MWMBL_DATA_DIR # TODO: remove this line - temporary override diff --git a/analyse/export_top_domains.py b/analyse/export_top_domains.py index 9f4d495..b9b4479 100644 --- 
a/analyse/export_top_domains.py +++ b/analyse/export_top_domains.py @@ -1,6 +1,6 @@ import json -from mwmbl.indexer.paths import TOP_DOMAINS_JSON_PATH +from mwmbl.indexer import TOP_DOMAINS_JSON_PATH from mwmbl.hn_top_domains_filtered import DOMAINS diff --git a/analyse/export_urls.py b/analyse/export_urls.py index a042260..39ba98e 100644 --- a/analyse/export_urls.py +++ b/analyse/export_urls.py @@ -3,7 +3,7 @@ Export the list of unique URLs to a SQLite file for analysis/evaluation. """ import sqlite3 -from mwmbl.indexer.paths import URLS_PATH +from mwmbl.indexer import URLS_PATH from mwmbl.app import get_config_and_index diff --git a/analyse/index_local.py b/analyse/index_local.py index 24628a5..334868d 100644 --- a/analyse/index_local.py +++ b/analyse/index_local.py @@ -7,16 +7,15 @@ import json import logging import os import sys -from pathlib import Path from datetime import datetime import spacy -from mwmbl.crawler.batch import HashedBatch +from mwmbl.crawler import HashedBatch from mwmbl.crawler.urls import URLDatabase from mwmbl.database import Database -from mwmbl.indexer.index_batches import index_batches -from mwmbl.tinysearchengine.indexer import TinyIndex, Document +from mwmbl.indexer import index_batches +from mwmbl.tinysearchengine import TinyIndex, Document LOCAL_BATCHES_PATH = f'{os.environ["HOME"]}/data/mwmbl/file/**/*.json.gz' NUM_BATCHES = 10000 diff --git a/analyse/index_url_count.py b/analyse/index_url_count.py index f0c7ac2..dcb7245 100644 --- a/analyse/index_url_count.py +++ b/analyse/index_url_count.py @@ -1,7 +1,7 @@ """ Count unique URLs in the index. """ -from mwmbl.tinysearchengine.indexer import TinyIndex, Document +from mwmbl.tinysearchengine import TinyIndex, Document def run(): diff --git a/analyse/inspect_index.py b/analyse/inspect_index.py index 20b0619..c48ad22 100644 --- a/analyse/inspect_index.py +++ b/analyse/inspect_index.py @@ -5,9 +5,9 @@ import numpy as np import spacy from analyse.index_local import EVALUATE_INDEX_PATH -from mwmbl.indexer.index import tokenize_document -from mwmbl.indexer.paths import INDEX_PATH -from mwmbl.tinysearchengine.indexer import TinyIndex, Document +from mwmbl.indexer import tokenize_document +from mwmbl.indexer import INDEX_PATH +from mwmbl.tinysearchengine import TinyIndex, Document logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) diff --git a/analyse/record_historical_batches.py b/analyse/record_historical_batches.py index 4d8ccd3..c482e49 100644 --- a/analyse/record_historical_batches.py +++ b/analyse/record_historical_batches.py @@ -4,12 +4,10 @@ See how many unique URLs and root domains we have crawled. 
import glob import gzip import json -from collections import defaultdict, Counter -from urllib.parse import urlparse import requests -from mwmbl.indexer.paths import CRAWL_GLOB +from mwmbl.indexer import CRAWL_GLOB API_ENDPOINT = "http://95.216.215.29/batches/historical" diff --git a/analyse/search.py b/analyse/search.py index 4ffbd54..4bc3b72 100644 --- a/analyse/search.py +++ b/analyse/search.py @@ -2,9 +2,9 @@ import logging import sys from itertools import islice -from mwmbl.indexer.paths import INDEX_PATH +from mwmbl.indexer import INDEX_PATH from mwmbl.tinysearchengine.completer import Completer -from mwmbl.tinysearchengine.indexer import TinyIndex, Document +from mwmbl.tinysearchengine import TinyIndex, Document from mwmbl.tinysearchengine.rank import HeuristicRanker logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) diff --git a/analyse/send_batch.py b/analyse/send_batch.py index 9191834..6d52d41 100644 --- a/analyse/send_batch.py +++ b/analyse/send_batch.py @@ -3,7 +3,7 @@ Send a batch to a running instance. """ import requests -from mwmbl.crawler.batch import Batch, Item, ItemContent +from mwmbl.crawler import Batch, Item, ItemContent URL = 'http://localhost:5000/crawler/batches/' diff --git a/analyse/update_urls.py b/analyse/update_urls.py index 0655df7..f26c804 100644 --- a/analyse/update_urls.py +++ b/analyse/update_urls.py @@ -4,7 +4,7 @@ from datetime import datetime from pathlib import Path from queue import Queue -from mwmbl.indexer.update_urls import record_urls_in_database +from mwmbl.indexer import record_urls_in_database def run_update_urls_on_fixed_batches(): diff --git a/app/__init__.py b/app/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/app/api.py b/mwmbl/api.py similarity index 100% rename from app/api.py rename to mwmbl/api.py diff --git a/app/apps.py b/mwmbl/apps.py similarity index 96% rename from app/apps.py rename to mwmbl/apps.py index 166aaff..bfc21a5 100644 --- a/app/apps.py +++ b/mwmbl/apps.py @@ -4,7 +4,7 @@ from pathlib import Path from django.apps import AppConfig from django.conf import settings -from app.api import queued_batches +from mwmbl.api import queued_batches from mwmbl import background from mwmbl.indexer.paths import INDEX_NAME from mwmbl.indexer.update_urls import update_urls_continuously @@ -13,7 +13,7 @@ from mwmbl.url_queue import update_queue_continuously class MwmblConfig(AppConfig): - name = "app" + name = "mwmbl" verbose_name = "Mwmbl Application" def ready(self): diff --git a/app/asgi.py b/mwmbl/asgi.py similarity index 82% rename from app/asgi.py rename to mwmbl/asgi.py index c8d5aaa..73088a9 100644 --- a/app/asgi.py +++ b/mwmbl/asgi.py @@ -11,6 +11,6 @@ import os from django.core.asgi import get_asgi_application -os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'app.settings') +os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'mwmbl.settings_dev') application = get_asgi_application() diff --git a/mwmbl/crawler/app.py b/mwmbl/crawler/app.py index bda2dc8..a4f0524 100644 --- a/mwmbl/crawler/app.py +++ b/mwmbl/crawler/app.py @@ -10,7 +10,7 @@ from uuid import uuid4 import boto3 import justext import requests -from fastapi import HTTPException, APIRouter +from fastapi import HTTPException from justext.core import html_to_dom, ParagraphMaker, classify_paragraphs, revise_paragraph_classification, \ LENGTH_LOW_DEFAULT, STOPWORDS_LOW_DEFAULT, MAX_LINK_DENSITY_DEFAULT, NO_HEADINGS_DEFAULT, LENGTH_HIGH_DEFAULT, \ STOPWORDS_HIGH_DEFAULT, MAX_HEADING_DISTANCE_DEFAULT, DEFAULT_ENCODING, DEFAULT_ENC_ERRORS, 
diff --git a/mwmbl/crawler/app.py b/mwmbl/crawler/app.py
index bda2dc8..a4f0524 100644
--- a/mwmbl/crawler/app.py
+++ b/mwmbl/crawler/app.py
@@ -10,7 +10,7 @@ from uuid import uuid4
 import boto3
 import justext
 import requests
-from fastapi import HTTPException, APIRouter
+from fastapi import HTTPException
 from justext.core import html_to_dom, ParagraphMaker, classify_paragraphs, revise_paragraph_classification, \
     LENGTH_LOW_DEFAULT, STOPWORDS_LOW_DEFAULT, MAX_LINK_DENSITY_DEFAULT, NO_HEADINGS_DEFAULT, LENGTH_HIGH_DEFAULT, \
     STOPWORDS_HIGH_DEFAULT, MAX_HEADING_DISTANCE_DEFAULT, DEFAULT_ENCODING, DEFAULT_ENC_ERRORS, preprocessor
diff --git a/mwmbl/crawler/urls.py b/mwmbl/crawler/urls.py
index cefe19e..7c83edf 100644
--- a/mwmbl/crawler/urls.py
+++ b/mwmbl/crawler/urls.py
@@ -1,16 +1,13 @@
 """
 Database storing info on URLs
 """
-import random
 from dataclasses import dataclass
-from datetime import datetime, timedelta
+from datetime import datetime
 from enum import Enum
 from logging import getLogger
 
 from psycopg2.extras import execute_values
 
-from mwmbl.hn_top_domains_filtered import DOMAINS
-from mwmbl.settings import CORE_DOMAINS
 
 # Client has one hour to crawl a URL that has been assigned to them, or it will be reassigned
 from mwmbl.utils import batch
diff --git a/mwmbl/indexer/batch_cache.py b/mwmbl/indexer/batch_cache.py
index e7af6db..01d8cc9 100644
--- a/mwmbl/indexer/batch_cache.py
+++ b/mwmbl/indexer/batch_cache.py
@@ -9,7 +9,6 @@ import os
 from logging import getLogger
 from multiprocessing.pool import ThreadPool
 from pathlib import Path
-from tempfile import NamedTemporaryFile
 from urllib.parse import urlparse
 
 from pydantic import ValidationError
diff --git a/mwmbl/indexer/index.py b/mwmbl/indexer/index.py
index 4edcb8a..fb61405 100644
--- a/mwmbl/indexer/index.py
+++ b/mwmbl/indexer/index.py
@@ -1,13 +1,10 @@
 """
 Create a search index
 """
-from collections import Counter
 from typing import Iterable
 from urllib.parse import unquote
 
-import pandas as pd
-
-from mwmbl.tinysearchengine.indexer import Document, TokenizedDocument, TinyIndex
+from mwmbl.tinysearchengine.indexer import TokenizedDocument
 from mwmbl.tokenizer import tokenize, get_bigrams
 
 DEFAULT_SCORE = 0
diff --git a/mwmbl/indexer/update_urls.py b/mwmbl/indexer/update_urls.py
index 3819777..8a1b973 100644
--- a/mwmbl/indexer/update_urls.py
+++ b/mwmbl/indexer/update_urls.py
@@ -1,13 +1,10 @@
-import os
-import pickle
-import re
 from collections import defaultdict
 from datetime import datetime, timezone, timedelta
 from logging import getLogger
 from multiprocessing import Queue
 from pathlib import Path
 from time import sleep
-from typing import Iterable, Collection
+from typing import Collection
 from urllib.parse import urlparse
 
 from requests_cache import CachedSession
diff --git a/mwmbl/main.py b/mwmbl/main.py
new file mode 100644
index 0000000..0281edc
--- /dev/null
+++ b/mwmbl/main.py
@@ -0,0 +1,9 @@
+import uvicorn
+
+
+def run():
+    uvicorn.run("mwmbl.asgi:application", host="0.0.0.0", port=8000)
+
+
+if __name__ == "__main__":
+    run()
diff --git a/mwmbl/platform/user.py b/mwmbl/platform/user.py
index a3006c4..bbdcb0e 100644
--- a/mwmbl/platform/user.py
+++ b/mwmbl/platform/user.py
@@ -7,7 +7,7 @@ import requests
 from fastapi import APIRouter, Response
 from pydantic import BaseModel
 
-from mwmbl.tinysearchengine.indexer import TinyIndex, Document, DocumentState
+from mwmbl.tinysearchengine.indexer import TinyIndex, Document
 from mwmbl.tokenizer import tokenize
 
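The new mwmbl/main.py serves the Django ASGI application directly through uvicorn. For local development a variant with auto-reload can sit alongside it; a sketch using only standard uvicorn options (reload=True is uvicorn's own flag, not something this patch adds):

    import uvicorn

    def run_dev():
        # Restart on source changes; the settings module still defaults to
        # mwmbl.settings_dev because mwmbl/asgi.py sets DJANGO_SETTINGS_MODULE.
        uvicorn.run("mwmbl.asgi:application", host="127.0.0.1", port=8000, reload=True)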
diff --git a/app/settings_common.py b/mwmbl/settings_common.py
similarity index 96%
rename from app/settings_common.py
rename to mwmbl/settings_common.py
index 2753dc6..b08b62c 100644
--- a/app/settings_common.py
+++ b/mwmbl/settings_common.py
@@ -1,5 +1,5 @@
 """
-Django settings for app project.
+Django settings for mwmbl project.
 
 Generated by 'django-admin startproject' using Django 4.2.4.
 
@@ -37,7 +37,7 @@ INSTALLED_APPS = [
     'django.contrib.sessions',
     'django.contrib.messages',
     'django.contrib.staticfiles',
-    'app',
+    'mwmbl',
 ]
 
 MIDDLEWARE = [
@@ -50,7 +50,7 @@ MIDDLEWARE = [
     'django.middleware.clickjacking.XFrameOptionsMiddleware',
 ]
 
-ROOT_URLCONF = 'app.urls'
+ROOT_URLCONF = 'mwmbl.urls'
 
 TEMPLATES = [
     {
@@ -68,7 +68,7 @@ TEMPLATES = [
     },
 ]
 
-WSGI_APPLICATION = 'app.wsgi.application'
+WSGI_APPLICATION = 'mwmbl.wsgi.application'
 
 
 # Database
diff --git a/app/settings_dev.py b/mwmbl/settings_dev.py
similarity index 67%
rename from app/settings_dev.py
rename to mwmbl/settings_dev.py
index bb8e33f..fe07890 100644
--- a/app/settings_dev.py
+++ b/mwmbl/settings_dev.py
@@ -1,4 +1,4 @@
-from app.settings_common import *
+from mwmbl.settings_common import *
 
 DATA_PATH = "./devdata"
 RUN_BACKGROUND_PROCESSES = False
diff --git a/app/settings_prod.py b/mwmbl/settings_prod.py
similarity index 69%
rename from app/settings_prod.py
rename to mwmbl/settings_prod.py
index 37c9cf3..f7c50ee 100644
--- a/app/settings_prod.py
+++ b/mwmbl/settings_prod.py
@@ -1,4 +1,4 @@
-from app.settings_common import *
+from mwmbl.settings_common import *
 
 DATA_PATH = "/app/storage"
 RUN_BACKGROUND_PROCESSES = True
diff --git a/mwmbl/tinysearchengine/rank.py b/mwmbl/tinysearchengine/rank.py
index 81109fd..7f331b8 100644
--- a/mwmbl/tinysearchengine/rank.py
+++ b/mwmbl/tinysearchengine/rank.py
@@ -6,7 +6,6 @@ from operator import itemgetter
 from urllib.parse import urlparse
 
 from mwmbl.format import format_result_with_pattern, get_query_regex
-from mwmbl.platform.user import MAX_CURATED_SCORE
 from mwmbl.tokenizer import tokenize, get_bigrams
 from mwmbl.tinysearchengine.completer import Completer
 from mwmbl.hn_top_domains_filtered import DOMAINS
diff --git a/mwmbl/url_queue.py b/mwmbl/url_queue.py
index ab0f1bc..8151550 100644
--- a/mwmbl/url_queue.py
+++ b/mwmbl/url_queue.py
@@ -1,6 +1,5 @@
 import time
 from collections import defaultdict
-from dataclasses import dataclass
 from datetime import datetime, timedelta
 from logging import getLogger
 from multiprocessing import Queue
diff --git a/app/urls.py b/mwmbl/urls.py
similarity index 96%
rename from app/urls.py
rename to mwmbl/urls.py
index 440a2f4..ff67f2d 100644
--- a/app/urls.py
+++ b/mwmbl/urls.py
@@ -17,7 +17,7 @@ Including another URLconf
 from django.contrib import admin
 from django.urls import path
 
-from app.api import api
+from mwmbl.api import api
 
 urlpatterns = [
     path('admin/', admin.site.urls),
diff --git a/app/wsgi.py b/mwmbl/wsgi.py
similarity index 82%
rename from app/wsgi.py
rename to mwmbl/wsgi.py
index ef30895..ebdf0ff 100644
--- a/app/wsgi.py
+++ b/mwmbl/wsgi.py
@@ -11,6 +11,6 @@ import os
 
 from django.core.wsgi import get_wsgi_application
 
-os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'app.settings')
+os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'mwmbl.settings_dev')
 
 application = get_wsgi_application()
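settings_dev and settings_prod now layer a star-import of mwmbl.settings_common under a handful of environment-specific overrides. Another environment would follow the same shape; a sketch with an illustrative file name and values that are not part of this patch:

    # mwmbl/settings_staging.py (hypothetical)
    from mwmbl.settings_common import *

    DATA_PATH = "/srv/mwmbl/storage"
    RUN_BACKGROUND_PROCESSES = True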
diff --git a/test/test_completer.py b/test/test_completer.py
index b1fb49e..8867f26 100644
--- a/test/test_completer.py
+++ b/test/test_completer.py
@@ -1,5 +1,4 @@
 import mwmbl.tinysearchengine.completer
-import pytest
 import pandas as pd
 
 def mockCompleterData(mocker, data):
diff --git a/test/test_indexer.py b/test/test_indexer.py
index dd25b18..cf714c0 100644
--- a/test/test_indexer.py
+++ b/test/test_indexer.py
@@ -1,9 +1,9 @@
 from pathlib import Path
 from tempfile import TemporaryDirectory
 
 from mwmbl.tinysearchengine.indexer import Document, TinyIndex, _binary_search_fitting_size, astuple, _trim_items_to_page, _get_page_data, _pad_to_page_size
-from zstandard import ZstdDecompressor, ZstdCompressor, ZstdError
-import json
+from zstandard import ZstdCompressor
+
 
 def test_create_index():
     num_pages = 10
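test_indexer.py drives the low-level page helpers directly. A round-trip through the public API is a natural companion; a sketch assuming TinyIndex.create() accepts the item_factory, index_path, num_pages and page_size parameters used elsewhere in these tests, and that freshly created pages are empty:

    from pathlib import Path
    from tempfile import TemporaryDirectory

    from mwmbl.tinysearchengine.indexer import Document, TinyIndex

    def test_fresh_index_pages_are_empty():
        with TemporaryDirectory() as tmp:
            index_path = str(Path(tmp) / "index.tinysearch")
            # Assumed API: create() builds an empty fixed-size index on disk
            # and returns a TinyIndex usable as a context manager.
            with TinyIndex.create(item_factory=Document, index_path=index_path,
                                  num_pages=10, page_size=4096) as index:
                assert all(index.get_page(i) == [] for i in range(10))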