Rename Django app to mwmbl

This commit is contained in:
Daoud Clarke 2023-10-10 13:51:06 +01:00
parent fab5e5c782
commit 918eaa8709
32 changed files with 55 additions and 60 deletions

View file

@ -46,5 +46,8 @@ VOLUME ["/data"]
EXPOSE 5000
# Using the mwmbl-tinysearchengine binary/entrypoint which comes packaged with mwmbl
CMD ["/venv/bin/mwmbl-tinysearchengine", "--num-pages", "10240000", "--background", "--data", "/app/storage"]
ENV DJANGO_SETTINGS_MODULE=mwmbl.settings_dev
# WORKDIR "/venv/lib/python3.10/site-packages/mwmbl/"
# CMD ["/venv/bin/python", "-m", "uvicorn", "mwmbl.asgi:application"]
CMD ["/venv/bin/mwmbl-tinysearchengine"]

View file

@ -7,8 +7,8 @@ import json
from collections import defaultdict, Counter
from urllib.parse import urlparse
from mwmbl.crawler.batch import HashedBatch
from mwmbl.indexer.paths import CRAWL_GLOB, MWMBL_DATA_DIR
from mwmbl.crawler import HashedBatch
from mwmbl.indexer import CRAWL_GLOB, MWMBL_DATA_DIR
# TODO: remove this line - temporary override

View file

@ -1,6 +1,6 @@
import json
from mwmbl.indexer.paths import TOP_DOMAINS_JSON_PATH
from mwmbl.indexer import TOP_DOMAINS_JSON_PATH
from mwmbl.hn_top_domains_filtered import DOMAINS

View file

@ -3,7 +3,7 @@ Export the list of unique URLs to a SQLite file for analysis/evaluation.
"""
import sqlite3
from mwmbl.indexer.paths import URLS_PATH
from mwmbl.indexer import URLS_PATH
from mwmbl.app import get_config_and_index

View file

@ -7,16 +7,15 @@ import json
import logging
import os
import sys
from pathlib import Path
from datetime import datetime
import spacy
from mwmbl.crawler.batch import HashedBatch
from mwmbl.crawler import HashedBatch
from mwmbl.crawler.urls import URLDatabase
from mwmbl.database import Database
from mwmbl.indexer.index_batches import index_batches
from mwmbl.tinysearchengine.indexer import TinyIndex, Document
from mwmbl.indexer import index_batches
from mwmbl.tinysearchengine import TinyIndex, Document
LOCAL_BATCHES_PATH = f'{os.environ["HOME"]}/data/mwmbl/file/**/*.json.gz'
NUM_BATCHES = 10000

View file

@ -1,7 +1,7 @@
"""
Count unique URLs in the index.
"""
from mwmbl.tinysearchengine.indexer import TinyIndex, Document
from mwmbl.tinysearchengine import TinyIndex, Document
def run():

View file

@ -5,9 +5,9 @@ import numpy as np
import spacy
from analyse.index_local import EVALUATE_INDEX_PATH
from mwmbl.indexer.index import tokenize_document
from mwmbl.indexer.paths import INDEX_PATH
from mwmbl.tinysearchengine.indexer import TinyIndex, Document
from mwmbl.indexer import tokenize_document
from mwmbl.indexer import INDEX_PATH
from mwmbl.tinysearchengine import TinyIndex, Document
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

View file

@ -4,12 +4,10 @@ See how many unique URLs and root domains we have crawled.
import glob
import gzip
import json
from collections import defaultdict, Counter
from urllib.parse import urlparse
import requests
from mwmbl.indexer.paths import CRAWL_GLOB
from mwmbl.indexer import CRAWL_GLOB
API_ENDPOINT = "http://95.216.215.29/batches/historical"

View file

@ -2,9 +2,9 @@ import logging
import sys
from itertools import islice
from mwmbl.indexer.paths import INDEX_PATH
from mwmbl.indexer import INDEX_PATH
from mwmbl.tinysearchengine.completer import Completer
from mwmbl.tinysearchengine.indexer import TinyIndex, Document
from mwmbl.tinysearchengine import TinyIndex, Document
from mwmbl.tinysearchengine.rank import HeuristicRanker
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

View file

@ -3,7 +3,7 @@ Send a batch to a running instance.
"""
import requests
from mwmbl.crawler.batch import Batch, Item, ItemContent
from mwmbl.crawler import Batch, Item, ItemContent
URL = 'http://localhost:5000/crawler/batches/'

View file

@ -4,7 +4,7 @@ from datetime import datetime
from pathlib import Path
from queue import Queue
from mwmbl.indexer.update_urls import record_urls_in_database
from mwmbl.indexer import record_urls_in_database
def run_update_urls_on_fixed_batches():

View file

View file

@ -4,7 +4,7 @@ from pathlib import Path
from django.apps import AppConfig
from django.conf import settings
from app.api import queued_batches
from mwmbl.api import queued_batches
from mwmbl import background
from mwmbl.indexer.paths import INDEX_NAME
from mwmbl.indexer.update_urls import update_urls_continuously
@ -13,7 +13,7 @@ from mwmbl.url_queue import update_queue_continuously
class MwmblConfig(AppConfig):
name = "app"
name = "mwmbl"
verbose_name = "Mwmbl Application"
def ready(self):

View file

@ -11,6 +11,6 @@ import os
from django.core.asgi import get_asgi_application
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'app.settings')
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'mwmbl.settings_dev')
application = get_asgi_application()

View file

@ -10,7 +10,7 @@ from uuid import uuid4
import boto3
import justext
import requests
from fastapi import HTTPException, APIRouter
from fastapi import HTTPException
from justext.core import html_to_dom, ParagraphMaker, classify_paragraphs, revise_paragraph_classification, \
LENGTH_LOW_DEFAULT, STOPWORDS_LOW_DEFAULT, MAX_LINK_DENSITY_DEFAULT, NO_HEADINGS_DEFAULT, LENGTH_HIGH_DEFAULT, \
STOPWORDS_HIGH_DEFAULT, MAX_HEADING_DISTANCE_DEFAULT, DEFAULT_ENCODING, DEFAULT_ENC_ERRORS, preprocessor

View file

@ -1,16 +1,13 @@
"""
Database storing info on URLs
"""
import random
from dataclasses import dataclass
from datetime import datetime, timedelta
from datetime import datetime
from enum import Enum
from logging import getLogger
from psycopg2.extras import execute_values
from mwmbl.hn_top_domains_filtered import DOMAINS
from mwmbl.settings import CORE_DOMAINS
# Client has one hour to crawl a URL that has been assigned to them, or it will be reassigned
from mwmbl.utils import batch

View file

@ -9,7 +9,6 @@ import os
from logging import getLogger
from multiprocessing.pool import ThreadPool
from pathlib import Path
from tempfile import NamedTemporaryFile
from urllib.parse import urlparse
from pydantic import ValidationError

View file

@ -1,13 +1,10 @@
"""
Create a search index
"""
from collections import Counter
from typing import Iterable
from urllib.parse import unquote
import pandas as pd
from mwmbl.tinysearchengine.indexer import Document, TokenizedDocument, TinyIndex
from mwmbl.tinysearchengine.indexer import TokenizedDocument
from mwmbl.tokenizer import tokenize, get_bigrams
DEFAULT_SCORE = 0

View file

@ -1,13 +1,10 @@
import os
import pickle
import re
from collections import defaultdict
from datetime import datetime, timezone, timedelta
from logging import getLogger
from multiprocessing import Queue
from pathlib import Path
from time import sleep
from typing import Iterable, Collection
from typing import Collection
from urllib.parse import urlparse
from requests_cache import CachedSession

9
mwmbl/main.py Normal file
View file

@ -0,0 +1,9 @@
import uvicorn
def run():
    """Launch uvicorn for the ASGI application at ``mwmbl.asgi:application``.

    The server is bound to all interfaces (``0.0.0.0``) on port 8000.
    """
    uvicorn.run(
        "mwmbl.asgi:application",
        host="0.0.0.0",
        port=8000,
    )
if __name__ == "__main__":
run()

View file

@ -7,7 +7,7 @@ import requests
from fastapi import APIRouter, Response
from pydantic import BaseModel
from mwmbl.tinysearchengine.indexer import TinyIndex, Document, DocumentState
from mwmbl.tinysearchengine.indexer import TinyIndex, Document
from mwmbl.tokenizer import tokenize

View file

@ -1,5 +1,5 @@
"""
Django settings for app project.
Django settings for mwmbl project.
Generated by 'django-admin startproject' using Django 4.2.4.
@ -37,7 +37,7 @@ INSTALLED_APPS = [
'django.contrib.sessions',
'django.contrib.messages',
'django.contrib.staticfiles',
'app',
'mwmbl',
]
MIDDLEWARE = [
@ -50,7 +50,7 @@ MIDDLEWARE = [
'django.middleware.clickjacking.XFrameOptionsMiddleware',
]
ROOT_URLCONF = 'app.urls'
ROOT_URLCONF = 'mwmbl.urls'
TEMPLATES = [
{
@ -68,7 +68,7 @@ TEMPLATES = [
},
]
WSGI_APPLICATION = 'app.wsgi.application'
WSGI_APPLICATION = 'mwmbl.wsgi.application'
# Database

View file

@ -1,4 +1,4 @@
from app.settings_common import *
from mwmbl.settings_common import *
DATA_PATH = "./devdata"
RUN_BACKGROUND_PROCESSES = False

View file

@ -1,4 +1,4 @@
from app.settings_common import *
from mwmbl.settings_common import *
DATA_PATH = "/app/storage"
RUN_BACKGROUND_PROCESSES = True

View file

@ -6,7 +6,6 @@ from operator import itemgetter
from urllib.parse import urlparse
from mwmbl.format import format_result_with_pattern, get_query_regex
from mwmbl.platform.user import MAX_CURATED_SCORE
from mwmbl.tokenizer import tokenize, get_bigrams
from mwmbl.tinysearchengine.completer import Completer
from mwmbl.hn_top_domains_filtered import DOMAINS

View file

@ -1,6 +1,5 @@
import time
from collections import defaultdict
from dataclasses import dataclass
from datetime import datetime, timedelta
from logging import getLogger
from multiprocessing import Queue

View file

@ -17,7 +17,7 @@ Including another URLconf
from django.contrib import admin
from django.urls import path
from app.api import api
from mwmbl.api import api
urlpatterns = [
path('admin/', admin.site.urls),

View file

@ -11,6 +11,6 @@ import os
from django.core.wsgi import get_wsgi_application
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'app.settings')
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'mwmbl.settings_dev')
application = get_wsgi_application()

View file

@ -1,5 +1,3 @@
import mwmbl.tinysearchengine.completer
import pytest
import pandas as pd
def mockCompleterData(mocker, data):
@ -16,7 +14,7 @@ def test_correctCompletions(mocker):
[3, 'buildings', 1]]
mockCompleterData(mocker, testdata)
completer = mwmbl.tinysearchengine.completer.Completer()
completer = app.tinysearchengine.completer.Completer()
completion = completer.complete('build')
assert ['build', 'builder', 'buildings'] == completion
@ -29,7 +27,7 @@ def test_correctSortOrder(mocker):
[3, 'buildings', 3]]
mockCompleterData(mocker, testdata)
completer = mwmbl.tinysearchengine.completer.Completer()
completer = app.tinysearchengine.completer.Completer()
completion = completer.complete('build')
assert ['build', 'buildings', 'builder'] == completion
@ -42,7 +40,7 @@ def test_noCompletions(mocker):
[3, 'buildings', 1]]
mockCompleterData(mocker, testdata)
completer = mwmbl.tinysearchengine.completer.Completer()
completer = app.tinysearchengine.completer.Completer()
completion = completer.complete('test')
assert [] == completion
@ -55,7 +53,7 @@ def test_singleCompletions(mocker):
[3, 'buildings', 1]]
mockCompleterData(mocker, testdata)
completer = mwmbl.tinysearchengine.completer.Completer()
completer = app.tinysearchengine.completer.Completer()
completion = completer.complete('announce')
assert ['announce'] == completion
@ -68,7 +66,7 @@ def test_idempotencyWithSameScoreCompletions(mocker):
[3, 'buildings', 1]]
mockCompleterData(mocker, testdata)
completer = mwmbl.tinysearchengine.completer.Completer()
completer = app.tinysearchengine.completer.Completer()
for i in range(3):
print(f"iteration: {i}")
completion = completer.complete('build')

View file

@ -1,9 +1,9 @@
from pathlib import Path
from tempfile import TemporaryDirectory
from mwmbl.tinysearchengine.indexer import Document, TinyIndex, _binary_search_fitting_size, astuple, _trim_items_to_page, _get_page_data, _pad_to_page_size
from zstandard import ZstdDecompressor, ZstdCompressor, ZstdError
import json
from mwmbl.tinysearchengine import Document, TinyIndex, _binary_search_fitting_size, astuple, _trim_items_to_page, _get_page_data, _pad_to_page_size
from zstandard import ZstdCompressor
def test_create_index():
num_pages = 10

View file

@ -1,4 +1,4 @@
from mwmbl.indexer.update_urls import process_link
from mwmbl.indexer import process_link
def test_process_link_normal():