Rename django app to mwmbl

parent fab5e5c782
commit 918eaa8709

32 changed files with 55 additions and 60 deletions

@@ -46,5 +46,8 @@ VOLUME ["/data"]
 EXPOSE 5000
 
-# Using the mwmbl-tinysearchengine binary/entrypoint which comes packaged with mwmbl
-CMD ["/venv/bin/mwmbl-tinysearchengine", "--num-pages", "10240000", "--background", "--data", "/app/storage"]
+ENV DJANGO_SETTINGS_MODULE=mwmbl.settings_dev
+# WORKDIR "/venv/lib/python3.10/site-packages/mwmbl/"
+# CMD ["/venv/bin/python", "-m", "uvicorn", "app.asgi:application"]
+CMD ["/venv/bin/mwmbl-tinysearchengine"]

@@ -7,8 +7,8 @@ import json
 from collections import defaultdict, Counter
 from urllib.parse import urlparse
 
-from mwmbl.crawler.batch import HashedBatch
-from mwmbl.indexer.paths import CRAWL_GLOB, MWMBL_DATA_DIR
+from mwmbl.crawler import HashedBatch
+from mwmbl.indexer import CRAWL_GLOB, MWMBL_DATA_DIR
 
 
 # TODO: remove this line - temporary override

@@ -1,6 +1,6 @@
 import json
 
-from mwmbl.indexer.paths import TOP_DOMAINS_JSON_PATH
+from mwmbl.indexer import TOP_DOMAINS_JSON_PATH
 from mwmbl.hn_top_domains_filtered import DOMAINS
 
 

@@ -3,7 +3,7 @@ Export the list of unique URLs to a SQLite file for analysis/evaluation.
 """
 import sqlite3
 
-from mwmbl.indexer.paths import URLS_PATH
+from mwmbl.indexer import URLS_PATH
 from mwmbl.app import get_config_and_index
 
 

@@ -7,16 +7,15 @@ import json
 import logging
 import os
 import sys
-from pathlib import Path
 from datetime import datetime
 
 import spacy
 
-from mwmbl.crawler.batch import HashedBatch
+from mwmbl.crawler import HashedBatch
 from mwmbl.crawler.urls import URLDatabase
 from mwmbl.database import Database
-from mwmbl.indexer.index_batches import index_batches
-from mwmbl.tinysearchengine.indexer import TinyIndex, Document
+from mwmbl.indexer import index_batches
+from mwmbl.tinysearchengine import TinyIndex, Document
 
 LOCAL_BATCHES_PATH = f'{os.environ["HOME"]}/data/mwmbl/file/**/*.json.gz'
 NUM_BATCHES = 10000

@@ -1,7 +1,7 @@
 """
 Count unique URLs in the index.
 """
-from mwmbl.tinysearchengine.indexer import TinyIndex, Document
+from mwmbl.tinysearchengine import TinyIndex, Document
 
 
 def run():

@@ -5,9 +5,9 @@ import numpy as np
 import spacy
 
 from analyse.index_local import EVALUATE_INDEX_PATH
-from mwmbl.indexer.index import tokenize_document
-from mwmbl.indexer.paths import INDEX_PATH
-from mwmbl.tinysearchengine.indexer import TinyIndex, Document
+from mwmbl.indexer import tokenize_document
+from mwmbl.indexer import INDEX_PATH
+from mwmbl.tinysearchengine import TinyIndex, Document
 
 
 logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

@@ -4,12 +4,10 @@ See how many unique URLs and root domains we have crawled.
 import glob
 import gzip
 import json
-from collections import defaultdict, Counter
-from urllib.parse import urlparse
 
 import requests
 
-from mwmbl.indexer.paths import CRAWL_GLOB
+from mwmbl.indexer import CRAWL_GLOB
 
 
 API_ENDPOINT = "http://95.216.215.29/batches/historical"

@@ -2,9 +2,9 @@ import logging
 import sys
 from itertools import islice
 
-from mwmbl.indexer.paths import INDEX_PATH
+from mwmbl.indexer import INDEX_PATH
 from mwmbl.tinysearchengine.completer import Completer
-from mwmbl.tinysearchengine.indexer import TinyIndex, Document
+from mwmbl.tinysearchengine import TinyIndex, Document
 from mwmbl.tinysearchengine.rank import HeuristicRanker
 
 logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

@@ -3,7 +3,7 @@ Send a batch to a running instance.
 """
 import requests
 
-from mwmbl.crawler.batch import Batch, Item, ItemContent
+from mwmbl.crawler import Batch, Item, ItemContent
 
 
 URL = 'http://localhost:5000/crawler/batches/'

@@ -4,7 +4,7 @@ from datetime import datetime
 from pathlib import Path
 from queue import Queue
 
-from mwmbl.indexer.update_urls import record_urls_in_database
+from mwmbl.indexer import record_urls_in_database
 
 
 def run_update_urls_on_fixed_batches():

@@ -4,7 +4,7 @@ from pathlib import Path
 from django.apps import AppConfig
 from django.conf import settings
 
-from app.api import queued_batches
+from mwmbl.api import queued_batches
 from mwmbl import background
 from mwmbl.indexer.paths import INDEX_NAME
 from mwmbl.indexer.update_urls import update_urls_continuously

@@ -13,7 +13,7 @@ from mwmbl.url_queue import update_queue_continuously
 
 
 class MwmblConfig(AppConfig):
-    name = "app"
+    name = "mwmbl"
     verbose_name = "Mwmbl Application"
 
     def ready(self):

@@ -11,6 +11,6 @@ import os
 
 from django.core.asgi import get_asgi_application
 
-os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'app.settings')
+os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'mwmbl.settings_dev')
 
 application = get_asgi_application()

@@ -10,7 +10,7 @@ from uuid import uuid4
 import boto3
 import justext
 import requests
-from fastapi import HTTPException, APIRouter
+from fastapi import HTTPException
 from justext.core import html_to_dom, ParagraphMaker, classify_paragraphs, revise_paragraph_classification, \
     LENGTH_LOW_DEFAULT, STOPWORDS_LOW_DEFAULT, MAX_LINK_DENSITY_DEFAULT, NO_HEADINGS_DEFAULT, LENGTH_HIGH_DEFAULT, \
     STOPWORDS_HIGH_DEFAULT, MAX_HEADING_DISTANCE_DEFAULT, DEFAULT_ENCODING, DEFAULT_ENC_ERRORS, preprocessor

@@ -1,16 +1,13 @@
 """
 Database storing info on URLs
 """
-import random
 from dataclasses import dataclass
-from datetime import datetime, timedelta
+from datetime import datetime
 from enum import Enum
 from logging import getLogger
 
 from psycopg2.extras import execute_values
 
-from mwmbl.hn_top_domains_filtered import DOMAINS
-from mwmbl.settings import CORE_DOMAINS
 # Client has one hour to crawl a URL that has been assigned to them, or it will be reassigned
 from mwmbl.utils import batch
 

@@ -9,7 +9,6 @@ import os
 from logging import getLogger
 from multiprocessing.pool import ThreadPool
 from pathlib import Path
-from tempfile import NamedTemporaryFile
 from urllib.parse import urlparse
 
 from pydantic import ValidationError

@@ -1,13 +1,10 @@
 """
 Create a search index
 """
-from collections import Counter
 from typing import Iterable
 from urllib.parse import unquote
 
-import pandas as pd
+from mwmbl.tinysearchengine.indexer import TokenizedDocument
 
-from mwmbl.tinysearchengine.indexer import Document, TokenizedDocument, TinyIndex
 from mwmbl.tokenizer import tokenize, get_bigrams
 
 DEFAULT_SCORE = 0

@@ -1,13 +1,10 @@
-import os
-import pickle
-import re
 from collections import defaultdict
 from datetime import datetime, timezone, timedelta
 from logging import getLogger
 from multiprocessing import Queue
 from pathlib import Path
 from time import sleep
-from typing import Iterable, Collection
+from typing import Collection
 from urllib.parse import urlparse
 
 from requests_cache import CachedSession

mwmbl/main.py (new file, 9 additions)
@@ -0,0 +1,9 @@
+import uvicorn
+
+
+def run():
+    uvicorn.run("mwmbl.asgi:application", host="0.0.0.0", port=8000)
+
+
+if __name__ == "__main__":
+    run()

@@ -7,7 +7,7 @@ import requests
 from fastapi import APIRouter, Response
 from pydantic import BaseModel
 
-from mwmbl.tinysearchengine.indexer import TinyIndex, Document, DocumentState
+from mwmbl.tinysearchengine.indexer import TinyIndex, Document
 from mwmbl.tokenizer import tokenize
 
 

@@ -1,5 +1,5 @@
 """
-Django settings for app project.
+Django settings for mwmbl project.
 
 Generated by 'django-admin startproject' using Django 4.2.4.
 

@@ -37,7 +37,7 @@ INSTALLED_APPS = [
     'django.contrib.sessions',
     'django.contrib.messages',
     'django.contrib.staticfiles',
-    'app',
+    'mwmbl',
 ]
 
 MIDDLEWARE = [

@@ -50,7 +50,7 @@ MIDDLEWARE = [
     'django.middleware.clickjacking.XFrameOptionsMiddleware',
 ]
 
-ROOT_URLCONF = 'app.urls'
+ROOT_URLCONF = 'mwmbl.urls'
 
 TEMPLATES = [
     {

@@ -68,7 +68,7 @@ TEMPLATES = [
     },
 ]
 
-WSGI_APPLICATION = 'app.wsgi.application'
+WSGI_APPLICATION = 'mwmbl.wsgi.application'
 
 
 # Database

@@ -1,4 +1,4 @@
-from app.settings_common import *
+from mwmbl.settings_common import *
 
 DATA_PATH = "./devdata"
 RUN_BACKGROUND_PROCESSES = False

@@ -1,4 +1,4 @@
-from app.settings_common import *
+from mwmbl.settings_common import *
 
 DATA_PATH = "/app/storage"
 RUN_BACKGROUND_PROCESSES = True

@@ -6,7 +6,6 @@ from operator import itemgetter
 from urllib.parse import urlparse
 
 from mwmbl.format import format_result_with_pattern, get_query_regex
-from mwmbl.platform.user import MAX_CURATED_SCORE
 from mwmbl.tokenizer import tokenize, get_bigrams
 from mwmbl.tinysearchengine.completer import Completer
 from mwmbl.hn_top_domains_filtered import DOMAINS

@@ -1,6 +1,5 @@
 import time
 from collections import defaultdict
-from dataclasses import dataclass
 from datetime import datetime, timedelta
 from logging import getLogger
 from multiprocessing import Queue

@@ -17,7 +17,7 @@ Including another URLconf
 from django.contrib import admin
 from django.urls import path
 
-from app.api import api
+from mwmbl.api import api
 
 urlpatterns = [
     path('admin/', admin.site.urls),

@@ -11,6 +11,6 @@ import os
 
 from django.core.wsgi import get_wsgi_application
 
-os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'app.settings')
+os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'mwmbl.settings_dev')
 
 application = get_wsgi_application()

@@ -1,5 +1,3 @@
-import mwmbl.tinysearchengine.completer
-import pytest
 import pandas as pd
 
 def mockCompleterData(mocker, data):

@@ -16,7 +14,7 @@ def test_correctCompletions(mocker):
                 [3, 'buildings', 1]]
     mockCompleterData(mocker, testdata)
 
-    completer = mwmbl.tinysearchengine.completer.Completer()
+    completer = app.tinysearchengine.completer.Completer()
     completion = completer.complete('build')
     assert ['build', 'builder', 'buildings'] == completion
 

@@ -29,7 +27,7 @@ def test_correctSortOrder(mocker):
                 [3, 'buildings', 3]]
     mockCompleterData(mocker, testdata)
 
-    completer = mwmbl.tinysearchengine.completer.Completer()
+    completer = app.tinysearchengine.completer.Completer()
     completion = completer.complete('build')
     assert ['build', 'buildings', 'builder'] == completion
 

@@ -42,7 +40,7 @@ def test_noCompletions(mocker):
                 [3, 'buildings', 1]]
     mockCompleterData(mocker, testdata)
 
-    completer = mwmbl.tinysearchengine.completer.Completer()
+    completer = app.tinysearchengine.completer.Completer()
     completion = completer.complete('test')
     assert [] == completion
 

@@ -55,7 +53,7 @@ def test_singleCompletions(mocker):
                 [3, 'buildings', 1]]
     mockCompleterData(mocker, testdata)
 
-    completer = mwmbl.tinysearchengine.completer.Completer()
+    completer = app.tinysearchengine.completer.Completer()
     completion = completer.complete('announce')
     assert ['announce'] == completion
 

@@ -68,7 +66,7 @@ def test_idempotencyWithSameScoreCompletions(mocker):
                 [3, 'buildings', 1]]
     mockCompleterData(mocker, testdata)
 
-    completer = mwmbl.tinysearchengine.completer.Completer()
+    completer = app.tinysearchengine.completer.Completer()
     for i in range(3):
         print(f"iteration: {i}")
         completion = completer.complete('build')

@@ -1,9 +1,9 @@
 from pathlib import Path
 from tempfile import TemporaryDirectory
 
-from mwmbl.tinysearchengine.indexer import Document, TinyIndex, _binary_search_fitting_size, astuple, _trim_items_to_page, _get_page_data, _pad_to_page_size
-from zstandard import ZstdDecompressor, ZstdCompressor, ZstdError
-import json
+from mwmbl.tinysearchengine import Document, TinyIndex, _binary_search_fitting_size, astuple, _trim_items_to_page, _get_page_data, _pad_to_page_size
+from zstandard import ZstdCompressor
 
 def test_create_index():
     num_pages = 10

@@ -1,4 +1,4 @@
-from mwmbl.indexer.update_urls import process_link
+from mwmbl.indexer import process_link
 
 
 def test_process_link_normal():