diff --git a/Dockerfile b/Dockerfile index 784381a..3f525b8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -31,4 +31,4 @@ COPY data /data #COPY docker-entrypoint.sh wsgi.py ./ #CMD ["./docker-entrypoint.sh"] -CMD ["/venv/bin/python", "-m", "tinysearchengine.app", "/data/index.tinysearch"] +CMD ["/venv/bin/python", "-m", "mwmbl.tinysearchengine.app", "/data/index.tinysearch"] diff --git a/analyse/inspect_index.py b/analyse/inspect_index.py index f73064a..18a5a96 100644 --- a/analyse/inspect_index.py +++ b/analyse/inspect_index.py @@ -1,5 +1,5 @@ -from tinysearchengine.indexer import TinyIndex, NUM_PAGES, PAGE_SIZE, Document -from indexer.paths import INDEX_PATH +from mwmbl.tinysearchengine.indexer import TinyIndex, NUM_PAGES, PAGE_SIZE, Document +from mwmbl.indexer.paths import INDEX_PATH def get_items(): diff --git a/analyse/make_curl.py b/analyse/make_curl.py index c411806..465f990 100644 --- a/analyse/make_curl.py +++ b/analyse/make_curl.py @@ -5,8 +5,8 @@ import os from itertools import islice from urllib.parse import quote -from indexer.paths import DATA_DIR -from indexer.wiki import get_wiki_titles_and_urls +from mwmbl.indexer.paths import DATA_DIR +from mwmbl.indexer.wiki import get_wiki_titles_and_urls URL_TEMPLATE = "http://localhost:8000/complete?q={}" CURL_FILE = os.path.join(DATA_DIR, "urls.curl") diff --git a/analyse/performance.py b/analyse/performance.py index 53fdcae..4a675d4 100644 --- a/analyse/performance.py +++ b/analyse/performance.py @@ -8,11 +8,11 @@ import numpy as np from spacy.lang.en import English from starlette.testclient import TestClient -from tinysearchengine import create_app -from indexer.fsqueue import ZstdJsonSerializer -from indexer.index import index_titles_urls_and_extracts -from tinysearchengine.indexer import TinyIndex, TinyIndexer, Document -from indexer.paths import TEST_INDEX_PATH, DATA_DIR, TEST_TERMS_PATH +from mwmbl.tinysearchengine import create_app +from mwmbl.indexer.fsqueue import ZstdJsonSerializer +from mwmbl.indexer.index import index_titles_urls_and_extracts +from mwmbl.tinysearchengine.indexer import TinyIndex, TinyIndexer, Document +from mwmbl.indexer.paths import TEST_INDEX_PATH, DATA_DIR, TEST_TERMS_PATH NUM_DOCUMENTS = 30000 NUM_PAGES_FOR_STATS = 10 diff --git a/indexer/__init__.py b/mwmbl/__init__.py similarity index 100% rename from indexer/__init__.py rename to mwmbl/__init__.py diff --git a/indexer/domains/__init__.py b/mwmbl/indexer/__init__.py similarity index 100% rename from indexer/domains/__init__.py rename to mwmbl/indexer/__init__.py diff --git a/indexer/bootstrap.sh b/mwmbl/indexer/bootstrap.sh similarity index 100% rename from indexer/bootstrap.sh rename to mwmbl/indexer/bootstrap.sh diff --git a/indexer/crawl.py b/mwmbl/indexer/crawl.py similarity index 94% rename from indexer/crawl.py rename to mwmbl/indexer/crawl.py index 609deb6..11405d0 100644 --- a/indexer/crawl.py +++ b/mwmbl/indexer/crawl.py @@ -10,7 +10,7 @@ from traceback import print_tb, print_exc import pandas as pd import requests -from paths import DATA_DIR, HN_TOP_PATH, CRAWL_PREFIX +from .paths import DATA_DIR, HN_TOP_PATH, CRAWL_PREFIX def crawl(): diff --git a/indexer/deploy.sh b/mwmbl/indexer/deploy.sh similarity index 100% rename from indexer/deploy.sh rename to mwmbl/indexer/deploy.sh diff --git a/indexer/domains.py b/mwmbl/indexer/domains.py similarity index 100% rename from indexer/domains.py rename to mwmbl/indexer/domains.py diff --git a/tinysearchengine/__init__.py b/mwmbl/indexer/domains/__init__.py similarity index 100% rename from tinysearchengine/__init__.py rename to mwmbl/indexer/domains/__init__.py diff --git a/indexer/domains/domain_titles.py b/mwmbl/indexer/domains/domain_titles.py similarity index 94% rename from indexer/domains/domain_titles.py rename to mwmbl/indexer/domains/domain_titles.py index be6203d..907367e 100644 --- a/indexer/domains/domain_titles.py +++ b/mwmbl/indexer/domains/domain_titles.py @@ -8,8 +8,8 @@ from urllib.parse import urlsplit, urlunsplit import bs4 import requests -from indexer.fsqueue import FSQueue, ZstdJsonSerializer -from indexer.paths import DATA_DIR, DOMAINS_QUEUE_NAME, DOMAINS_TITLES_QUEUE_NAME +from mwmbl.indexer.fsqueue import FSQueue, ZstdJsonSerializer +from mwmbl.indexer.paths import DATA_DIR, DOMAINS_QUEUE_NAME, DOMAINS_TITLES_QUEUE_NAME NUM_PROCESSES = 10 diff --git a/indexer/domains/queue_domains.py b/mwmbl/indexer/domains/queue_domains.py similarity index 82% rename from indexer/domains/queue_domains.py rename to mwmbl/indexer/domains/queue_domains.py index 8136de2..3eb7ac6 100644 --- a/indexer/domains/queue_domains.py +++ b/mwmbl/indexer/domains/queue_domains.py @@ -4,8 +4,8 @@ Add domains to the queue to be retrieved import csv import gzip -from indexer.fsqueue import FSQueue, ZstdJsonSerializer -from indexer.paths import DOMAINS_PATH, DOMAINS_QUEUE_NAME, DATA_DIR +from mwmbl.indexer.fsqueue import FSQueue, ZstdJsonSerializer +from mwmbl.indexer.paths import DOMAINS_PATH, DOMAINS_QUEUE_NAME, DATA_DIR BATCH_SIZE = 250 diff --git a/indexer/extract.py b/mwmbl/indexer/extract.py similarity index 100% rename from indexer/extract.py rename to mwmbl/indexer/extract.py diff --git a/indexer/extract_local.py b/mwmbl/indexer/extract_local.py similarity index 92% rename from indexer/extract_local.py rename to mwmbl/indexer/extract_local.py index 040883f..b293f08 100644 --- a/indexer/extract_local.py +++ b/mwmbl/indexer/extract_local.py @@ -4,9 +4,9 @@ import os from glob import glob from multiprocessing import Process, Lock -from extract_process import fetch_process_warc_records -from fsqueue import FSQueue, GzipJsonRowSerializer -from paths import DATA_DIR +from .extract_process import fetch_process_warc_records +from .fsqueue import FSQueue, GzipJsonRowSerializer +from .paths import DATA_DIR ARCHIVE_INFO_GLOB = 'outputs/records/*.gz' diff --git a/indexer/extract_process.py b/mwmbl/indexer/extract_process.py similarity index 100% rename from indexer/extract_process.py rename to mwmbl/indexer/extract_process.py diff --git a/indexer/fsqueue.py b/mwmbl/indexer/fsqueue.py similarity index 100% rename from indexer/fsqueue.py rename to mwmbl/indexer/fsqueue.py diff --git a/indexer/hn-top-domains-filtered.py b/mwmbl/indexer/hn-top-domains-filtered.py similarity index 100% rename from indexer/hn-top-domains-filtered.py rename to mwmbl/indexer/hn-top-domains-filtered.py diff --git a/indexer/index.py b/mwmbl/indexer/index.py similarity index 96% rename from indexer/index.py rename to mwmbl/indexer/index.py index 3350560..d0f0efe 100644 --- a/indexer/index.py +++ b/mwmbl/indexer/index.py @@ -10,7 +10,7 @@ import pandas as pd # NUM_PAGES = 8192 # PAGE_SIZE = 512 -from tinysearchengine.indexer import TinyIndexer, Document, TokenizedDocument +from mwmbl.tinysearchengine.indexer import TinyIndexer, Document, TokenizedDocument NUM_INITIAL_TOKENS = 50 diff --git a/indexer/index_glob.py b/mwmbl/indexer/index_glob.py similarity index 83% rename from indexer/index_glob.py rename to mwmbl/indexer/index_glob.py index 31decc3..e9102c2 100644 --- a/indexer/index_glob.py +++ b/mwmbl/indexer/index_glob.py @@ -4,12 +4,13 @@ from glob import glob import bs4 from spacy.lang.en import English -from index import tokenize -from tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE -from paths import INDEX_PATH, CRAWL_GLOB +from .index import tokenize +from mwmbl.tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE +from .paths import INDEX_PATH, CRAWL_GLOB def run(): + # TODO: item_factory argument is unfilled. indexer = TinyIndexer(INDEX_PATH, NUM_PAGES, PAGE_SIZE) indexer.create_if_not_exists() nlp = English() diff --git a/indexer/index_queue.py b/mwmbl/indexer/index_queue.py similarity index 74% rename from indexer/index_queue.py rename to mwmbl/indexer/index_queue.py index eadfd75..f048e28 100644 --- a/indexer/index_queue.py +++ b/mwmbl/indexer/index_queue.py @@ -3,10 +3,10 @@ Index items in the file-system queue """ from spacy.lang.en import English -from fsqueue import FSQueue, ZstdJsonSerializer -from index import index_titles_urls_and_extracts -from tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE -from paths import DATA_DIR, DOMAINS_TITLES_QUEUE_NAME, INDEX_PATH +from .fsqueue import FSQueue, ZstdJsonSerializer +from .index import index_titles_urls_and_extracts +from mwmbl.tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE +from .paths import DATA_DIR, DOMAINS_TITLES_QUEUE_NAME, INDEX_PATH def get_queue_items(): diff --git a/indexer/indexcc.py b/mwmbl/indexer/indexcc.py similarity index 82% rename from indexer/indexcc.py rename to mwmbl/indexer/indexcc.py index 549bb0e..4f68025 100644 --- a/indexer/indexcc.py +++ b/mwmbl/indexer/indexcc.py @@ -7,10 +7,10 @@ from logging import getLogger import spacy -from fsqueue import FSQueue, GzipJsonRowSerializer, FSQueueError -from index import index_titles_urls_and_extracts -from tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE, Document -from paths import INDEX_PATH, DATA_DIR, COMMON_CRAWL_TERMS_PATH +from .fsqueue import FSQueue, GzipJsonRowSerializer, FSQueueError +from .index import index_titles_urls_and_extracts +from mwmbl.tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE, Document +from .paths import INDEX_PATH, DATA_DIR, COMMON_CRAWL_TERMS_PATH logging.basicConfig(level=logging.DEBUG, stream=sys.stdout) diff --git a/indexer/paths.py b/mwmbl/indexer/paths.py similarity index 100% rename from indexer/paths.py rename to mwmbl/indexer/paths.py diff --git a/indexer/wiki.py b/mwmbl/indexer/wiki.py similarity index 85% rename from indexer/wiki.py rename to mwmbl/indexer/wiki.py index a3a66ff..93ac1c7 100644 --- a/indexer/wiki.py +++ b/mwmbl/indexer/wiki.py @@ -7,9 +7,9 @@ from urllib.parse import quote from spacy.lang.en import English -from indexer.index import index_titles_urls_and_extracts -from tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE -from indexer.paths import WIKI_TITLES_PATH, INDEX_PATH +from .index import index_titles_urls_and_extracts +from mwmbl.tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE +from .paths import WIKI_TITLES_PATH, INDEX_PATH TEXT_TAGS = ['mediawiki', 'page', 'revision', 'text'] TITLE_START = '