renamed package to mwmbl

- renamed package to mwmbl in pyproject.toml
- moved the tinysearchengine and indexer modules into the mwmbl package folder
- left the analyse module as-is in the root of the repo
- import statements in tinysearchengine now use mwmbl.tinysearchengine
- import statements in indexer now use mwmbl.indexer, mwmbl.tinysearchengine, or relative imports like .paths
- import statements in analyse now use mwmbl.indexer or mwmbl.tinysearchengine
- the final CMD in the Dockerfile now uses the updated path mwmbl.tinysearchengine.app
- fixed a couple of import statement errors in tinysearchengine/indexer.py
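
In practice every change follows one pattern: top-level imports of tinysearchengine.* and indexer.* become absolute imports through the mwmbl package, and modules inside mwmbl.indexer that import a sibling switch to relative form. Condensed from the hunks below:

    # before: packages resolved from the repo root
    from tinysearchengine.indexer import TinyIndex, NUM_PAGES, PAGE_SIZE, Document
    from indexer.paths import INDEX_PATH

    # after: absolute imports through the mwmbl package ...
    from mwmbl.tinysearchengine.indexer import TinyIndex, NUM_PAGES, PAGE_SIZE, Document
    from mwmbl.indexer.paths import INDEX_PATH

    # ... or, within mwmbl.indexer itself, a relative import of the sibling module
    from .paths import INDEX_PATH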
nitred 2021-12-28 12:02:48 +01:00
parent acb2d19470
commit 11eedcde84
38 changed files with 54 additions and 53 deletions


@@ -31,4 +31,4 @@ COPY data /data
 #COPY docker-entrypoint.sh wsgi.py ./
 #CMD ["./docker-entrypoint.sh"]
 
-CMD ["/venv/bin/python", "-m", "tinysearchengine.app", "/data/index.tinysearch"]
+CMD ["/venv/bin/python", "-m", "mwmbl.tinysearchengine.app", "/data/index.tinysearch"]


@@ -1,5 +1,5 @@
-from tinysearchengine.indexer import TinyIndex, NUM_PAGES, PAGE_SIZE, Document
-from indexer.paths import INDEX_PATH
+from mwmbl.tinysearchengine.indexer import TinyIndex, NUM_PAGES, PAGE_SIZE, Document
+from mwmbl.indexer.paths import INDEX_PATH
 
 
 def get_items():


@@ -5,8 +5,8 @@ import os
 from itertools import islice
 from urllib.parse import quote
 
-from indexer.paths import DATA_DIR
-from indexer.wiki import get_wiki_titles_and_urls
+from mwmbl.indexer.paths import DATA_DIR
+from mwmbl.indexer.wiki import get_wiki_titles_and_urls
 
 URL_TEMPLATE = "http://localhost:8000/complete?q={}"
 CURL_FILE = os.path.join(DATA_DIR, "urls.curl")


@@ -8,11 +8,11 @@ import numpy as np
 from spacy.lang.en import English
 from starlette.testclient import TestClient
 
-from tinysearchengine import create_app
-from indexer.fsqueue import ZstdJsonSerializer
-from indexer.index import index_titles_urls_and_extracts
-from tinysearchengine.indexer import TinyIndex, TinyIndexer, Document
-from indexer.paths import TEST_INDEX_PATH, DATA_DIR, TEST_TERMS_PATH
+from mwmbl.tinysearchengine import create_app
+from mwmbl.indexer.fsqueue import ZstdJsonSerializer
+from mwmbl.indexer.index import index_titles_urls_and_extracts
+from mwmbl.tinysearchengine.indexer import TinyIndex, TinyIndexer, Document
+from mwmbl.indexer.paths import TEST_INDEX_PATH, DATA_DIR, TEST_TERMS_PATH
 
 NUM_DOCUMENTS = 30000
 NUM_PAGES_FOR_STATS = 10


@@ -10,7 +10,7 @@ from traceback import print_tb, print_exc
 import pandas as pd
 import requests
 
-from paths import DATA_DIR, HN_TOP_PATH, CRAWL_PREFIX
+from .paths import DATA_DIR, HN_TOP_PATH, CRAWL_PREFIX
 
 
 def crawl():


@@ -8,8 +8,8 @@ from urllib.parse import urlsplit, urlunsplit
 import bs4
 import requests
 
-from indexer.fsqueue import FSQueue, ZstdJsonSerializer
-from indexer.paths import DATA_DIR, DOMAINS_QUEUE_NAME, DOMAINS_TITLES_QUEUE_NAME
+from mwmbl.indexer.fsqueue import FSQueue, ZstdJsonSerializer
+from mwmbl.indexer.paths import DATA_DIR, DOMAINS_QUEUE_NAME, DOMAINS_TITLES_QUEUE_NAME
 
 
 NUM_PROCESSES = 10


@@ -4,8 +4,8 @@ Add domains to the queue to be retrieved
 import csv
 import gzip
 
-from indexer.fsqueue import FSQueue, ZstdJsonSerializer
-from indexer.paths import DOMAINS_PATH, DOMAINS_QUEUE_NAME, DATA_DIR
+from mwmbl.indexer.fsqueue import FSQueue, ZstdJsonSerializer
+from mwmbl.indexer.paths import DOMAINS_PATH, DOMAINS_QUEUE_NAME, DATA_DIR
 
 
 BATCH_SIZE = 250


@@ -4,9 +4,9 @@ import os
 from glob import glob
 from multiprocessing import Process, Lock
 
-from extract_process import fetch_process_warc_records
-from fsqueue import FSQueue, GzipJsonRowSerializer
-from paths import DATA_DIR
+from .extract_process import fetch_process_warc_records
+from .fsqueue import FSQueue, GzipJsonRowSerializer
+from .paths import DATA_DIR
 
 
 ARCHIVE_INFO_GLOB = 'outputs/records/*.gz'


@@ -10,7 +10,7 @@ import pandas as pd
 # NUM_PAGES = 8192
 # PAGE_SIZE = 512
 
-from tinysearchengine.indexer import TinyIndexer, Document, TokenizedDocument
+from mwmbl.tinysearchengine.indexer import TinyIndexer, Document, TokenizedDocument
 
 
 NUM_INITIAL_TOKENS = 50


@@ -4,12 +4,13 @@ from glob import glob
 import bs4
 from spacy.lang.en import English
 
-from index import tokenize
-from tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE
-from paths import INDEX_PATH, CRAWL_GLOB
+from .index import tokenize
+from mwmbl.tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE
+from .paths import INDEX_PATH, CRAWL_GLOB
 
 
 def run():
+    # TODO: item_factory argument is unfilled.
     indexer = TinyIndexer(INDEX_PATH, NUM_PAGES, PAGE_SIZE)
     indexer.create_if_not_exists()
     nlp = English()


@@ -3,10 +3,10 @@ Index items in the file-system queue
 """
 from spacy.lang.en import English
 
-from fsqueue import FSQueue, ZstdJsonSerializer
-from index import index_titles_urls_and_extracts
-from tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE
-from paths import DATA_DIR, DOMAINS_TITLES_QUEUE_NAME, INDEX_PATH
+from .fsqueue import FSQueue, ZstdJsonSerializer
+from .index import index_titles_urls_and_extracts
+from mwmbl.tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE
+from .paths import DATA_DIR, DOMAINS_TITLES_QUEUE_NAME, INDEX_PATH
 
 
 def get_queue_items():


@@ -7,10 +7,10 @@ from logging import getLogger
 
 import spacy
 
-from fsqueue import FSQueue, GzipJsonRowSerializer, FSQueueError
-from index import index_titles_urls_and_extracts
-from tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE, Document
-from paths import INDEX_PATH, DATA_DIR, COMMON_CRAWL_TERMS_PATH
+from .fsqueue import FSQueue, GzipJsonRowSerializer, FSQueueError
+from .index import index_titles_urls_and_extracts
+from mwmbl.tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE, Document
+from .paths import INDEX_PATH, DATA_DIR, COMMON_CRAWL_TERMS_PATH
 
 
 logging.basicConfig(level=logging.DEBUG, stream=sys.stdout)


@@ -7,9 +7,9 @@ from urllib.parse import quote
 from spacy.lang.en import English
 
-from indexer.index import index_titles_urls_and_extracts
-from tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE
-from indexer.paths import WIKI_TITLES_PATH, INDEX_PATH
+from .index import index_titles_urls_and_extracts
+from mwmbl.tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE
+from .paths import WIKI_TITLES_PATH, INDEX_PATH
 
 
 TEXT_TAGS = ['mediawiki', 'page', 'revision', 'text']
 TITLE_START = '<title>Wikipedia: '


@@ -0,0 +1,17 @@
+import logging
+import sys
+
+import uvicorn
+
+from mwmbl.tinysearchengine import create_app
+from mwmbl.tinysearchengine.indexer import TinyIndex, NUM_PAGES, PAGE_SIZE, Document
+
+logging.basicConfig()
+
+index_path = sys.argv[1]
+tiny_index = TinyIndex(Document, index_path, NUM_PAGES, PAGE_SIZE)
+
+app = create_app.create(tiny_index)
+
+if __name__ == "__main__":
+    uvicorn.run("mwmbl.tinysearchengine.app:app", host="0.0.0.0", port=8080, log_level="info")


@@ -7,7 +7,7 @@ from fastapi import FastAPI
 from starlette.responses import FileResponse
 from starlette.staticfiles import StaticFiles
 
-from tinysearchengine.indexer import TinyIndex, Document
+from mwmbl.tinysearchengine.indexer import TinyIndex, Document
 
 
 logger = getLogger(__name__)


@@ -6,7 +6,7 @@ from pathlib import Path
 from typing import TypeVar, Generic, Callable, List
 
 import mmh3
-from zstandard import ZstdDecompressor
+from zstandard import ZstdDecompressor, ZstdCompressor, ZstdError
 
 
 NUM_PAGES = 25600
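
This is the import fix called out in the commit message: indexer.py presumably already referenced ZstdCompressor and ZstdError without importing them. The three names cover a compress/decompress round trip plus its failure mode, roughly like this sketch (made-up payload, not the indexer's actual page format):

    from zstandard import ZstdCompressor, ZstdDecompressor, ZstdError

    compressor = ZstdCompressor()
    decompressor = ZstdDecompressor()

    page = b'{"url": "https://example.com", "title": "Example"}'
    compressed = compressor.compress(page)

    try:
        original = decompressor.decompress(compressed)
    except ZstdError:
        # raised when the stored bytes are not a valid zstd frame
        original = None

    assert original == page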


@@ -1,5 +1,5 @@
 [tool.poetry]
-name = "tinysearchengine"
+name = "mwmbl"
 version = "0.1.0"
 description = ""
 authors = ["Daoud Clarke <daoud.clarke@gmail.com>"]


@@ -1,17 +0,0 @@
-import logging
-import sys
-
-import uvicorn
-
-from tinysearchengine import create_app
-from tinysearchengine.indexer import TinyIndex, NUM_PAGES, PAGE_SIZE, Document
-
-logging.basicConfig()
-
-index_path = sys.argv[1]
-tiny_index = TinyIndex(Document, index_path, NUM_PAGES, PAGE_SIZE)
-
-app = create_app.create(tiny_index)
-
-if __name__ == "__main__":
-    uvicorn.run("tinysearchengine.app:app", host="0.0.0.0", port=8080, log_level="info")