renamed package to mwmbl

- renamed package to mwmbl in pyproject.toml
- tinysearchengine and indexer modules have been moved into the mwmbl package folder
- the analyse module has been left as is in the root of the repo
- import statements in tinysearchengine now use mwmbl.tinysearchengine
- import statements in indexer now use mwmbl.indexer, mwmbl.tinysearchengine, or relative imports like .paths
- import statements in analyse now use mwmbl.indexer or mwmbl.tinysearchengine
- the final CMD in the Dockerfile now uses the updated path mwmbl.tinysearchengine.app
- fixed a couple of import statement errors in tinysearchengine/indexer.py

parent acb2d19470 · commit 11eedcde84
38 changed files with 54 additions and 53 deletions
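
The effect on client code is mechanical: every import that used to target the top-level tinysearchengine or indexer packages now goes through the mwmbl namespace. A minimal before/after sketch (names taken from the diffs below):

# Before the rename:
#   from tinysearchengine.indexer import TinyIndex
#   from indexer.paths import INDEX_PATH

# After the rename:
from mwmbl.tinysearchengine.indexer import TinyIndex
from mwmbl.indexer.paths import INDEX_PATH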
Dockerfile
@@ -31,4 +31,4 @@ COPY data /data
 #COPY docker-entrypoint.sh wsgi.py ./
 #CMD ["./docker-entrypoint.sh"]

-CMD ["/venv/bin/python", "-m", "tinysearchengine.app", "/data/index.tinysearch"]
+CMD ["/venv/bin/python", "-m", "mwmbl.tinysearchengine.app", "/data/index.tinysearch"]
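Because the container starts the server with `python -m`, the updated CMD only works if the mwmbl package is importable inside the venv. A quick sanity check that could be run in the image (a sketch, not part of the commit):

import importlib.util

# find_spec returns None when the module cannot be resolved on the current path.
assert importlib.util.find_spec("mwmbl.tinysearchengine.app") is not None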
@@ -1,5 +1,5 @@
-from tinysearchengine.indexer import TinyIndex, NUM_PAGES, PAGE_SIZE, Document
-from indexer.paths import INDEX_PATH
+from mwmbl.tinysearchengine.indexer import TinyIndex, NUM_PAGES, PAGE_SIZE, Document
+from mwmbl.indexer.paths import INDEX_PATH


 def get_items():
@@ -5,8 +5,8 @@ import os
 from itertools import islice
 from urllib.parse import quote

-from indexer.paths import DATA_DIR
-from indexer.wiki import get_wiki_titles_and_urls
+from mwmbl.indexer.paths import DATA_DIR
+from mwmbl.indexer.wiki import get_wiki_titles_and_urls

 URL_TEMPLATE = "http://localhost:8000/complete?q={}"
 CURL_FILE = os.path.join(DATA_DIR, "urls.curl")
@@ -8,11 +8,11 @@ import numpy as np
 from spacy.lang.en import English
 from starlette.testclient import TestClient

-from tinysearchengine import create_app
-from indexer.fsqueue import ZstdJsonSerializer
-from indexer.index import index_titles_urls_and_extracts
-from tinysearchengine.indexer import TinyIndex, TinyIndexer, Document
-from indexer.paths import TEST_INDEX_PATH, DATA_DIR, TEST_TERMS_PATH
+from mwmbl.tinysearchengine import create_app
+from mwmbl.indexer.fsqueue import ZstdJsonSerializer
+from mwmbl.indexer.index import index_titles_urls_and_extracts
+from mwmbl.tinysearchengine.indexer import TinyIndex, TinyIndexer, Document
+from mwmbl.indexer.paths import TEST_INDEX_PATH, DATA_DIR, TEST_TERMS_PATH

 NUM_DOCUMENTS = 30000
 NUM_PAGES_FOR_STATS = 10
@@ -10,7 +10,7 @@ from traceback import print_tb, print_exc
 import pandas as pd
 import requests

-from paths import DATA_DIR, HN_TOP_PATH, CRAWL_PREFIX
+from .paths import DATA_DIR, HN_TOP_PATH, CRAWL_PREFIX


 def crawl():
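Worth noting: the switch from `from paths import ...` to the relative `from .paths import ...` means these indexer modules can no longer be executed as loose scripts. A relative import only resolves when the module is loaded as part of its package, e.g. via `python -m mwmbl.indexer.<module>` (the actual module name is not shown in this diff); running the file directly raises `ImportError: attempted relative import with no known parent package`.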
@@ -8,8 +8,8 @@ from urllib.parse import urlsplit, urlunsplit
 import bs4
 import requests

-from indexer.fsqueue import FSQueue, ZstdJsonSerializer
-from indexer.paths import DATA_DIR, DOMAINS_QUEUE_NAME, DOMAINS_TITLES_QUEUE_NAME
+from mwmbl.indexer.fsqueue import FSQueue, ZstdJsonSerializer
+from mwmbl.indexer.paths import DATA_DIR, DOMAINS_QUEUE_NAME, DOMAINS_TITLES_QUEUE_NAME

 NUM_PROCESSES = 10

@@ -4,8 +4,8 @@ Add domains to the queue to be retrieved
 import csv
 import gzip

-from indexer.fsqueue import FSQueue, ZstdJsonSerializer
-from indexer.paths import DOMAINS_PATH, DOMAINS_QUEUE_NAME, DATA_DIR
+from mwmbl.indexer.fsqueue import FSQueue, ZstdJsonSerializer
+from mwmbl.indexer.paths import DOMAINS_PATH, DOMAINS_QUEUE_NAME, DATA_DIR

 BATCH_SIZE = 250

@@ -4,9 +4,9 @@ import os
 from glob import glob
 from multiprocessing import Process, Lock

-from extract_process import fetch_process_warc_records
-from fsqueue import FSQueue, GzipJsonRowSerializer
-from paths import DATA_DIR
+from .extract_process import fetch_process_warc_records
+from .fsqueue import FSQueue, GzipJsonRowSerializer
+from .paths import DATA_DIR

 ARCHIVE_INFO_GLOB = 'outputs/records/*.gz'

@@ -10,7 +10,7 @@ import pandas as pd

 # NUM_PAGES = 8192
 # PAGE_SIZE = 512
-from tinysearchengine.indexer import TinyIndexer, Document, TokenizedDocument
+from mwmbl.tinysearchengine.indexer import TinyIndexer, Document, TokenizedDocument

 NUM_INITIAL_TOKENS = 50

@@ -4,12 +4,13 @@ from glob import glob
 import bs4
 from spacy.lang.en import English

-from index import tokenize
-from tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE
-from paths import INDEX_PATH, CRAWL_GLOB
+from .index import tokenize
+from mwmbl.tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE
+from .paths import INDEX_PATH, CRAWL_GLOB


 def run():
+    # TODO: item_factory argument is unfilled.
     indexer = TinyIndexer(INDEX_PATH, NUM_PAGES, PAGE_SIZE)
     indexer.create_if_not_exists()
     nlp = English()
@@ -3,10 +3,10 @@ Index items in the file-system queue
 """
 from spacy.lang.en import English

-from fsqueue import FSQueue, ZstdJsonSerializer
-from index import index_titles_urls_and_extracts
-from tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE
-from paths import DATA_DIR, DOMAINS_TITLES_QUEUE_NAME, INDEX_PATH
+from .fsqueue import FSQueue, ZstdJsonSerializer
+from .index import index_titles_urls_and_extracts
+from mwmbl.tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE
+from .paths import DATA_DIR, DOMAINS_TITLES_QUEUE_NAME, INDEX_PATH


 def get_queue_items():
@@ -7,10 +7,10 @@ from logging import getLogger

 import spacy

-from fsqueue import FSQueue, GzipJsonRowSerializer, FSQueueError
-from index import index_titles_urls_and_extracts
-from tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE, Document
-from paths import INDEX_PATH, DATA_DIR, COMMON_CRAWL_TERMS_PATH
+from .fsqueue import FSQueue, GzipJsonRowSerializer, FSQueueError
+from .index import index_titles_urls_and_extracts
+from mwmbl.tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE, Document
+from .paths import INDEX_PATH, DATA_DIR, COMMON_CRAWL_TERMS_PATH


 logging.basicConfig(level=logging.DEBUG, stream=sys.stdout)
@@ -7,9 +7,9 @@ from urllib.parse import quote

 from spacy.lang.en import English

-from indexer.index import index_titles_urls_and_extracts
-from tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE
-from indexer.paths import WIKI_TITLES_PATH, INDEX_PATH
+from .index import index_titles_urls_and_extracts
+from mwmbl.tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE
+from .paths import WIKI_TITLES_PATH, INDEX_PATH

 TEXT_TAGS = ['mediawiki', 'page', 'revision', 'text']
 TITLE_START = '<title>Wikipedia: '
mwmbl/tinysearchengine/__init__.py (new, empty file)
mwmbl/tinysearchengine/app.py (new file, 17 additions)
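
The empty `__init__.py` marks the new directories as regular Python packages, so `mwmbl.tinysearchengine` resolves explicitly rather than relying on implicit namespace packages, and it keeps the package discoverable by Poetry's default packaging.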
@@ -0,0 +1,17 @@
+import logging
+import sys
+
+import uvicorn
+
+from mwmbl.tinysearchengine import create_app
+from mwmbl.tinysearchengine.indexer import TinyIndex, NUM_PAGES, PAGE_SIZE, Document
+
+logging.basicConfig()
+
+
+index_path = sys.argv[1]
+tiny_index = TinyIndex(Document, index_path, NUM_PAGES, PAGE_SIZE)
+app = create_app.create(tiny_index)
+
+if __name__ == "__main__":
+    uvicorn.run("mwmbl.tinysearchengine.app:app", host="0.0.0.0", port=8080, log_level="info")
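Two details of the new app.py are worth calling out. First, `index_path = sys.argv[1]` runs at import time, so the module has to be launched with the index path as its first argument, exactly as the Dockerfile CMD does:

#   python -m mwmbl.tinysearchengine.app /data/index.tinysearch

Second, the app is handed to uvicorn as the string "mwmbl.tinysearchengine.app:app", which uvicorn re-imports by its dotted path at startup, so the rename has to be reflected in this string as well as in the import statements.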
@@ -7,7 +7,7 @@ from fastapi import FastAPI
 from starlette.responses import FileResponse
 from starlette.staticfiles import StaticFiles

-from tinysearchengine.indexer import TinyIndex, Document
+from mwmbl.tinysearchengine.indexer import TinyIndex, Document

 logger = getLogger(__name__)

@@ -6,7 +6,7 @@ from pathlib import Path
 from typing import TypeVar, Generic, Callable, List

 import mmh3
-from zstandard import ZstdDecompressor
+from zstandard import ZstdDecompressor, ZstdCompressor, ZstdError


 NUM_PAGES = 25600
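This is evidently one of the "import statement errors" mentioned in the commit message: tinysearchengine/indexer.py used ZstdCompressor and ZstdError without importing them. A minimal round-trip sketch of how the three names fit together (illustrative only, not the indexer's actual code):

from zstandard import ZstdCompressor, ZstdDecompressor, ZstdError

compressed = ZstdCompressor().compress(b'{"title": "example"}')
try:
    original = ZstdDecompressor().decompress(compressed)
except ZstdError:
    # Raised when the bytes are not a valid zstd frame (e.g. an uninitialised page).
    original = None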
pyproject.toml
@@ -1,5 +1,5 @@
 [tool.poetry]
-name = "tinysearchengine"
+name = "mwmbl"
 version = "0.1.0"
 description = ""
 authors = ["Daoud Clarke <daoud.clarke@gmail.com>"]
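
A side effect of the Poetry rename: when no explicit `packages` list is configured, Poetry packages the top-level directory matching the project name, so renaming the project to mwmbl and moving the modules under a mwmbl/ folder keeps the two in sync.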
tinysearchengine/app.py (deleted)
@@ -1,17 +0,0 @@
-import logging
-import sys
-
-import uvicorn
-
-from tinysearchengine import create_app
-from tinysearchengine.indexer import TinyIndex, NUM_PAGES, PAGE_SIZE, Document
-
-logging.basicConfig()
-
-
-index_path = sys.argv[1]
-tiny_index = TinyIndex(Document, index_path, NUM_PAGES, PAGE_SIZE)
-app = create_app.create(tiny_index)
-
-if __name__ == "__main__":
-    uvicorn.run("tinysearchengine.app:app", host="0.0.0.0", port=8080, log_level="info")