Make it easier to run mwmbl locally
parent c0f89ba6c3 · commit a50bc28436

8 changed files with 37 additions and 66 deletions
```diff
@@ -41,4 +41,4 @@ VOLUME ["/data"]
 EXPOSE 5000
 
 # Using the mwmbl-tinysearchengine binary/entrypoint which comes packaged with mwmbl
-CMD ["/venv/bin/mwmbl-tinysearchengine"]
+CMD ["/venv/bin/mwmbl-tinysearchengine", "--num-pages", "10240000", "--background"]
```
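For a sense of scale (a back-of-envelope sketch, not part of the commit): assuming the index file is roughly `num_pages` × `PAGE_SIZE` bytes, with `PAGE_SIZE = 4096` from the indexer constants at the end of this diff, the figure the Dockerfile now passes and the new local default compare like this:

```python
# Back-of-envelope index sizes implied by this commit (illustrative only).
PAGE_SIZE = 4096                # bytes per page (PAGE_SIZE in mwmbl/tinysearchengine/indexer.py)

production_pages = 10_240_000   # what the Dockerfile CMD passes via --num-pages
local_default_pages = 2560      # the new setup_args() default in mwmbl/main.py

print(f"production:    {production_pages * PAGE_SIZE / 1024**3:.1f} GiB")    # ~39.1 GiB, the "around 40Gb" README warning
print(f"local default: {local_default_pages * PAGE_SIZE / 1024**2:.0f} MiB")  # 10 MiB
```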
README.md (24 changed lines)
```diff
@@ -120,15 +120,27 @@ Development
 ===========
 
 ### Using Docker
+
+WARNING: this uses production settings and creates a **very** large index (around 40Gb)
+
 1. Create a new folder called `data` in the root of the repository
-2. Download the [index file](https://storage.googleapis.com/mwmbl/index.tinysearch) and place it the new data folder
-3. Run `$ docker build . -t mwmbl`
-4. Run `$ docker run -p 8080:8080 mwmbl`
+2. Run `$ docker build . -t mwmbl`
+3. Run `$ docker run -p 8080:8080 mwmbl`
 
 ### Local Testing
-1. Create and activate a python (3.10) environment using any tool you like e.g. poetry,venv, conda etc.
-2. Run `$ pip install .`
-3. Run `$ mwmbl-tinysearchengine --config config/tinysearchengine.yaml`
+
+This will run against a local test database without running background
+tasks to update batches etc.
+
+This is the simplest way to configure postgres, but you can set it up
+how you like as long as the `DATABASE_URL` you give is correct for
+your configuration.
+
+1. Install postgres and create a user for your current username
+2. Install [poetry](https://python-poetry.org/docs/#installation)
+3. Run `poetry install` to install dependencies
+4. Run `poetry shell` in the root directory to enter the virtual environment
+5. Run `$ DATABASE_URL="postgres://username@" python -m mwmbl.main` replacing "username" with your username.
 
 Frequently Asked Question
 =========================
```
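The new local-testing step 5 relies on a `DATABASE_URL` of the form `postgres://username@`, i.e. a user with no password and the default local host. As a quick sanity check (a hypothetical helper, not part of mwmbl), the standard library shows how such a URL breaks down:

```python
# Hypothetical helper, not mwmbl code: inspect the DATABASE_URL you intend to use.
import os
from urllib.parse import urlsplit

url = os.environ.get("DATABASE_URL", "postgres://username@")
parts = urlsplit(url)
print(parts.scheme)    # postgres
print(parts.username)  # the postgres user created in step 1 (literally "username" for the default above)
print(parts.hostname)  # None, meaning no host was given and the local default applies
```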
BIN devdata/index-v2.tinysearch (binary file not shown)
```diff
@@ -6,6 +6,7 @@ We store them in a directory on the local machine.
 import gzip
 import json
 import os
+from logging import getLogger
 from multiprocessing.pool import ThreadPool
 from pathlib import Path
 from tempfile import NamedTemporaryFile
@@ -19,6 +20,9 @@ from mwmbl.indexer.indexdb import IndexDatabase, BatchStatus
 from mwmbl.retry import retry_requests
 
 
+logger = getLogger(__name__)
+
+
 class BatchCache:
     num_threads = 20
 
@@ -30,7 +34,11 @@ class BatchCache:
         batches = {}
         for url in batch_urls:
             path = self.get_path_from_url(url)
-            data = gzip.GzipFile(path).read()
+            try:
+                data = gzip.GzipFile(path).read()
+            except FileNotFoundError:
+                logger.exception(f"Missing batch file: {path}")
+                continue
             batch = HashedBatch.parse_raw(data)
             batches[url] = batch
         return batches
```
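The third hunk above changes the batch-loading loop so that a single missing file on disk is logged and skipped rather than aborting the whole load. A minimal standalone sketch of the same pattern (the path below is made up):

```python
# Sketch of the skip-on-missing-file pattern used above; not mwmbl code.
import gzip
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def read_cached(paths):
    data_by_path = {}
    for path in paths:
        try:
            data_by_path[path] = gzip.GzipFile(path).read()
        except FileNotFoundError:
            # logger.exception records the message plus the traceback, then the
            # loop carries on with the remaining paths instead of raising.
            logger.exception(f"Missing batch file: {path}")
            continue
    return data_by_path


print(read_cached(["/tmp/does-not-exist.json.gz"]))  # {} plus a logged traceback
```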
```diff
@@ -1,49 +0,0 @@
-"""
-Index data crawled through the Mwmbl crawler.
-"""
-import json
-from logging import getLogger
-
-import spacy
-
-from mwmbl.indexer.fsqueue import FSQueue, GzipJsonBlobSerializer, FSQueueError
-from mwmbl.indexer.index import index_titles_urls_and_extracts
-from mwmbl.indexer.paths import INDEX_PATH, MWMBL_CRAWL_TERMS_PATH, TINYSEARCH_DATA_DIR, LINK_COUNT_PATH
-from mwmbl.tinysearchengine.indexer import TinyIndex, Document, NUM_PAGES, PAGE_SIZE
-
-
-logger = getLogger(__name__)
-
-
-def index_mwmbl_crawl_data():
-    nlp = spacy.load("en_core_web_sm")
-    titles_urls_and_extracts = get_mwmbl_crawl_titles_urls_and_extracts()
-    link_counts = json.load(open(LINK_COUNT_PATH))
-
-    TinyIndex.create(Document, INDEX_PATH, NUM_PAGES, PAGE_SIZE)
-    with TinyIndex(Document, INDEX_PATH, 'w') as indexer:
-        index_titles_urls_and_extracts(indexer, nlp, titles_urls_and_extracts, link_counts, MWMBL_CRAWL_TERMS_PATH)
-
-
-def get_mwmbl_crawl_titles_urls_and_extracts():
-    input_queue = FSQueue(TINYSEARCH_DATA_DIR, 'mwmbl-search-items', GzipJsonBlobSerializer())
-    input_queue.unlock_all()
-    while True:
-        try:
-            next_item = input_queue.get()
-        except FSQueueError as e:
-            logger.exception(f'Error with item {e.item_id}')
-            input_queue.error(e.item_id)
-            continue
-        if next_item is None:
-            logger.info('Not more items to process, stopping')
-            break
-        item_id, item_data = next_item
-        logger.info(f'Processing item {item_id}')
-        for item in item_data['items']:
-            yield item['title'], item['url'], item['extract']
-        input_queue.done(item_id)
-
-
-if __name__ == '__main__':
-    index_mwmbl_crawl_data()
```
```diff
@@ -26,4 +26,4 @@ def run(batch_cache: BatchCache, start_status: BatchStatus, end_status: BatchStatus
 
     process(batch_data.values())
 
-    index_db.update_batch_status([batch.url for batch in batches], end_status)
+    index_db.update_batch_status(list(batch_data.keys()), end_status)
```
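This change uses the cached batches themselves as the source of truth for the status update: judging by the `BatchCache` hunk above, `batch_data` maps batch URL to parsed `HashedBatch`, so its keys are exactly the batches that were actually loaded, whereas the old line referred to a separate `batches` collection. Illustrative shape only:

```python
# Illustrative only: the assumed shape of batch_data flowing through run() above.
batch_data = {"https://example.com/batch/1.json.gz": "<HashedBatch>"}
print(list(batch_data.keys()))  # the URLs handed to index_db.update_batch_status(...)
```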
```diff
@@ -14,7 +14,7 @@ from mwmbl.indexer.batch_cache import BatchCache
 from mwmbl.indexer.paths import INDEX_NAME, BATCH_DIR_NAME
 from mwmbl.tinysearchengine import search
 from mwmbl.tinysearchengine.completer import Completer
-from mwmbl.tinysearchengine.indexer import TinyIndex, Document, NUM_PAGES, PAGE_SIZE
+from mwmbl.tinysearchengine.indexer import TinyIndex, Document, PAGE_SIZE
 from mwmbl.tinysearchengine.rank import HeuristicRanker
 
 logging.basicConfig(stream=sys.stdout, level=logging.INFO)
@@ -24,9 +24,10 @@ MODEL_PATH = Path(__file__).parent / 'resources' / 'model.pickle'
 
 
 def setup_args():
-    parser = argparse.ArgumentParser(description="mwmbl-tinysearchengine")
-    parser.add_argument("--data", help="Path to the tinysearchengine index file", default="/app/storage/")
-    parser.add_argument("--no-background", help="Disable running the background script to collect data",
+    parser = argparse.ArgumentParser(description="Mwmbl API server and background task processor")
+    parser.add_argument("--num-pages", help="Number of pages of memory (4096 bytes) to use for the index", default=2560)
+    parser.add_argument("--data", help="Path to the data folder for storing index and cached batches", default="./devdata")
+    parser.add_argument("--background", help="Enable running the background tasks to process batches",
                         action='store_true')
     args = parser.parse_args()
     return args
@@ -38,7 +39,7 @@ def run():
     index_path = Path(args.data) / INDEX_NAME
     try:
         existing_index = TinyIndex(item_factory=Document, index_path=index_path)
-        if existing_index.page_size != PAGE_SIZE or existing_index.num_pages != NUM_PAGES:
+        if existing_index.page_size != PAGE_SIZE or existing_index.num_pages != args.num_pages:
             print(f"Existing index page sizes ({existing_index.page_size}) and number of pages "
                   f"({existing_index.num_pages}) does not match - removing.")
             os.remove(index_path)
@@ -48,11 +49,11 @@ def run():
 
     if existing_index is None:
         print("Creating a new index")
-        TinyIndex.create(item_factory=Document, index_path=index_path, num_pages=NUM_PAGES, page_size=PAGE_SIZE)
+        TinyIndex.create(item_factory=Document, index_path=index_path, num_pages=args.num_pages, page_size=PAGE_SIZE)
 
     url_queue = Queue()
 
-    if not args.no_background:
+    if args.background:
         Process(target=background.run, args=(args.data, url_queue)).start()
 
     completer = Completer()
```
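To see what the reworked command-line arguments do, here is a standalone copy of the parser from the new `setup_args()` (a sketch for experimenting; the argument definitions are copied from the hunk above):

```python
# Standalone copy of the new argument parser, mirroring setup_args() above.
import argparse

parser = argparse.ArgumentParser(description="Mwmbl API server and background task processor")
parser.add_argument("--num-pages", help="Number of pages of memory (4096 bytes) to use for the index", default=2560)
parser.add_argument("--data", help="Path to the data folder for storing index and cached batches", default="./devdata")
parser.add_argument("--background", help="Enable running the background tasks to process batches",
                    action='store_true')

# No flags: the defaults used by a plain `python -m mwmbl.main` as described in the README.
print(parser.parse_args([]))
# -> Namespace(background=False, data='./devdata', num_pages=2560)

# What the Dockerfile CMD passes. Note that argparse keeps values supplied on the
# command line as strings, since --num-pages declares no type=.
print(parser.parse_args(["--num-pages", "10240000", "--background"]))
# -> Namespace(background=True, data='./devdata', num_pages='10240000')
```

With the defaults everything lives under `./devdata`, the same folder where this commit checks in the `devdata/index-v2.tinysearch` binary.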
```diff
@@ -13,7 +13,6 @@ VERSION = 1
 METADATA_CONSTANT = b'mwmbl-tiny-search'
 METADATA_SIZE = 4096
 
-NUM_PAGES = 10_240_000
 PAGE_SIZE = 4096
 
 
```