Make it easier to run mwmbl locally

Daoud Clarke 2022-12-07 20:01:31 +00:00
parent c0f89ba6c3
commit a50bc28436
8 changed files with 37 additions and 66 deletions


@@ -41,4 +41,4 @@ VOLUME ["/data"]
 EXPOSE 5000

 # Using the mwmbl-tinysearchengine binary/entrypoint which comes packaged with mwmbl
-CMD ["/venv/bin/mwmbl-tinysearchengine"]
+CMD ["/venv/bin/mwmbl-tinysearchengine", "--num-pages", "10240000", "--background"]


@@ -120,15 +120,27 @@ Development
 ===========

 ### Using Docker
-WARNING: this uses production settings and creates a **very** large index (around 40Gb)
 1. Create a new folder called `data` in the root of the repository
-2. Download the [index file](https://storage.googleapis.com/mwmbl/index.tinysearch) and place it the new data folder
-3. Run `$ docker build . -t mwmbl`
-4. Run `$ docker run -p 8080:8080 mwmbl`
+2. Run `$ docker build . -t mwmbl`
+3. Run `$ docker run -p 8080:8080 mwmbl`

 ### Local Testing
-1. Create and activate a python (3.10) environment using any tool you like e.g. poetry,venv, conda etc.
-2. Run `$ pip install .`
-3. Run `$ mwmbl-tinysearchengine --config config/tinysearchengine.yaml`
+This will run against a local test database without running background
+tasks to update batches etc.
+
+This is the simplest way to configure postgres, but you can set it up
+how you like as long as the `DATABASE_URL` you give is correct for
+your configuration.
+
+1. Install postgres and create a user for your current username
+2. Install [poetry](https://python-poetry.org/docs/#installation)
+3. Run `poetry install` to install dependencies
+4. Run `poetry shell` in the root directory to enter the virtual environment
+5. Run `$ DATABASE_URL="postgres://username@" python -m mwmbl.main`, replacing "username" with your username.
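One way to do step 1 on a typical Linux install (an assumption; the exact commands vary by platform) is `$ sudo -u postgres createuser $USER` followed by `$ sudo -u postgres createdb $USER`. A role and database named after your login are what let the `postgres://username@` URL in step 5 connect locally without a password.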
 Frequently Asked Questions
 =========================

BIN devdata/index-v2.tinysearch
Binary file not shown.


@@ -6,6 +6,7 @@ We store them in a directory on the local machine.
 import gzip
 import json
 import os
+from logging import getLogger
 from multiprocessing.pool import ThreadPool
 from pathlib import Path
 from tempfile import NamedTemporaryFile
@@ -19,6 +20,9 @@ from mwmbl.indexer.indexdb import IndexDatabase, BatchStatus
 from mwmbl.retry import retry_requests

+logger = getLogger(__name__)
+
+
 class BatchCache:
     num_threads = 20
@@ -30,7 +34,11 @@ class BatchCache:
         batches = {}
         for url in batch_urls:
             path = self.get_path_from_url(url)
-            data = gzip.GzipFile(path).read()
+            try:
+                data = gzip.GzipFile(path).read()
+            except FileNotFoundError:
+                logger.exception(f"Missing batch file: {path}")
+                continue
             batch = HashedBatch.parse_raw(data)
             batches[url] = batch
         return batches
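A standalone sketch of the new behaviour (the helper name and file paths below are hypothetical; only the try/except pattern comes from this change): a missing cache file is now logged and skipped rather than failing the whole load.

```python
import gzip
from logging import getLogger

logger = getLogger(__name__)


def read_cached(paths: list[str]) -> dict[str, bytes]:
    """Read gzipped batch files, skipping any that have gone missing."""
    batches = {}
    for path in paths:
        try:
            data = gzip.GzipFile(path).read()
        except FileNotFoundError:
            # One missing file no longer aborts the whole retrieval.
            logger.exception(f"Missing batch file: {path}")
            continue
        batches[path] = data
    return batches
```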


@@ -1,49 +0,0 @@
-"""
-Index data crawled through the Mwmbl crawler.
-"""
-import json
-from logging import getLogger
-
-import spacy
-
-from mwmbl.indexer.fsqueue import FSQueue, GzipJsonBlobSerializer, FSQueueError
-from mwmbl.indexer.index import index_titles_urls_and_extracts
-from mwmbl.indexer.paths import INDEX_PATH, MWMBL_CRAWL_TERMS_PATH, TINYSEARCH_DATA_DIR, LINK_COUNT_PATH
-from mwmbl.tinysearchengine.indexer import TinyIndex, Document, NUM_PAGES, PAGE_SIZE
-
-logger = getLogger(__name__)
-
-
-def index_mwmbl_crawl_data():
-    nlp = spacy.load("en_core_web_sm")
-    titles_urls_and_extracts = get_mwmbl_crawl_titles_urls_and_extracts()
-    link_counts = json.load(open(LINK_COUNT_PATH))
-
-    TinyIndex.create(Document, INDEX_PATH, NUM_PAGES, PAGE_SIZE)
-    with TinyIndex(Document, INDEX_PATH, 'w') as indexer:
-        index_titles_urls_and_extracts(indexer, nlp, titles_urls_and_extracts, link_counts, MWMBL_CRAWL_TERMS_PATH)
-
-
-def get_mwmbl_crawl_titles_urls_and_extracts():
-    input_queue = FSQueue(TINYSEARCH_DATA_DIR, 'mwmbl-search-items', GzipJsonBlobSerializer())
-    input_queue.unlock_all()
-
-    while True:
-        try:
-            next_item = input_queue.get()
-        except FSQueueError as e:
-            logger.exception(f'Error with item {e.item_id}')
-            input_queue.error(e.item_id)
-            continue
-        if next_item is None:
-            logger.info('Not more items to process, stopping')
-            break
-        item_id, item_data = next_item
-        logger.info(f'Processing item {item_id}')
-        for item in item_data['items']:
-            yield item['title'], item['url'], item['extract']
-        input_queue.done(item_id)
-
-
-if __name__ == '__main__':
-    index_mwmbl_crawl_data()


@@ -26,4 +26,4 @@ def run(batch_cache: BatchCache, start_status: BatchStatus, end_status: BatchStatus
     process(batch_data.values())

-    index_db.update_batch_status([batch.url for batch in batches], end_status)
+    index_db.update_batch_status(list(batch_data.keys()), end_status)


@@ -14,7 +14,7 @@ from mwmbl.indexer.batch_cache import BatchCache
 from mwmbl.indexer.paths import INDEX_NAME, BATCH_DIR_NAME
 from mwmbl.tinysearchengine import search
 from mwmbl.tinysearchengine.completer import Completer
-from mwmbl.tinysearchengine.indexer import TinyIndex, Document, NUM_PAGES, PAGE_SIZE
+from mwmbl.tinysearchengine.indexer import TinyIndex, Document, PAGE_SIZE
 from mwmbl.tinysearchengine.rank import HeuristicRanker

 logging.basicConfig(stream=sys.stdout, level=logging.INFO)
@@ -24,9 +24,10 @@ MODEL_PATH = Path(__file__).parent / 'resources' / 'model.pickle'

 def setup_args():
-    parser = argparse.ArgumentParser(description="mwmbl-tinysearchengine")
-    parser.add_argument("--data", help="Path to the tinysearchengine index file", default="/app/storage/")
-    parser.add_argument("--no-background", help="Disable running the background script to collect data",
+    parser = argparse.ArgumentParser(description="Mwmbl API server and background task processor")
+    parser.add_argument("--num-pages", help="Number of pages of memory (4096 bytes) to use for the index", default=2560, type=int)
+    parser.add_argument("--data", help="Path to the data folder for storing index and cached batches", default="./devdata")
+    parser.add_argument("--background", help="Enable running the background tasks to process batches",
                         action='store_true')
     args = parser.parse_args()
     return args
@@ -38,7 +39,7 @@ def run():
     index_path = Path(args.data) / INDEX_NAME
     try:
         existing_index = TinyIndex(item_factory=Document, index_path=index_path)
-        if existing_index.page_size != PAGE_SIZE or existing_index.num_pages != NUM_PAGES:
+        if existing_index.page_size != PAGE_SIZE or existing_index.num_pages != args.num_pages:
             print(f"Existing index page size ({existing_index.page_size}) and number of pages "
                   f"({existing_index.num_pages}) do not match - removing.")
             os.remove(index_path)
@@ -48,11 +49,11 @@ def run():
     if existing_index is None:
         print("Creating a new index")
-        TinyIndex.create(item_factory=Document, index_path=index_path, num_pages=NUM_PAGES, page_size=PAGE_SIZE)
+        TinyIndex.create(item_factory=Document, index_path=index_path, num_pages=args.num_pages, page_size=PAGE_SIZE)

     url_queue = Queue()

-    if not args.no_background:
+    if args.background:
         Process(target=background.run, args=(args.data, url_queue)).start()

     completer = Completer()
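Taken together with the README, a full local invocation using the new flags might look like `$ DATABASE_URL="postgres://username@" python -m mwmbl.main --num-pages 2560 --background` (illustrative only: `--num-pages 2560` just makes the default explicit, and `--background` enables batch processing).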


@@ -13,7 +13,6 @@ VERSION = 1
 METADATA_CONSTANT = b'mwmbl-tiny-search'
 METADATA_SIZE = 4096

-NUM_PAGES = 10_240_000
 PAGE_SIZE = 4096
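Removing the hard-coded NUM_PAGES (it now arrives via `--num-pages`) is what makes the small local index possible: with a fixed 4096-byte page, the index file size is simply num_pages × PAGE_SIZE. Checking the two values used in this commit:

```python
PAGE_SIZE = 4096  # bytes per index page, fixed in indexer.py

# Production value from the Dockerfile CMD: ~39 GiB, the "around 40Gb" index.
print(10_240_000 * PAGE_SIZE / 1024**3)  # 39.0625

# New local default for --num-pages: a 10 MiB index.
print(2_560 * PAGE_SIZE / 1024**2)       # 10.0
```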