Merge branch 'master' into user-registration

Daoud Clarke 2022-12-19 21:53:11 +00:00
commit 5eab543f3b
8 changed files with 41 additions and 69 deletions

View file

@@ -41,4 +41,4 @@ VOLUME ["/data"]
 EXPOSE 5000
 # Using the mwmbl-tinysearchengine binary/entrypoint which comes packaged with mwmbl
-CMD ["/venv/bin/mwmbl-tinysearchengine"]
+CMD ["/venv/bin/mwmbl-tinysearchengine", "--num-pages", "10240000", "--background"]

View file

@@ -119,16 +119,29 @@ author (email address is in the git commit history).
 Development
 ===========
 
 ### Using Docker
 1. Create a new folder called `data` in the root of the repository
 2. Download the [index file](https://storage.googleapis.com/mwmbl/index.tinysearch) and place it in the new data folder
 3. Run `$ docker build . -t mwmbl`
 4. Run `$ docker run -p 8080:8080 mwmbl`
 
 ### Local Testing
-1. Create and activate a python (3.10) environment using any tool you like, e.g. poetry, venv, conda etc.
-2. Run `$ pip install .`
-3. Run `$ mwmbl-tinysearchengine --config config/tinysearchengine.yaml`
+This will run against a local test database without running the background
+tasks that update batches etc.
+
+The steps below are the simplest way to configure postgres, but you can set it up
+however you like as long as the `DATABASE_URL` you give is correct for
+your configuration.
+
+1. Install postgres and create a user for your current username
+2. Install [poetry](https://python-poetry.org/docs/#installation)
+3. Run `poetry install` to install dependencies
+4. Run `poetry shell` in the root directory to enter the virtual environment
+5. Run `$ DATABASE_URL="postgres://username@" python -m mwmbl.main`, replacing "username" with your username (a quick connection check is sketched after this diff)
+
+### Using Dokku
+Note: this method is not recommended as it is more involved, and your index will not have any data in it unless you
+set up a crawler that sends its data to your server. You will need to set up your own Backblaze or S3-equivalent storage, or
+have access to the production keys, which we probably won't give you.
+
+Follow the [deployment instructions](https://github.com/mwmbl/mwmbl/wiki/Deployment)
 
 Frequently Asked Questions
 ==========================
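For step 5 of the Local Testing section above, a quick way to sanity-check the `DATABASE_URL` before starting the server. This is a sketch under one assumption: a postgres driver such as `psycopg2` is installed (it is not named anywhere in this commit).

```python
# Hypothetical pre-flight check for the DATABASE_URL from step 5 (not part of the commit).
import psycopg2  # assumption: available alongside the project's dependencies

conn = psycopg2.connect("postgres://username@")  # same URL as in step 5
print("connected, server version:", conn.server_version)
conn.close()
```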

BIN devdata/index-v2.tinysearch (new file, binary file not shown)

View file

@@ -6,6 +6,7 @@ We store them in a directory on the local machine.
 import gzip
 import json
 import os
+from logging import getLogger
 from multiprocessing.pool import ThreadPool
 from pathlib import Path
 from tempfile import NamedTemporaryFile
@@ -19,6 +20,9 @@ from mwmbl.indexer.indexdb import IndexDatabase, BatchStatus
 from mwmbl.retry import retry_requests
 
+logger = getLogger(__name__)
+
 
 class BatchCache:
     num_threads = 20
@@ -30,7 +34,11 @@ class BatchCache:
         batches = {}
         for url in batch_urls:
             path = self.get_path_from_url(url)
-            data = gzip.GzipFile(path).read()
+            try:
+                data = gzip.GzipFile(path).read()
+            except FileNotFoundError:
+                logger.exception(f"Missing batch file: {path}")
+                continue
             batch = HashedBatch.parse_raw(data)
             batches[url] = batch
         return batches
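The guard changes `get_batches` from all-or-nothing to best-effort: a URL whose cached file has vanished is logged with its traceback (that is what `logger.exception` adds over `logger.error`) and simply left out of the returned dict. A minimal standalone sketch of the same pattern, with a made-up file name:

```python
import gzip
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def read_cached(paths: list[str]) -> dict[str, bytes]:
    """Best-effort read: skip files that have gone missing, as get_batches now does."""
    data = {}
    for path in paths:
        try:
            data[path] = gzip.GzipFile(path).read()
        except FileNotFoundError:
            logger.exception("Missing batch file: %s", path)  # logs the full traceback too
    return data

print(read_cached(["no-such-file.json.gz"]))  # logs the error and returns {}
```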

View file

@@ -1,49 +0,0 @@
-"""
-Index data crawled through the Mwmbl crawler.
-"""
-import json
-from logging import getLogger
-
-import spacy
-
-from mwmbl.indexer.fsqueue import FSQueue, GzipJsonBlobSerializer, FSQueueError
-from mwmbl.indexer.index import index_titles_urls_and_extracts
-from mwmbl.indexer.paths import INDEX_PATH, MWMBL_CRAWL_TERMS_PATH, TINYSEARCH_DATA_DIR, LINK_COUNT_PATH
-from mwmbl.tinysearchengine.indexer import TinyIndex, Document, NUM_PAGES, PAGE_SIZE
-
-logger = getLogger(__name__)
-
-
-def index_mwmbl_crawl_data():
-    nlp = spacy.load("en_core_web_sm")
-    titles_urls_and_extracts = get_mwmbl_crawl_titles_urls_and_extracts()
-    link_counts = json.load(open(LINK_COUNT_PATH))
-
-    TinyIndex.create(Document, INDEX_PATH, NUM_PAGES, PAGE_SIZE)
-    with TinyIndex(Document, INDEX_PATH, 'w') as indexer:
-        index_titles_urls_and_extracts(indexer, nlp, titles_urls_and_extracts, link_counts, MWMBL_CRAWL_TERMS_PATH)
-
-
-def get_mwmbl_crawl_titles_urls_and_extracts():
-    input_queue = FSQueue(TINYSEARCH_DATA_DIR, 'mwmbl-search-items', GzipJsonBlobSerializer())
-    input_queue.unlock_all()
-    while True:
-        try:
-            next_item = input_queue.get()
-        except FSQueueError as e:
-            logger.exception(f'Error with item {e.item_id}')
-            input_queue.error(e.item_id)
-            continue
-        if next_item is None:
-            logger.info('No more items to process, stopping')
-            break
-        item_id, item_data = next_item
-        logger.info(f'Processing item {item_id}')
-        for item in item_data['items']:
-            yield item['title'], item['url'], item['extract']
-        input_queue.done(item_id)
-
-
-if __name__ == '__main__':
-    index_mwmbl_crawl_data()

View file

@@ -26,4 +26,4 @@ def run(batch_cache: BatchCache, start_status: BatchStatus, end_status: BatchSta
     process(batch_data.values())
 
-    index_db.update_batch_status([batch.url for batch in batches], end_status)
+    index_db.update_batch_status(list(batch_data.keys()), end_status)
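This pairs with the BatchCache change above: `get_batches` can now drop batches whose cached files are missing, so `batch_data` may be a strict subset of the batches originally fetched, and only the URLs that were actually loaded and processed should advance to `end_status`. A toy illustration, with made-up values:

```python
# Toy values, not from the codebase:
requested = ["u1", "u2", "u3"]              # every batch in start_status
batch_data = {"u1": b"...", "u3": b"..."}   # u2's cached file was missing
# The old list comprehension marked all of `requested`; the fix marks only
# the batches that were really processed:
assert list(batch_data.keys()) == ["u1", "u3"]
```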

View file

@@ -16,7 +16,7 @@ from mwmbl.indexer.paths import INDEX_NAME, BATCH_DIR_NAME
 from mwmbl.platform import user
 from mwmbl.tinysearchengine import search
 from mwmbl.tinysearchengine.completer import Completer
-from mwmbl.tinysearchengine.indexer import TinyIndex, Document, NUM_PAGES, PAGE_SIZE
+from mwmbl.tinysearchengine.indexer import TinyIndex, Document, PAGE_SIZE
 from mwmbl.tinysearchengine.rank import HeuristicRanker
 
 logging.basicConfig(stream=sys.stdout, level=logging.INFO)
@@ -26,9 +26,10 @@ MODEL_PATH = Path(__file__).parent / 'resources' / 'model.pickle'
 
 def setup_args():
-    parser = argparse.ArgumentParser(description="mwmbl-tinysearchengine")
-    parser.add_argument("--data", help="Path to the tinysearchengine index file", default="/app/storage/")
-    parser.add_argument("--no-background", help="Disable running the background script to collect data",
+    parser = argparse.ArgumentParser(description="Mwmbl API server and background task processor")
+    parser.add_argument("--num-pages", help="Number of pages of memory (4096 bytes) to use for the index", default=2560, type=int)
+    parser.add_argument("--data", help="Path to the data folder for storing index and cached batches", default="./devdata")
+    parser.add_argument("--background", help="Enable running the background tasks to process batches",
                         action='store_true')
     args = parser.parse_args()
     return args
@@ -40,7 +41,7 @@ def run():
     index_path = Path(args.data) / INDEX_NAME
     try:
         existing_index = TinyIndex(item_factory=Document, index_path=index_path)
-        if existing_index.page_size != PAGE_SIZE or existing_index.num_pages != NUM_PAGES:
+        if existing_index.page_size != PAGE_SIZE or existing_index.num_pages != args.num_pages:
             print(f"Existing index page size ({existing_index.page_size}) and number of pages "
                   f"({existing_index.num_pages}) do not match - removing.")
             os.remove(index_path)
@@ -50,11 +51,11 @@ def run():
     if existing_index is None:
         print("Creating a new index")
-        TinyIndex.create(item_factory=Document, index_path=index_path, num_pages=NUM_PAGES, page_size=PAGE_SIZE)
+        TinyIndex.create(item_factory=Document, index_path=index_path, num_pages=args.num_pages, page_size=PAGE_SIZE)
 
     url_queue = Queue()
 
-    if not args.no_background:
+    if args.background:
         Process(target=background.run, args=(args.data, url_queue)).start()
 
     completer = Completer()
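Taken together, the interface flips from opt-out (`--no-background`) to opt-in (`--background`) background processing, and the page count becomes configurable. A standalone reconstruction of the new parser for illustration, including the `type=int` that the `num_pages` comparison in `run()` relies on:

```python
import argparse

# Illustrative reconstruction of setup_args() above, not the module itself:
parser = argparse.ArgumentParser(description="Mwmbl API server and background task processor")
parser.add_argument("--num-pages", default=2560, type=int)
parser.add_argument("--data", default="./devdata")
parser.add_argument("--background", action='store_true')

# The Dockerfile CMD at the top of this diff resolves to:
args = parser.parse_args(["--num-pages", "10240000", "--background"])
assert args.num_pages == 10240000 and args.background and args.data == "./devdata"
```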

View file

@@ -13,7 +13,6 @@ VERSION = 1
 METADATA_CONSTANT = b'mwmbl-tiny-search'
 METADATA_SIZE = 4096
 
-NUM_PAGES = 10_240_000
 PAGE_SIZE = 4096
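With `NUM_PAGES` gone, the page count now travels with the caller (the `--num-pages` flag) and only `PAGE_SIZE` stays module-level. A minimal sketch of creating and reopening an index with an explicit page count, using the `create()` and constructor signatures visible in `run()` above; the path is made up and assumed not to exist yet:

```python
from mwmbl.tinysearchengine.indexer import TinyIndex, Document, PAGE_SIZE

# Create a small index, passing num_pages explicitly now that the module
# no longer hard-codes it (path is hypothetical):
TinyIndex.create(item_factory=Document, index_path="index.tinysearch",
                 num_pages=2560, page_size=PAGE_SIZE)

# Reopen and verify the stored geometry, the same check run() performs:
index = TinyIndex(item_factory=Document, index_path="index.tinysearch")
assert index.num_pages == 2560 and index.page_size == PAGE_SIZE
```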