
Merge pull request #18 from nitred/mwmbl-package

renamed package to mwmbl
Daoud Clarke 3 years ago
commit da8797f5ef
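In short, the two top-level packages `indexer` and `tinysearchengine` now live under a single `mwmbl` package. Every hunk below is one of two mechanical changes, illustrated by this sketch (the pattern, not any one file):

    # before: two top-level packages
    # from indexer.paths import INDEX_PATH
    # from tinysearchengine.indexer import TinyIndex

    # after: one mwmbl package
    from mwmbl.indexer.paths import INDEX_PATH
    from mwmbl.tinysearchengine.indexer import TinyIndex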

+ 1 - 1
Dockerfile

@@ -31,4 +31,4 @@ COPY data /data
 #COPY docker-entrypoint.sh wsgi.py ./
 #CMD ["./docker-entrypoint.sh"]

-CMD ["/venv/bin/python", "-m", "tinysearchengine.app", "/data/index.tinysearch"]
+CMD ["/venv/bin/python", "-m", "mwmbl.tinysearchengine.app", "/data/index.tinysearch"]
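The container entry point changes only in the module path handed to `-m`. As a rough Python-level illustration (not part of this PR), `python -m pkg.mod args` behaves approximately like:

    import runpy
    import sys

    # roughly what `python -m mwmbl.tinysearchengine.app /data/index.tinysearch` does:
    sys.argv = ["mwmbl.tinysearchengine.app", "/data/index.tinysearch"]
    runpy.run_module("mwmbl.tinysearchengine.app", run_name="__main__")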

+ 2 - 2
analyse/inspect_index.py

@@ -1,5 +1,5 @@
-from tinysearchengine.indexer import TinyIndex, NUM_PAGES, PAGE_SIZE, Document
-from indexer.paths import INDEX_PATH
+from mwmbl.tinysearchengine.indexer import TinyIndex, NUM_PAGES, PAGE_SIZE, Document
+from mwmbl.indexer.paths import INDEX_PATH


 def get_items():

+ 2 - 2
analyse/make_curl.py

@@ -5,8 +5,8 @@ import os
 from itertools import islice
 from urllib.parse import quote

-from indexer.paths import DATA_DIR
-from indexer.wiki import get_wiki_titles_and_urls
+from mwmbl.indexer.paths import DATA_DIR
+from mwmbl.indexer.wiki import get_wiki_titles_and_urls

 URL_TEMPLATE = "http://localhost:8000/complete?q={}"
 CURL_FILE = os.path.join(DATA_DIR, "urls.curl")

+ 5 - 5
analyse/performance.py

@@ -8,11 +8,11 @@ import numpy as np
 from spacy.lang.en import English
 from starlette.testclient import TestClient

-from tinysearchengine import create_app
-from indexer.fsqueue import ZstdJsonSerializer
-from indexer.index import index_titles_urls_and_extracts
-from tinysearchengine.indexer import TinyIndex, TinyIndexer, Document
-from indexer.paths import TEST_INDEX_PATH, DATA_DIR, TEST_TERMS_PATH
+from mwmbl.tinysearchengine import create_app
+from mwmbl.indexer.fsqueue import ZstdJsonSerializer
+from mwmbl.indexer.index import index_titles_urls_and_extracts
+from mwmbl.tinysearchengine.indexer import TinyIndex, TinyIndexer, Document
+from mwmbl.indexer.paths import TEST_INDEX_PATH, DATA_DIR, TEST_TERMS_PATH

 NUM_DOCUMENTS = 30000
 NUM_PAGES_FOR_STATS = 10

+ 0 - 0
indexer/__init__.py → mwmbl/__init__.py


+ 0 - 0
indexer/domains/__init__.py → mwmbl/indexer/__init__.py


+ 0 - 0
indexer/bootstrap.sh → mwmbl/indexer/bootstrap.sh


+ 1 - 1
indexer/crawl.py → mwmbl/indexer/crawl.py

@@ -10,7 +10,7 @@ from traceback import print_tb, print_exc
 import pandas as pd
 import requests

-from paths import DATA_DIR, HN_TOP_PATH, CRAWL_PREFIX
+from .paths import DATA_DIR, HN_TOP_PATH, CRAWL_PREFIX


 def crawl():
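Note the two import styles used across these hunks: modules inside `mwmbl.indexer` switch to explicit relative imports, while cross-package references spell out the full `mwmbl` prefix. A minimal sketch of the equivalence, as seen from inside mwmbl/indexer/crawl.py:

    # the leading dot resolves against the containing package (mwmbl.indexer),
    # so these two lines import the same module:
    from .paths import DATA_DIR
    from mwmbl.indexer.paths import DATA_DIR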

+ 0 - 0
indexer/deploy.sh → mwmbl/indexer/deploy.sh


+ 0 - 0
indexer/domains.py → mwmbl/indexer/domains.py


+ 0 - 0
tinysearchengine/__init__.py → mwmbl/indexer/domains/__init__.py


+ 2 - 2
indexer/domains/domain_titles.py → mwmbl/indexer/domains/domain_titles.py

@@ -8,8 +8,8 @@ from urllib.parse import urlsplit, urlunsplit
 import bs4
 import requests

-from indexer.fsqueue import FSQueue, ZstdJsonSerializer
-from indexer.paths import DATA_DIR, DOMAINS_QUEUE_NAME, DOMAINS_TITLES_QUEUE_NAME
+from mwmbl.indexer.fsqueue import FSQueue, ZstdJsonSerializer
+from mwmbl.indexer.paths import DATA_DIR, DOMAINS_QUEUE_NAME, DOMAINS_TITLES_QUEUE_NAME

 NUM_PROCESSES = 10

+ 2 - 2
indexer/domains/queue_domains.py → mwmbl/indexer/domains/queue_domains.py

@@ -4,8 +4,8 @@ Add domains to the queue to be retrieved
 import csv
 import gzip

-from indexer.fsqueue import FSQueue, ZstdJsonSerializer
-from indexer.paths import DOMAINS_PATH, DOMAINS_QUEUE_NAME, DATA_DIR
+from mwmbl.indexer.fsqueue import FSQueue, ZstdJsonSerializer
+from mwmbl.indexer.paths import DOMAINS_PATH, DOMAINS_QUEUE_NAME, DATA_DIR

 BATCH_SIZE = 250

+ 0 - 0
indexer/extract.py → mwmbl/indexer/extract.py


+ 3 - 3
indexer/extract_local.py → mwmbl/indexer/extract_local.py

@@ -4,9 +4,9 @@ import os
 from glob import glob
 from multiprocessing import Process, Lock

-from extract_process import fetch_process_warc_records
-from fsqueue import FSQueue, GzipJsonRowSerializer
-from paths import DATA_DIR
+from .extract_process import fetch_process_warc_records
+from .fsqueue import FSQueue, GzipJsonRowSerializer
+from .paths import DATA_DIR

 ARCHIVE_INFO_GLOB = 'outputs/records/*.gz'

+ 0 - 0
indexer/extract_process.py → mwmbl/indexer/extract_process.py


+ 0 - 0
indexer/fsqueue.py → mwmbl/indexer/fsqueue.py


+ 0 - 0
indexer/hn-top-domains-filtered.py → mwmbl/indexer/hn-top-domains-filtered.py


+ 1 - 1
indexer/index.py → mwmbl/indexer/index.py

@@ -10,7 +10,7 @@ import pandas as pd

 # NUM_PAGES = 8192
 # PAGE_SIZE = 512
-from tinysearchengine.indexer import TinyIndexer, Document, TokenizedDocument
+from mwmbl.tinysearchengine.indexer import TinyIndexer, Document, TokenizedDocument

 NUM_INITIAL_TOKENS = 50

+ 4 - 3
indexer/index_glob.py → mwmbl/indexer/index_glob.py

@@ -4,12 +4,13 @@ from glob import glob
 import bs4
 from spacy.lang.en import English

-from index import tokenize
-from tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE
-from paths import INDEX_PATH, CRAWL_GLOB
+from .index import tokenize
+from mwmbl.tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE
+from .paths import INDEX_PATH, CRAWL_GLOB


 def run():
+    # TODO: item_factory argument is unfilled.
     indexer = TinyIndexer(INDEX_PATH, NUM_PAGES, PAGE_SIZE)
     indexer.create_if_not_exists()
     nlp = English()
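The new TODO flags a pre-existing bug: TinyIndexer is constructed without its first argument. Judging from how TinyIndex is built in mwmbl/tinysearchengine/app.py further down, the fix would presumably look like this (an assumption, not part of this PR):

    # hypothetical fix for the TODO, mirroring TinyIndex's constructor:
    indexer = TinyIndexer(Document, INDEX_PATH, NUM_PAGES, PAGE_SIZE)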

+ 4 - 4
indexer/index_queue.py → mwmbl/indexer/index_queue.py

@@ -3,10 +3,10 @@ Index items in the file-system queue
 """
 """
 from spacy.lang.en import English
 from spacy.lang.en import English
 
 
-from fsqueue import FSQueue, ZstdJsonSerializer
-from index import index_titles_urls_and_extracts
-from tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE
-from paths import DATA_DIR, DOMAINS_TITLES_QUEUE_NAME, INDEX_PATH
+from .fsqueue import FSQueue, ZstdJsonSerializer
+from .index import index_titles_urls_and_extracts
+from mwmbl.tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE
+from .paths import DATA_DIR, DOMAINS_TITLES_QUEUE_NAME, INDEX_PATH
 
 
 
 
 def get_queue_items():
 def get_queue_items():

+ 4 - 4
indexer/indexcc.py → mwmbl/indexer/indexcc.py

@@ -7,10 +7,10 @@ from logging import getLogger

 import spacy

-from fsqueue import FSQueue, GzipJsonRowSerializer, FSQueueError
-from index import index_titles_urls_and_extracts
-from tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE, Document
-from paths import INDEX_PATH, DATA_DIR, COMMON_CRAWL_TERMS_PATH
+from .fsqueue import FSQueue, GzipJsonRowSerializer, FSQueueError
+from .index import index_titles_urls_and_extracts
+from mwmbl.tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE, Document
+from .paths import INDEX_PATH, DATA_DIR, COMMON_CRAWL_TERMS_PATH


 logging.basicConfig(level=logging.DEBUG, stream=sys.stdout)

+ 0 - 0
indexer/paths.py → mwmbl/indexer/paths.py


+ 3 - 3
indexer/wiki.py → mwmbl/indexer/wiki.py

@@ -7,9 +7,9 @@ from urllib.parse import quote

 from spacy.lang.en import English

-from indexer.index import index_titles_urls_and_extracts
-from tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE
-from indexer.paths import WIKI_TITLES_PATH, INDEX_PATH
+from .index import index_titles_urls_and_extracts
+from mwmbl.tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE
+from .paths import WIKI_TITLES_PATH, INDEX_PATH

 TEXT_TAGS = ['mediawiki', 'page', 'revision', 'text']
 TITLE_START = '<title>Wikipedia: '

+ 0 - 0
mwmbl/tinysearchengine/__init__.py


+ 17 - 0
mwmbl/tinysearchengine/app.py

@@ -0,0 +1,17 @@
+import logging
+import sys
+
+import uvicorn
+
+from mwmbl.tinysearchengine import create_app
+from mwmbl.tinysearchengine.indexer import TinyIndex, NUM_PAGES, PAGE_SIZE, Document
+
+logging.basicConfig()
+
+
+index_path = sys.argv[1]
+tiny_index = TinyIndex(Document, index_path, NUM_PAGES, PAGE_SIZE)
+app = create_app.create(tiny_index)
+
+if __name__ == "__main__":
+    uvicorn.run("mwmbl.tinysearchengine.app:app", host="0.0.0.0", port=8080, log_level="info")
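Because this module builds `app` at import time, uvicorn can load it from the dotted path, and it can also be constructed directly. This sketch mirrors the module body above; the index path is illustrative, matching the Dockerfile CMD:

    from mwmbl.tinysearchengine import create_app
    from mwmbl.tinysearchengine.indexer import TinyIndex, NUM_PAGES, PAGE_SIZE, Document

    # same construction as the module body, with an explicit path
    tiny_index = TinyIndex(Document, "/data/index.tinysearch", NUM_PAGES, PAGE_SIZE)
    app = create_app.create(tiny_index)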

+ 1 - 1
tinysearchengine/create_app.py → mwmbl/tinysearchengine/create_app.py

@@ -7,7 +7,7 @@ from fastapi import FastAPI
 from starlette.responses import FileResponse
 from starlette.staticfiles import StaticFiles

-from tinysearchengine.indexer import TinyIndex, Document
+from mwmbl.tinysearchengine.indexer import TinyIndex, Document

 logger = getLogger(__name__)

+ 1 - 1
tinysearchengine/indexer.py → mwmbl/tinysearchengine/indexer.py

@@ -6,7 +6,7 @@ from pathlib import Path
 from typing import TypeVar, Generic, Callable, List

 import mmh3
-from zstandard import ZstdDecompressor
+from zstandard import ZstdDecompressor, ZstdCompressor, ZstdError


 NUM_PAGES = 25600
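The widened zstandard import suggests the indexer compresses pages on write and tolerates unreadable pages via ZstdError on read. This is not the indexer's code, just a minimal sketch of that library pattern:

    from zstandard import ZstdCompressor, ZstdDecompressor, ZstdError

    page = ZstdCompressor().compress(b'[["title", "url"]]')
    try:
        items = ZstdDecompressor().decompress(page)
    except ZstdError:
        items = None  # treat a corrupt or empty page as missing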

+ 0 - 0
tinysearchengine/static/index.css → mwmbl/tinysearchengine/static/index.css


+ 0 - 0
tinysearchengine/static/index.html → mwmbl/tinysearchengine/static/index.html


+ 0 - 0
tinysearchengine/static/index.js → mwmbl/tinysearchengine/static/index.js


+ 0 - 0
tinysearchengine/static/landing.html → mwmbl/tinysearchengine/static/landing.html


+ 0 - 0
tinysearchengine/static/plugin.xml → mwmbl/tinysearchengine/static/plugin.xml


+ 0 - 0
tinysearchengine/static/search.html → mwmbl/tinysearchengine/static/search.html


+ 0 - 0
tinysearchengine/static/typeahead.css → mwmbl/tinysearchengine/static/typeahead.css


+ 0 - 0
tinysearchengine/static/typeahead.js → mwmbl/tinysearchengine/static/typeahead.js


+ 1 - 1
pyproject.toml

@@ -1,5 +1,5 @@
 [tool.poetry]
-name = "tinysearchengine"
+name = "mwmbl"
 version = "0.1.0"
 description = ""
 authors = ["Daoud Clarke <daoud.clarke@gmail.com>"]

+ 0 - 17
tinysearchengine/app.py

@@ -1,17 +0,0 @@
-import logging
-import sys
-
-import uvicorn
-
-from tinysearchengine import create_app
-from tinysearchengine.indexer import TinyIndex, NUM_PAGES, PAGE_SIZE, Document
-
-logging.basicConfig()
-
-
-index_path = sys.argv[1]
-tiny_index = TinyIndex(Document, index_path, NUM_PAGES, PAGE_SIZE)
-app = create_app.create(tiny_index)
-
-if __name__ == "__main__":
-    uvicorn.run("tinysearchengine.app:app", host="0.0.0.0", port=8080, log_level="info")