Pārlūkot izejas kodu

Merge pull request #18 from nitred/mwmbl-package

renamed package to mwmbl
Daoud Clarke 3 gadi atpakaļ
vecāks
revīzija
da8797f5ef
38 mainīti faili ar 54 papildinājumiem un 53 dzēšanām
  1. 1 1
      Dockerfile
  2. 2 2
      analyse/inspect_index.py
  3. 2 2
      analyse/make_curl.py
  4. 5 5
      analyse/performance.py
  5. 0 0
      mwmbl/__init__.py
  6. 0 0
      mwmbl/indexer/__init__.py
  7. 0 0
      mwmbl/indexer/bootstrap.sh
  8. 1 1
      mwmbl/indexer/crawl.py
  9. 0 0
      mwmbl/indexer/deploy.sh
  10. 0 0
      mwmbl/indexer/domains.py
  11. 0 0
      mwmbl/indexer/domains/__init__.py
  12. 2 2
      mwmbl/indexer/domains/domain_titles.py
  13. 2 2
      mwmbl/indexer/domains/queue_domains.py
  14. 0 0
      mwmbl/indexer/extract.py
  15. 3 3
      mwmbl/indexer/extract_local.py
  16. 0 0
      mwmbl/indexer/extract_process.py
  17. 0 0
      mwmbl/indexer/fsqueue.py
  18. 0 0
      mwmbl/indexer/hn-top-domains-filtered.py
  19. 1 1
      mwmbl/indexer/index.py
  20. 4 3
      mwmbl/indexer/index_glob.py
  21. 4 4
      mwmbl/indexer/index_queue.py
  22. 4 4
      mwmbl/indexer/indexcc.py
  23. 0 0
      mwmbl/indexer/paths.py
  24. 3 3
      mwmbl/indexer/wiki.py
  25. 0 0
      mwmbl/tinysearchengine/__init__.py
  26. 17 0
      mwmbl/tinysearchengine/app.py
  27. 1 1
      mwmbl/tinysearchengine/create_app.py
  28. 1 1
      mwmbl/tinysearchengine/indexer.py
  29. 0 0
      mwmbl/tinysearchengine/static/index.css
  30. 0 0
      mwmbl/tinysearchengine/static/index.html
  31. 0 0
      mwmbl/tinysearchengine/static/index.js
  32. 0 0
      mwmbl/tinysearchengine/static/landing.html
  33. 0 0
      mwmbl/tinysearchengine/static/plugin.xml
  34. 0 0
      mwmbl/tinysearchengine/static/search.html
  35. 0 0
      mwmbl/tinysearchengine/static/typeahead.css
  36. 0 0
      mwmbl/tinysearchengine/static/typeahead.js
  37. 1 1
      pyproject.toml
  38. 0 17
      tinysearchengine/app.py

+ 1 - 1
Dockerfile

@@ -31,4 +31,4 @@ COPY data /data
 #COPY docker-entrypoint.sh wsgi.py ./
 #CMD ["./docker-entrypoint.sh"]
 
-CMD ["/venv/bin/python", "-m", "tinysearchengine.app", "/data/index.tinysearch"]
+CMD ["/venv/bin/python", "-m", "mwmbl.tinysearchengine.app", "/data/index.tinysearch"]

+ 2 - 2
analyse/inspect_index.py

@@ -1,5 +1,5 @@
-from tinysearchengine.indexer import TinyIndex, NUM_PAGES, PAGE_SIZE, Document
-from indexer.paths import INDEX_PATH
+from mwmbl.tinysearchengine.indexer import TinyIndex, NUM_PAGES, PAGE_SIZE, Document
+from mwmbl.indexer.paths import INDEX_PATH
 
 
 def get_items():

+ 2 - 2
analyse/make_curl.py

@@ -5,8 +5,8 @@ import os
 from itertools import islice
 from urllib.parse import quote
 
-from indexer.paths import DATA_DIR
-from indexer.wiki import get_wiki_titles_and_urls
+from mwmbl.indexer.paths import DATA_DIR
+from mwmbl.indexer.wiki import get_wiki_titles_and_urls
 
 URL_TEMPLATE = "http://localhost:8000/complete?q={}"
 CURL_FILE = os.path.join(DATA_DIR, "urls.curl")

+ 5 - 5
analyse/performance.py

@@ -8,11 +8,11 @@ import numpy as np
 from spacy.lang.en import English
 from starlette.testclient import TestClient
 
-from tinysearchengine import create_app
-from indexer.fsqueue import ZstdJsonSerializer
-from indexer.index import index_titles_urls_and_extracts
-from tinysearchengine.indexer import TinyIndex, TinyIndexer, Document
-from indexer.paths import TEST_INDEX_PATH, DATA_DIR, TEST_TERMS_PATH
+from mwmbl.tinysearchengine import create_app
+from mwmbl.indexer.fsqueue import ZstdJsonSerializer
+from mwmbl.indexer.index import index_titles_urls_and_extracts
+from mwmbl.tinysearchengine.indexer import TinyIndex, TinyIndexer, Document
+from mwmbl.indexer.paths import TEST_INDEX_PATH, DATA_DIR, TEST_TERMS_PATH
 
 NUM_DOCUMENTS = 30000
 NUM_PAGES_FOR_STATS = 10

+ 0 - 0
indexer/__init__.py → mwmbl/__init__.py


+ 0 - 0
indexer/domains/__init__.py → mwmbl/indexer/__init__.py


+ 0 - 0
indexer/bootstrap.sh → mwmbl/indexer/bootstrap.sh


+ 1 - 1
indexer/crawl.py → mwmbl/indexer/crawl.py

@@ -10,7 +10,7 @@ from traceback import print_tb, print_exc
 import pandas as pd
 import requests
 
-from paths import DATA_DIR, HN_TOP_PATH, CRAWL_PREFIX
+from .paths import DATA_DIR, HN_TOP_PATH, CRAWL_PREFIX
 
 
 def crawl():

+ 0 - 0
indexer/deploy.sh → mwmbl/indexer/deploy.sh


+ 0 - 0
indexer/domains.py → mwmbl/indexer/domains.py


+ 0 - 0
tinysearchengine/__init__.py → mwmbl/indexer/domains/__init__.py


+ 2 - 2
indexer/domains/domain_titles.py → mwmbl/indexer/domains/domain_titles.py

@@ -8,8 +8,8 @@ from urllib.parse import urlsplit, urlunsplit
 import bs4
 import requests
 
-from indexer.fsqueue import FSQueue, ZstdJsonSerializer
-from indexer.paths import DATA_DIR, DOMAINS_QUEUE_NAME, DOMAINS_TITLES_QUEUE_NAME
+from mwmbl.indexer.fsqueue import FSQueue, ZstdJsonSerializer
+from mwmbl.indexer.paths import DATA_DIR, DOMAINS_QUEUE_NAME, DOMAINS_TITLES_QUEUE_NAME
 
 NUM_PROCESSES = 10
 

+ 2 - 2
indexer/domains/queue_domains.py → mwmbl/indexer/domains/queue_domains.py

@@ -4,8 +4,8 @@ Add domains to the queue to be retrieved
 import csv
 import gzip
 
-from indexer.fsqueue import FSQueue, ZstdJsonSerializer
-from indexer.paths import DOMAINS_PATH, DOMAINS_QUEUE_NAME, DATA_DIR
+from mwmbl.indexer.fsqueue import FSQueue, ZstdJsonSerializer
+from mwmbl.indexer.paths import DOMAINS_PATH, DOMAINS_QUEUE_NAME, DATA_DIR
 
 BATCH_SIZE = 250
 

+ 0 - 0
indexer/extract.py → mwmbl/indexer/extract.py


+ 3 - 3
indexer/extract_local.py → mwmbl/indexer/extract_local.py

@@ -4,9 +4,9 @@ import os
 from glob import glob
 from multiprocessing import Process, Lock
 
-from extract_process import fetch_process_warc_records
-from fsqueue import FSQueue, GzipJsonRowSerializer
-from paths import DATA_DIR
+from .extract_process import fetch_process_warc_records
+from .fsqueue import FSQueue, GzipJsonRowSerializer
+from .paths import DATA_DIR
 
 ARCHIVE_INFO_GLOB = 'outputs/records/*.gz'
 

+ 0 - 0
indexer/extract_process.py → mwmbl/indexer/extract_process.py


+ 0 - 0
indexer/fsqueue.py → mwmbl/indexer/fsqueue.py


+ 0 - 0
indexer/hn-top-domains-filtered.py → mwmbl/indexer/hn-top-domains-filtered.py


+ 1 - 1
indexer/index.py → mwmbl/indexer/index.py

@@ -10,7 +10,7 @@ import pandas as pd
 
 # NUM_PAGES = 8192
 # PAGE_SIZE = 512
-from tinysearchengine.indexer import TinyIndexer, Document, TokenizedDocument
+from mwmbl.tinysearchengine.indexer import TinyIndexer, Document, TokenizedDocument
 
 NUM_INITIAL_TOKENS = 50
 

+ 4 - 3
indexer/index_glob.py → mwmbl/indexer/index_glob.py

@@ -4,12 +4,13 @@ from glob import glob
 import bs4
 from spacy.lang.en import English
 
-from index import tokenize
-from tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE
-from paths import INDEX_PATH, CRAWL_GLOB
+from .index import tokenize
+from mwmbl.tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE
+from .paths import INDEX_PATH, CRAWL_GLOB
 
 
 def run():
+    # TODO: item_factory argument is unfilled.
     indexer = TinyIndexer(INDEX_PATH, NUM_PAGES, PAGE_SIZE)
     indexer.create_if_not_exists()
     nlp = English()

+ 4 - 4
indexer/index_queue.py → mwmbl/indexer/index_queue.py

@@ -3,10 +3,10 @@ Index items in the file-system queue
 """
 from spacy.lang.en import English
 
-from fsqueue import FSQueue, ZstdJsonSerializer
-from index import index_titles_urls_and_extracts
-from tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE
-from paths import DATA_DIR, DOMAINS_TITLES_QUEUE_NAME, INDEX_PATH
+from .fsqueue import FSQueue, ZstdJsonSerializer
+from .index import index_titles_urls_and_extracts
+from mwmbl.tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE
+from .paths import DATA_DIR, DOMAINS_TITLES_QUEUE_NAME, INDEX_PATH
 
 
 def get_queue_items():

+ 4 - 4
indexer/indexcc.py → mwmbl/indexer/indexcc.py

@@ -7,10 +7,10 @@ from logging import getLogger
 
 import spacy
 
-from fsqueue import FSQueue, GzipJsonRowSerializer, FSQueueError
-from index import index_titles_urls_and_extracts
-from tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE, Document
-from paths import INDEX_PATH, DATA_DIR, COMMON_CRAWL_TERMS_PATH
+from .fsqueue import FSQueue, GzipJsonRowSerializer, FSQueueError
+from .index import index_titles_urls_and_extracts
+from mwmbl.tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE, Document
+from .paths import INDEX_PATH, DATA_DIR, COMMON_CRAWL_TERMS_PATH
 
 
 logging.basicConfig(level=logging.DEBUG, stream=sys.stdout)

+ 0 - 0
indexer/paths.py → mwmbl/indexer/paths.py


+ 3 - 3
indexer/wiki.py → mwmbl/indexer/wiki.py

@@ -7,9 +7,9 @@ from urllib.parse import quote
 
 from spacy.lang.en import English
 
-from indexer.index import index_titles_urls_and_extracts
-from tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE
-from indexer.paths import WIKI_TITLES_PATH, INDEX_PATH
+from .index import index_titles_urls_and_extracts
+from mwmbl.tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE
+from .paths import WIKI_TITLES_PATH, INDEX_PATH
 
 TEXT_TAGS = ['mediawiki', 'page', 'revision', 'text']
 TITLE_START = '<title>Wikipedia: '

+ 0 - 0
mwmbl/tinysearchengine/__init__.py


+ 17 - 0
mwmbl/tinysearchengine/app.py

@@ -0,0 +1,17 @@
+import logging
+import sys
+
+import uvicorn
+
+from mwmbl.tinysearchengine import create_app
+from mwmbl.tinysearchengine.indexer import TinyIndex, NUM_PAGES, PAGE_SIZE, Document
+
+logging.basicConfig()
+
+
+index_path = sys.argv[1]
+tiny_index = TinyIndex(Document, index_path, NUM_PAGES, PAGE_SIZE)
+app = create_app.create(tiny_index)
+
+if __name__ == "__main__":
+    uvicorn.run("mwmbl.tinysearchengine.app:app", host="0.0.0.0", port=8080, log_level="info")

+ 1 - 1
tinysearchengine/create_app.py → mwmbl/tinysearchengine/create_app.py

@@ -7,7 +7,7 @@ from fastapi import FastAPI
 from starlette.responses import FileResponse
 from starlette.staticfiles import StaticFiles
 
-from tinysearchengine.indexer import TinyIndex, Document
+from mwmbl.tinysearchengine.indexer import TinyIndex, Document
 
 logger = getLogger(__name__)
 

+ 1 - 1
tinysearchengine/indexer.py → mwmbl/tinysearchengine/indexer.py

@@ -6,7 +6,7 @@ from pathlib import Path
 from typing import TypeVar, Generic, Callable, List
 
 import mmh3
-from zstandard import ZstdDecompressor
+from zstandard import ZstdDecompressor, ZstdCompressor, ZstdError
 
 
 NUM_PAGES = 25600

+ 0 - 0
tinysearchengine/static/index.css → mwmbl/tinysearchengine/static/index.css


+ 0 - 0
tinysearchengine/static/index.html → mwmbl/tinysearchengine/static/index.html


+ 0 - 0
tinysearchengine/static/index.js → mwmbl/tinysearchengine/static/index.js


+ 0 - 0
tinysearchengine/static/landing.html → mwmbl/tinysearchengine/static/landing.html


+ 0 - 0
tinysearchengine/static/plugin.xml → mwmbl/tinysearchengine/static/plugin.xml


+ 0 - 0
tinysearchengine/static/search.html → mwmbl/tinysearchengine/static/search.html


+ 0 - 0
tinysearchengine/static/typeahead.css → mwmbl/tinysearchengine/static/typeahead.css


+ 0 - 0
tinysearchengine/static/typeahead.js → mwmbl/tinysearchengine/static/typeahead.js


+ 1 - 1
pyproject.toml

@@ -1,5 +1,5 @@
 [tool.poetry]
-name = "tinysearchengine"
+name = "mwmbl"
 version = "0.1.0"
 description = ""
 authors = ["Daoud Clarke <daoud.clarke@gmail.com>"]

+ 0 - 17
tinysearchengine/app.py

@@ -1,17 +0,0 @@
-import logging
-import sys
-
-import uvicorn
-
-from tinysearchengine import create_app
-from tinysearchengine.indexer import TinyIndex, NUM_PAGES, PAGE_SIZE, Document
-
-logging.basicConfig()
-
-
-index_path = sys.argv[1]
-tiny_index = TinyIndex(Document, index_path, NUM_PAGES, PAGE_SIZE)
-app = create_app.create(tiny_index)
-
-if __name__ == "__main__":
-    uvicorn.run("tinysearchengine.app:app", host="0.0.0.0", port=8080, log_level="info")