Move indexer code to a separate package
parent 8cfb8b7a44
commit baede32298
22 changed files with 12 additions and 12 deletions
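Roughly the layout this commit implies, inferred from the import changes below (the full set of moved files is not shown here, so treat this as a sketch rather than the definitive tree):

    indexer/
        __init__.py        # assumed, so the package is importable
        domains/
            __init__.py    # added in this commit (empty)
        fsqueue.py
        index.py
        paths.py
        wiki.py
    tinysearchengine/
        indexer.py         # unchanged; still imported as tinysearchengine.indexer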
@@ -1,5 +1,5 @@
 from tinysearchengine.indexer import TinyIndex, NUM_PAGES, PAGE_SIZE, Document
-from paths import INDEX_PATH
+from indexer.paths import INDEX_PATH
 
 
 def get_items():
@@ -5,8 +5,8 @@ import os
 from itertools import islice
 from urllib.parse import quote
 
-from paths import DATA_DIR
-from wiki import get_wiki_titles_and_urls
+from indexer.paths import DATA_DIR
+from indexer.wiki import get_wiki_titles_and_urls
 
 URL_TEMPLATE = "http://localhost:8000/complete?q={}"
 CURL_FILE = os.path.join(DATA_DIR, "urls.curl")
@@ -9,10 +9,10 @@ from spacy.lang.en import English
 from starlette.testclient import TestClient
 
 from tinysearchengine import create_app
-from fsqueue import ZstdJsonSerializer
-from index import index_titles_urls_and_extracts
+from indexer.fsqueue import ZstdJsonSerializer
+from indexer.index import index_titles_urls_and_extracts
 from tinysearchengine.indexer import TinyIndex, TinyIndexer, Document
-from paths import TEST_INDEX_PATH, DATA_DIR, TEST_TERMS_PATH
+from indexer.paths import TEST_INDEX_PATH, DATA_DIR, TEST_TERMS_PATH
 
 NUM_DOCUMENTS = 30000
 NUM_PAGES_FOR_STATS = 10
indexer/domains/__init__.py  (new empty file)
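The empty indexer/domains/__init__.py added above marks indexer.domains as a regular Python package, which is what lets the dotted imports used throughout this commit resolve. A minimal check:

    # with the empty __init__.py in place, the subpackage imports cleanly
    import indexer.domains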
@@ -8,8 +8,8 @@ from urllib.parse import urlsplit, urlunsplit
 import bs4
 import requests
 
-from fsqueue import FSQueue, ZstdJsonSerializer
-from paths import DATA_DIR, DOMAINS_QUEUE_NAME, DOMAINS_TITLES_QUEUE_NAME
+from indexer.fsqueue import FSQueue, ZstdJsonSerializer
+from indexer.paths import DATA_DIR, DOMAINS_QUEUE_NAME, DOMAINS_TITLES_QUEUE_NAME
 
 NUM_PROCESSES = 10
 
@@ -4,8 +4,8 @@ Add domains to the queue to be retrieved
 import csv
 import gzip
 
-from fsqueue import FSQueue, ZstdJsonSerializer
-from paths import DOMAINS_PATH, DOMAINS_QUEUE_NAME, DATA_DIR
+from indexer.fsqueue import FSQueue, ZstdJsonSerializer
+from indexer.paths import DOMAINS_PATH, DOMAINS_QUEUE_NAME, DATA_DIR
 
 BATCH_SIZE = 250
 
@@ -7,9 +7,9 @@ from urllib.parse import quote
 
 from spacy.lang.en import English
 
-from index import index_titles_urls_and_extracts
+from indexer.index import index_titles_urls_and_extracts
 from tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE
-from paths import WIKI_TITLES_PATH, INDEX_PATH
+from indexer.paths import WIKI_TITLES_PATH, INDEX_PATH
 
 TEXT_TAGS = ['mediawiki', 'page', 'revision', 'text']
 TITLE_START = '<title>Wikipedia: '
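One practical consequence of the move: the rewritten absolute imports (indexer.paths, indexer.fsqueue, and so on) only resolve when the repository root is on sys.path, e.g. when scripts run from the root. A minimal sketch, using only module names that appear in the diff (the /path/to/repo placeholder is an assumption):

    import sys
    sys.path.insert(0, "/path/to/repo")  # assumption: repository root

    # these now resolve through the indexer package
    from indexer.paths import DATA_DIR, INDEX_PATH
    from indexer.fsqueue import FSQueue, ZstdJsonSerializer

    # the tinysearchengine package is unaffected by the move
    from tinysearchengine.indexer import TinyIndex, NUM_PAGES, PAGE_SIZE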