Move indexer code to a separate package
This commit is contained in:
parent
8cfb8b7a44
commit
baede32298
22 changed files with 12 additions and 12 deletions
|
@@ -1,5 +1,5 @@
|
|||
from tinysearchengine.indexer import TinyIndex, NUM_PAGES, PAGE_SIZE, Document
|
||||
from paths import INDEX_PATH
|
||||
from indexer.paths import INDEX_PATH
|
||||
|
||||
|
||||
def get_items():
|
||||
|
|
|
@@ -5,8 +5,8 @@ import os
|
|||
from itertools import islice
|
||||
from urllib.parse import quote
|
||||
|
||||
from paths import DATA_DIR
|
||||
from wiki import get_wiki_titles_and_urls
|
||||
from indexer.paths import DATA_DIR
|
||||
from indexer.wiki import get_wiki_titles_and_urls
|
||||
|
||||
URL_TEMPLATE = "http://localhost:8000/complete?q={}"
|
||||
CURL_FILE = os.path.join(DATA_DIR, "urls.curl")
|
|
@@ -9,10 +9,10 @@ from spacy.lang.en import English
|
|||
from starlette.testclient import TestClient
|
||||
|
||||
from tinysearchengine import create_app
|
||||
from fsqueue import ZstdJsonSerializer
|
||||
from index import index_titles_urls_and_extracts
|
||||
from indexer.fsqueue import ZstdJsonSerializer
|
||||
from indexer.index import index_titles_urls_and_extracts
|
||||
from tinysearchengine.indexer import TinyIndex, TinyIndexer, Document
|
||||
from paths import TEST_INDEX_PATH, DATA_DIR, TEST_TERMS_PATH
|
||||
from indexer.paths import TEST_INDEX_PATH, DATA_DIR, TEST_TERMS_PATH
|
||||
|
||||
NUM_DOCUMENTS = 30000
|
||||
NUM_PAGES_FOR_STATS = 10
|
0
indexer/domains/__init__.py
Normal file
0
indexer/domains/__init__.py
Normal file
|
@@ -8,8 +8,8 @@ from urllib.parse import urlsplit, urlunsplit
|
|||
import bs4
|
||||
import requests
|
||||
|
||||
from fsqueue import FSQueue, ZstdJsonSerializer
|
||||
from paths import DATA_DIR, DOMAINS_QUEUE_NAME, DOMAINS_TITLES_QUEUE_NAME
|
||||
from indexer.fsqueue import FSQueue, ZstdJsonSerializer
|
||||
from indexer.paths import DATA_DIR, DOMAINS_QUEUE_NAME, DOMAINS_TITLES_QUEUE_NAME
|
||||
|
||||
NUM_PROCESSES = 10
|
||||
|
|
@@ -4,8 +4,8 @@ Add domains to the queue to be retrieved
|
|||
import csv
|
||||
import gzip
|
||||
|
||||
from fsqueue import FSQueue, ZstdJsonSerializer
|
||||
from paths import DOMAINS_PATH, DOMAINS_QUEUE_NAME, DATA_DIR
|
||||
from indexer.fsqueue import FSQueue, ZstdJsonSerializer
|
||||
from indexer.paths import DOMAINS_PATH, DOMAINS_QUEUE_NAME, DATA_DIR
|
||||
|
||||
BATCH_SIZE = 250
|
||||
|
|
@@ -7,9 +7,9 @@ from urllib.parse import quote
|
|||
|
||||
from spacy.lang.en import English
|
||||
|
||||
from index import index_titles_urls_and_extracts
|
||||
from indexer.index import index_titles_urls_and_extracts
|
||||
from tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE
|
||||
from paths import WIKI_TITLES_PATH, INDEX_PATH
|
||||
from indexer.paths import WIKI_TITLES_PATH, INDEX_PATH
|
||||
|
||||
TEXT_TAGS = ['mediawiki', 'page', 'revision', 'text']
|
||||
TITLE_START = '<title>Wikipedia: '
|
Loading…
Reference in a new issue