Move indexer code to a separate package

This commit is contained in:
Daoud Clarke 2021-12-26 08:55:09 +00:00
parent 8cfb8b7a44
commit baede32298
22 changed files with 12 additions and 12 deletions

View file

@@ -1,5 +1,5 @@
from tinysearchengine.indexer import TinyIndex, NUM_PAGES, PAGE_SIZE, Document from tinysearchengine.indexer import TinyIndex, NUM_PAGES, PAGE_SIZE, Document
from paths import INDEX_PATH from indexer.paths import INDEX_PATH
def get_items(): def get_items():

View file

@@ -5,8 +5,8 @@ import os
from itertools import islice from itertools import islice
from urllib.parse import quote from urllib.parse import quote
from paths import DATA_DIR from indexer.paths import DATA_DIR
from wiki import get_wiki_titles_and_urls from indexer.wiki import get_wiki_titles_and_urls
URL_TEMPLATE = "http://localhost:8000/complete?q={}" URL_TEMPLATE = "http://localhost:8000/complete?q={}"
CURL_FILE = os.path.join(DATA_DIR, "urls.curl") CURL_FILE = os.path.join(DATA_DIR, "urls.curl")

View file

@@ -9,10 +9,10 @@ from spacy.lang.en import English
from starlette.testclient import TestClient from starlette.testclient import TestClient
from tinysearchengine import create_app from tinysearchengine import create_app
from fsqueue import ZstdJsonSerializer from indexer.fsqueue import ZstdJsonSerializer
from index import index_titles_urls_and_extracts from indexer.index import index_titles_urls_and_extracts
from tinysearchengine.indexer import TinyIndex, TinyIndexer, Document from tinysearchengine.indexer import TinyIndex, TinyIndexer, Document
from paths import TEST_INDEX_PATH, DATA_DIR, TEST_TERMS_PATH from indexer.paths import TEST_INDEX_PATH, DATA_DIR, TEST_TERMS_PATH
NUM_DOCUMENTS = 30000 NUM_DOCUMENTS = 30000
NUM_PAGES_FOR_STATS = 10 NUM_PAGES_FOR_STATS = 10

View file

View file

@@ -8,8 +8,8 @@ from urllib.parse import urlsplit, urlunsplit
import bs4 import bs4
import requests import requests
from fsqueue import FSQueue, ZstdJsonSerializer from indexer.fsqueue import FSQueue, ZstdJsonSerializer
from paths import DATA_DIR, DOMAINS_QUEUE_NAME, DOMAINS_TITLES_QUEUE_NAME from indexer.paths import DATA_DIR, DOMAINS_QUEUE_NAME, DOMAINS_TITLES_QUEUE_NAME
NUM_PROCESSES = 10 NUM_PROCESSES = 10

View file

@@ -4,8 +4,8 @@ Add domains to the queue to be retrieved
import csv import csv
import gzip import gzip
from fsqueue import FSQueue, ZstdJsonSerializer from indexer.fsqueue import FSQueue, ZstdJsonSerializer
from paths import DOMAINS_PATH, DOMAINS_QUEUE_NAME, DATA_DIR from indexer.paths import DOMAINS_PATH, DOMAINS_QUEUE_NAME, DATA_DIR
BATCH_SIZE = 250 BATCH_SIZE = 250

View file

@@ -7,9 +7,9 @@ from urllib.parse import quote
from spacy.lang.en import English from spacy.lang.en import English
from index import index_titles_urls_and_extracts from indexer.index import index_titles_urls_and_extracts
from tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE from tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE
from paths import WIKI_TITLES_PATH, INDEX_PATH from indexer.paths import WIKI_TITLES_PATH, INDEX_PATH
TEXT_TAGS = ['mediawiki', 'page', 'revision', 'text'] TEXT_TAGS = ['mediawiki', 'page', 'revision', 'text']
TITLE_START = '<title>Wikipedia: ' TITLE_START = '<title>Wikipedia: '