mwmbl/paths.py

22 lines
827 B
Python
Raw Normal View History

2021-03-13 20:54:15 +00:00
import os
2021-12-13 11:23:01 +00:00
from pathlib import Path
2021-03-13 20:54:15 +00:00
# The HOME environment variable as a string, or None when it is not set.
HOME = os.environ.get('HOME')
2021-12-13 11:23:01 +00:00
# Base directory for the data files: $HOME/data/tinysearch (a pathlib.Path).
# Indexing os.environ raises KeyError at import time if HOME is not set.
DATA_DIR = Path(os.environ['HOME']).joinpath('data', 'tinysearch')
# NOTE(review): the filename says 'craw', not 'crawl' — possibly a typo, but it
# must match the file on disk, so confirm before renaming.
COMMON_CRAWL_TERMS_PATH = DATA_DIR.joinpath('common-craw-terms.csv')
2021-03-13 20:54:15 +00:00
# String path of 'hn-top.csv' inside DATA_DIR.
HN_TOP_PATH = str(DATA_DIR / 'hn-top.csv')
# Filename prefix used to build the crawl glob pattern below.
CRAWL_PREFIX = 'crawl_'
# String glob pattern matching every DATA_DIR entry that starts with CRAWL_PREFIX.
CRAWL_GLOB = str(DATA_DIR / (CRAWL_PREFIX + '*'))
2021-04-12 17:37:33 +00:00
# String paths of the 'index.tinysearch' and 'index-test.tinysearch' files
# inside DATA_DIR.
INDEX_PATH = str(DATA_DIR / 'index.tinysearch')
TEST_INDEX_PATH = str(DATA_DIR / 'index-test.tinysearch')
2021-05-30 20:30:34 +00:00
# String path of 'index-terms.csv' inside DATA_DIR.
# NOTE(review): the constant is named TEST_* but the filename has no 'test'
# marker — confirm this is intentional.
TEST_TERMS_PATH = str(DATA_DIR / 'index-terms.csv')
2021-03-15 22:06:37 +00:00
# String path of a bz2 file inside DATA_DIR; judging by the filename this is
# presumably an English Wikipedia articles dump from 2021-03-01 — confirm.
WIKI_DATA_PATH = str(DATA_DIR / 'enwiki-20210301-pages-articles1.xml-p1p41242.bz2')
2021-03-23 22:03:48 +00:00
# String path of the gzip file 'abstract-titles-sorted.txt.gz' inside DATA_DIR.
WIKI_TITLES_PATH = str(DATA_DIR / 'abstract-titles-sorted.txt.gz')
2021-04-25 07:55:15 +00:00
2021-12-13 11:23:01 +00:00
2021-05-05 21:16:27 +00:00
# Queue names (plain strings, not filesystem paths built from DATA_DIR).
# NOTE(review): the '-fs' suffix presumably means a filesystem-backed queue —
# verify against the code that consumes these names.
DOMAINS_QUEUE_NAME = "domains-queue-fs"
DOMAINS_TITLES_QUEUE_NAME = "domains-title-queue-fs"
2021-04-25 07:55:15 +00:00
# String path of the gzip CSV 'top10milliondomains.csv.gz' inside DATA_DIR.
DOMAINS_PATH = str(DATA_DIR / 'top10milliondomains.csv.gz')