# paths.py
  1. import os
  2. from pathlib import Path
  3. HOME = os.getenv('HOME')
  4. DATA_DIR = Path(os.environ['HOME']) / 'data' / 'tinysearch'
  5. COMMON_CRAWL_TERMS_PATH = DATA_DIR / 'common-craw-terms.csv'
  6. HN_TOP_PATH = os.path.join(DATA_DIR, 'hn-top.csv')
  7. CRAWL_PREFIX = 'crawl_'
  8. CRAWL_GLOB = os.path.join(DATA_DIR, f"{CRAWL_PREFIX}*")
  9. TEST_INDEX_PATH = os.path.join(DATA_DIR, 'index-test.tinysearch')
  10. TEST_TERMS_PATH = os.path.join(DATA_DIR, 'index-terms.csv')
  11. WIKI_DATA_PATH = os.path.join(DATA_DIR, 'enwiki-20210301-pages-articles1.xml-p1p41242.bz2')
  12. WIKI_TITLES_PATH = os.path.join(DATA_DIR, 'abstract-titles-sorted.txt.gz')
  13. DOMAINS_QUEUE_NAME = 'domains-queue-fs'
  14. DOMAINS_TITLES_QUEUE_NAME = 'domains-title-queue-fs'
  15. DOMAINS_PATH = os.path.join(DATA_DIR, 'top10milliondomains.csv.gz')
  16. INDEX_PATH = Path(__file__).parent / 'data' / 'index.tinysearch'