Merge pull request #61 from milovanderlinden/issue-60-consistent-use-of-env-vars
Fix issue #60
This commit is contained in:
commit
e1e1b0057b
5 changed files with 50 additions and 34 deletions
|
@ -1,8 +1,6 @@
|
|||
import gzip
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
from collections import defaultdict
|
||||
from datetime import datetime, timezone, timedelta, date
|
||||
from typing import Union
|
||||
|
@ -18,26 +16,24 @@ from mwmbl.crawler.urls import URLDatabase, FoundURL, URLStatus
|
|||
from mwmbl.database import Database
|
||||
from mwmbl.hn_top_domains_filtered import DOMAINS
|
||||
from mwmbl.indexer.indexdb import IndexDatabase, BatchInfo, BatchStatus
|
||||
from mwmbl.settings import (
|
||||
ENDPOINT_URL,
|
||||
KEY_ID,
|
||||
APPLICATION_KEY,
|
||||
BUCKET_NAME,
|
||||
MAX_BATCH_SIZE,
|
||||
USER_ID_LENGTH,
|
||||
VERSION,
|
||||
PUBLIC_URL_PREFIX,
|
||||
UNKNOWN_DOMAIN_MULTIPLIER,
|
||||
SCORE_FOR_SAME_DOMAIN,
|
||||
SCORE_FOR_DIFFERENT_DOMAIN,
|
||||
SCORE_FOR_ROOT_PATH,
|
||||
PUBLIC_USER_ID_LENGTH,
|
||||
FILE_NAME_SUFFIX,
|
||||
DATE_REGEX)
|
||||
from mwmbl.tinysearchengine.indexer import Document
|
||||
|
||||
APPLICATION_KEY = os.environ['MWMBL_APPLICATION_KEY']
|
||||
KEY_ID = os.environ['MWMBL_KEY_ID']
|
||||
ENDPOINT_URL = 'https://s3.us-west-004.backblazeb2.com'
|
||||
BUCKET_NAME = 'mwmbl-crawl'
|
||||
MAX_BATCH_SIZE = 100
|
||||
USER_ID_LENGTH = 36
|
||||
PUBLIC_USER_ID_LENGTH = 64
|
||||
VERSION = 'v1'
|
||||
DATE_REGEX = re.compile(r'\d{4}-\d{2}-\d{2}')
|
||||
PUBLIC_URL_PREFIX = f'https://f004.backblazeb2.com/file/{BUCKET_NAME}/'
|
||||
FILE_NAME_SUFFIX = '.json.gz'
|
||||
|
||||
SCORE_FOR_ROOT_PATH = 0.1
|
||||
SCORE_FOR_DIFFERENT_DOMAIN = 1.0
|
||||
SCORE_FOR_SAME_DOMAIN = 0.01
|
||||
UNKNOWN_DOMAIN_MULTIPLIER = 0.001
|
||||
|
||||
|
||||
router = APIRouter(prefix="/crawler", tags=["crawler"])
|
||||
|
||||
|
||||
|
@ -286,4 +282,4 @@ def queue_batch(batch: HashedBatch):
|
|||
for item in batch.items if item.content is not None]
|
||||
with Database() as db:
|
||||
index_db = IndexDatabase(db.connection)
|
||||
index_db.queue_documents(documents)
|
||||
index_db.queue_documents(documents)
|
||||
|
|
|
@ -1,14 +1,14 @@
|
|||
import os
|
||||
|
||||
from psycopg2 import connect
|
||||
|
||||
from mwmbl.settings import DATABASE_URL
|
||||
|
||||
|
||||
class Database:
|
||||
def __init__(self):
|
||||
self.connection = None
|
||||
|
||||
def __enter__(self):
|
||||
self.connection = connect(os.environ["DATABASE_URL"])
|
||||
self.connection = connect(DATABASE_URL)
|
||||
self.connection.set_session(autocommit=True)
|
||||
return self
|
||||
|
||||
|
|
|
@ -9,14 +9,6 @@ from pathlib import Path
|
|||
import numpy as np
|
||||
import pandas as pd
|
||||
|
||||
DATA_DIR = Path(os.environ['HOME']) / 'data' / 'tinysearch'
|
||||
ALL_DOMAINS_PATH = DATA_DIR / 'hn-top-domains.csv'
|
||||
TOP_DOMAINS_PATH = '../hn_top_domains_filtered.py'
|
||||
|
||||
MIN_COUNT = 10
|
||||
PROBABILITY_THRESHOLD = 0.8
|
||||
|
||||
|
||||
def get_top_domains():
|
||||
data = pd.read_csv(ALL_DOMAINS_PATH, index_col='domain')
|
||||
data = data[data.index.notnull()]
|
||||
|
|
|
@ -1,7 +1,6 @@
|
|||
from datetime import date, timedelta
|
||||
|
||||
from mwmbl.crawler.app import get_user_id_hashes_for_date, get_batches_for_date_and_user, get_batch_url, \
|
||||
get_batches_for_date, get_user_id_hash_from_url
|
||||
from mwmbl.crawler.app import get_batches_for_date, get_user_id_hash_from_url
|
||||
from mwmbl.database import Database
|
||||
from mwmbl.indexer.indexdb import BatchInfo, BatchStatus, IndexDatabase
|
||||
|
||||
|
|
29
mwmbl/settings.py
Normal file
29
mwmbl/settings.py
Normal file
|
@ -0,0 +1,29 @@
|
|||
import os
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
DATA_DIR = Path(os.environ['HOME']) / 'data' / 'tinysearch'
|
||||
ALL_DOMAINS_PATH = DATA_DIR / 'hn-top-domains.csv'
|
||||
TOP_DOMAINS_PATH = '../hn_top_domains_filtered.py'
|
||||
|
||||
MIN_COUNT = 10
|
||||
PROBABILITY_THRESHOLD = 0.8
|
||||
DATABASE_URL = os.environ.get("DATABASE_URL", "postgres://username:password@localhost/dbname")
|
||||
|
||||
APPLICATION_KEY = os.environ.get('MWMBL_APPLICATION_KEY', 'SECRETAPPLICATIONKEY')
|
||||
KEY_ID = os.environ.get('MWMBL_KEY_ID', 'SECRETKEYID')
|
||||
ENDPOINT_URL = os.environ.get('MWMBL_ENDPOINT_URL', 'https://s3.us-west-004.backblazeb2.com')
|
||||
BUCKET_NAME = os.environ.get('MWMBL_BUCKET_NAME', 'mwmbl-crawl')
|
||||
MAX_BATCH_SIZE = 100
|
||||
USER_ID_LENGTH = 36
|
||||
PUBLIC_USER_ID_LENGTH = 64
|
||||
VERSION = 'v1'
|
||||
DATE_REGEX = re.compile(r'\d{4}-\d{2}-\d{2}')
|
||||
PUBLIC_URL_PREFIX = f'https://f004.backblazeb2.com/file/{BUCKET_NAME}/'
|
||||
FILE_NAME_SUFFIX = '.json.gz'
|
||||
|
||||
SCORE_FOR_ROOT_PATH = 0.1
|
||||
SCORE_FOR_DIFFERENT_DOMAIN = 1.0
|
||||
SCORE_FOR_SAME_DOMAIN = 0.01
|
||||
UNKNOWN_DOMAIN_MULTIPLIER = 0.001
|
Loading…
Reference in a new issue