Add EMR deploy scripts
parent 14817d7657
commit a173db319b
4 changed files with 79 additions and 0 deletions
.gitignore (vendored, new file, +2)
@@ -0,0 +1,2 @@
.idea
*~
bootstrap.sh (new file, +15)
@@ -0,0 +1,15 @@
#!/bin/bash -xe

sudo python3 -m pip uninstall numpy -y
sudo python3 -m pip uninstall numpy -y
sudo python3 -m pip uninstall numpy -y

sudo python3 -m pip install boto3==1.19.7 botocore==1.22.7 jusText==3.0.0 langdetect==1.0.9 \
    lxml==4.6.3 numpy==1.21.3 pandas==1.2.5 pyarrow==6.0.0 spacy==2.3.5 \
    warcio==1.7.4 zstandard==0.16.0

sudo python3 -m spacy download en_core_web_sm

echo "========================"
echo "Normal python pip freeze"
python3 -m pip freeze
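The bootstrap script uninstalls numpy three times, presumably to clear out the multiple preinstalled copies on the EMR image before the pinned numpy==1.21.3 is installed. As a minimal, illustrative sanity check (not part of this commit, and assuming it is run on a node after bootstrap has finished), the pinned versions could be verified like this:

# Illustrative only: verify that the versions pinned in bootstrap.sh are the
# ones actually importable on the node. The EXPECTED map mirrors bootstrap.sh.
import importlib

EXPECTED = {
    "boto3": "1.19.7",
    "numpy": "1.21.3",
    "pandas": "1.2.5",
    "pyarrow": "6.0.0",
    "spacy": "2.3.5",
}

for name, wanted in EXPECTED.items():
    module = importlib.import_module(name)
    found = getattr(module, "__version__", "unknown")
    status = "OK" if found == wanted else "MISMATCH"
    print(f"{name}: wanted {wanted}, found {found} [{status}]")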
deploy.sh (new file, +20)
@@ -0,0 +1,20 @@
cat hn-top-domains-filtered.py extract.py > runextract.py

aws s3 cp runextract.py s3://tinysearch/code/
aws s3 cp bootstrap.sh s3://tinysearch/code/


aws emr create-cluster \
    --applications Name=Spark Name=Zeppelin \
    --ec2-attributes '{"InstanceProfile":"EMR_EC2_DefaultRole","SubnetId":"subnet-03c33360c68f73a48"}' \
    --service-role EMR_DefaultRole \
    --enable-debugging \
    --release-label emr-5.33.1 \
    --log-uri 's3n://tinysearch/pyspark-logs/' \
    --bootstrap-actions '{"Path": "s3://tinysearch/code/bootstrap.sh"}' \
    --steps '[{"Args":["spark-submit","--deploy-mode","cluster","s3n://tinysearch/code/runextract.py"],"Type":"CUSTOM_JAR","ActionOnFailure":"CONTINUE","Jar":"command-runner.jar","Properties":"","Name":"Spark application"}]' \
    --name 'TinySearch' \
    --instance-groups '[{"InstanceCount":5,"EbsConfiguration":{"EbsBlockDeviceConfigs":[{"VolumeSpecification":{"SizeInGB":32,"VolumeType":"gp2"},"VolumesPerInstance":1}]},"InstanceGroupType":"CORE","InstanceType":"m4.large","Name":"Core Instance Group"},{"InstanceCount":1,"EbsConfiguration":{"EbsBlockDeviceConfigs":[{"VolumeSpecification":{"SizeInGB":32,"VolumeType":"gp2"},"VolumesPerInstance":1}]},"InstanceGroupType":"MASTER","InstanceType":"m4.large","Name":"Master Instance Group"}]' \
    --configurations '[{"Classification":"spark","Properties":{}}]' \
    --scale-down-behavior TERMINATE_AT_TASK_COMPLETION --region us-east-1 \
    --auto-terminate
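deploy.sh concatenates hn-top-domains-filtered.py and extract.py into runextract.py, uploads the job and bootstrap.sh to s3://tinysearch/code/, and launches a six-node EMR cluster (1 master plus 5 core m4.large instances) that runs the job as a Spark step and auto-terminates. Below is a minimal sketch of how the launched cluster could be watched with boto3, assuming the same region (us-east-1) and cluster name ('TinySearch') as the script; none of this is part of the commit.

# Illustrative sketch only: poll EMR for the cluster launched by deploy.sh.
import boto3

emr = boto3.client("emr", region_name="us-east-1")

# List clusters that are still coming up or running.
response = emr.list_clusters(
    ClusterStates=["STARTING", "BOOTSTRAPPING", "RUNNING", "WAITING"]
)
for summary in response["Clusters"]:
    if summary["Name"] == "TinySearch":
        detail = emr.describe_cluster(ClusterId=summary["Id"])["Cluster"]
        print(summary["Id"], detail["Status"]["State"])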
domains.py (new file, +42)
@@ -0,0 +1,42 @@
"""
Extract top domains from BigQuery result.
"""
import json
import os
import sys
from pathlib import Path

import numpy as np
import pandas as pd

DATA_DIR = Path(os.environ['HOME']) / 'data' / 'tinysearch'
ALL_DOMAINS_PATH = DATA_DIR / 'hn-top-domains.csv'
TOP_DOMAINS_PATH = 'hn-top-domains-filtered.py'

MIN_COUNT = 10
PROBABILITY_THRESHOLD = 0.8


def get_top_domains():
    data = pd.read_csv(ALL_DOMAINS_PATH, index_col='domain')
    data = data[data.index.notnull()]

    frequent = data[data['total'] >= MIN_COUNT]
    scores = frequent['mean_score'] * np.log(frequent['total']) ** 2
    median_score = np.median(scores)
    print("Median score", median_score)
    probabilities = scores / (scores + median_score)

    top_probabilities = probabilities[probabilities > PROBABILITY_THRESHOLD]
    top_probabilities.sort_values(ascending=False, inplace=True)
    with open(TOP_DOMAINS_PATH, 'w') as output_file:
        probabilities_str = str(top_probabilities.to_dict()).replace(', ', ',\n')
        output_file.write("DOMAINS = " + probabilities_str + '\n\n')
        # json.dump(probabilities.to_dict(), output_file, indent=2)

    # for row in probabilities.iterrows():
    #     output_file.write(json.dumps(row.to_dict()) + '\n')


if __name__ == '__main__':
    get_top_domains()
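domains.py scores each domain as mean_score * ln(total)^2, converts that score to a probability score / (score + median_score), and keeps domains above the 0.8 threshold, writing them as a DOMAINS dict into hn-top-domains-filtered.py. A toy illustration of that calculation, with made-up domain names and counts rather than real BigQuery output:

# Toy illustration of the scoring in domains.py; the data below is invented.
import numpy as np
import pandas as pd

frequent = pd.DataFrame(
    {"mean_score": [50.0, 5.0, 20.0], "total": [400, 40, 15]},
    index=["example-blog.com", "example-news.org", "example-forum.net"],
)

scores = frequent["mean_score"] * np.log(frequent["total"]) ** 2
median_score = np.median(scores)
probabilities = scores / (scores + median_score)

print(probabilities.sort_values(ascending=False))
# Only domains whose probability exceeds 0.8 would end up in
# hn-top-domains-filtered.py; here that is just example-blog.com.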