From a173db319b5f62606d7b7ca7bd53ad59cd65c611 Mon Sep 17 00:00:00 2001
From: Daoud Clarke
Date: Sun, 5 Dec 2021 21:02:17 +0000
Subject: [PATCH] Add EMR deploy scripts

---
 .gitignore   |  2 ++
 bootstrap.sh | 15 +++++++++++++++
 deploy.sh    | 20 ++++++++++++++++++++
 domains.py   | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 96 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 bootstrap.sh
 create mode 100644 deploy.sh
 create mode 100644 domains.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..3023c68
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+.idea
+*~
diff --git a/bootstrap.sh b/bootstrap.sh
new file mode 100644
index 0000000..6186fae
--- /dev/null
+++ b/bootstrap.sh
@@ -0,0 +1,15 @@
+#!/bin/bash -xe
+
+sudo python3 -m pip uninstall numpy -y
+sudo python3 -m pip uninstall numpy -y
+sudo python3 -m pip uninstall numpy -y
+
+sudo python3 -m pip install boto3==1.19.7 botocore==1.22.7 jusText==3.0.0 langdetect==1.0.9 \
+    lxml==4.6.3 numpy==1.21.3 pandas==1.2.5 pyarrow==6.0.0 spacy==2.3.5 \
+    warcio==1.7.4 zstandard==0.16.0
+
+sudo python3 -m spacy download en_core_web_sm
+
+echo "========================"
+echo "Normal python pip freeze"
+python3 -m pip freeze
diff --git a/deploy.sh b/deploy.sh
new file mode 100644
index 0000000..63e8364
--- /dev/null
+++ b/deploy.sh
@@ -0,0 +1,20 @@
+cat hn-top-domains-filtered.py extract.py > runextract.py
+
+aws s3 cp runextract.py s3://tinysearch/code/
+aws s3 cp bootstrap.sh s3://tinysearch/code/
+
+
+aws emr create-cluster \
+    --applications Name=Spark Name=Zeppelin \
+    --ec2-attributes '{"InstanceProfile":"EMR_EC2_DefaultRole","SubnetId":"subnet-03c33360c68f73a48"}' \
+    --service-role EMR_DefaultRole \
+    --enable-debugging \
+    --release-label emr-5.33.1 \
+    --log-uri 's3n://tinysearch/pyspark-logs/' \
+    --bootstrap-actions '{"Path": "s3://tinysearch/code/bootstrap.sh"}' \
+    --steps '[{"Args":["spark-submit","--deploy-mode","cluster","s3n://tinysearch/code/runextract.py"],"Type":"CUSTOM_JAR","ActionOnFailure":"CONTINUE","Jar":"command-runner.jar","Properties":"","Name":"Spark application"}]' \
+    --name 'TinySearch' \
+    --instance-groups '[{"InstanceCount":5,"EbsConfiguration":{"EbsBlockDeviceConfigs":[{"VolumeSpecification":{"SizeInGB":32,"VolumeType":"gp2"},"VolumesPerInstance":1}]},"InstanceGroupType":"CORE","InstanceType":"m4.large","Name":"Core Instance Group"},{"InstanceCount":1,"EbsConfiguration":{"EbsBlockDeviceConfigs":[{"VolumeSpecification":{"SizeInGB":32,"VolumeType":"gp2"},"VolumesPerInstance":1}]},"InstanceGroupType":"MASTER","InstanceType":"m4.large","Name":"Master Instance Group"}]' \
+    --configurations '[{"Classification":"spark","Properties":{}}]' \
+    --scale-down-behavior TERMINATE_AT_TASK_COMPLETION --region us-east-1 \
+    --auto-terminate
diff --git a/domains.py b/domains.py
new file mode 100644
index 0000000..5cd7294
--- /dev/null
+++ b/domains.py
@@ -0,0 +1,59 @@
+"""
+Extract top domains from BigQuery result.
+"""
+import json
+import os
+import sys
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+
+DATA_DIR = Path(os.environ['HOME']) / 'data' / 'tinysearch'
+ALL_DOMAINS_PATH = DATA_DIR / 'hn-top-domains.csv'
+TOP_DOMAINS_PATH = 'hn-top-domains-filtered.py'
+
+MIN_COUNT = 10
+PROBABILITY_THRESHOLD = 0.8
+
+
+def get_top_domains():
+    """
+    Score the domains in ALL_DOMAINS_PATH and write the top ones out.
+
+    The CSV is read with its 'domain' column as the index and must also
+    provide 'total' and 'mean_score' columns. Domains whose 'total' is
+    below MIN_COUNT are dropped; the rest are scored as
+    mean_score * log(total) ** 2 and each score is converted to
+    score / (score + median score). Domains whose converted score exceeds
+    PROBABILITY_THRESHOLD are written to TOP_DOMAINS_PATH, highest first,
+    as a Python assignment ``DOMAINS = {domain: score, ...}``; deploy.sh
+    concatenates that file with extract.py.
+    """
+    data = pd.read_csv(ALL_DOMAINS_PATH, index_col='domain')
+    # Drop rows that have no domain.
+    data = data[data.index.notnull()]
+
+    frequent = data[data['total'] >= MIN_COUNT]
+    scores = frequent['mean_score'] * np.log(frequent['total']) ** 2
+    median_score = np.median(scores)
+    print("Median score", median_score)
+    probabilities = scores / (scores + median_score)
+
+    top_probabilities = probabilities[probabilities > PROBABILITY_THRESHOLD]
+    # Rebind the sorted result rather than sorting the selection in place.
+    top_probabilities = top_probabilities.sort_values(ascending=False)
+
+    # Write one repr()'d entry per line so the output is a valid Python
+    # literal without post-processing the repr of a whole dict.
+    entries = ',\n'.join(
+        f'{domain!r}: {float(probability)!r}'
+        for domain, probability in top_probabilities.items()
+    )
+    # The output is Python source, so write it as UTF-8 explicitly.
+    with open(TOP_DOMAINS_PATH, 'w', encoding='utf-8') as output_file:
+        output_file.write('DOMAINS = {' + entries + '}\n\n')
+
+
+if __name__ == '__main__':
+    get_top_domains()