Add deployment scripts and documentation (#10)
Signed-off-by: Apekshit Sharma <[email protected]>
1 parent 0aba949, commit 4ea98c4
Showing 7 changed files with 261 additions and 13 deletions.
@@ -0,0 +1,35 @@
# Deployment

## Requirements

1. BigQuery tables: transactions, errors, dedupe_state
1. PubSub topic for transactions
1. GCS bucket: used for Dataflow templates, staging, and as a temp location
1. ETL pipeline from PubSub to BigQuery:
    1. PubSub subscription
    1. Service account with the following roles: BigQuery Data Editor, Dataflow Worker, Pub/Sub Subscriber, and
       Storage Object Admin
1. Deduplication task
    1. Service account with the following roles: BigQuery Data Editor, BigQuery Job User, and Monitoring Metric Writer
1. Mirror Importer
    1. Service account with the following role: Pub/Sub Publisher

Resource creation can be automated using [setup-gcp-resources.sh](../scripts/setup-gcp-resources.sh).
[Google Cloud SDK](https://cloud.google.com/sdk/docs) is required to run the script.
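For example, a run for a hypothetical test deployment might look like the sketch below, following the usage comment in the script itself. The project ID and deployment name are placeholders, the script is assumed to be run from the repository root, and `KEYS_DIR` is optional (it defaults to `./<DEPLOYMENT_NAME>-keys`):

```bash
# Sketch only: project ID and deployment name are placeholders.
PROJECT_ID=my-gcp-project \
DEPLOYMENT_NAME=testnet \
./scripts/setup-gcp-resources.sh
```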
## Steps

1. Deploy ETL pipeline

   Use the [deploy-etl-pipeline.sh](../scripts/deploy-etl-pipeline.sh) script to deploy the ETL pipeline to GCP
   Dataflow (see the example invocation after this list).

1. Deploy Deduplication task

   TODO

1. Deploy the Hedera Mirror Node Importer to publish transactions to the PubSub topic. See the Mirror Node
   [installation](https://github.com/hashgraph/hedera-mirror-node/blob/master/docs/installation.md) and
   [configuration](https://github.com/hashgraph/hedera-mirror-node/blob/master/docs/configuration.md#importer) docs
   for more details.
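As a sketch of step 1, deploy-etl-pipeline.sh documents the same variables as the setup script plus `KEYS_DIR`, which must point at the service-account keys created earlier. The values below are placeholders and the command is assumed to run from the repository root:

```bash
# Sketch only: values are placeholders.
PROJECT_ID=my-gcp-project \
DEPLOYMENT_NAME=testnet \
KEYS_DIR="$(pwd)/testnet-keys" \
./scripts/deploy-etl-pipeline.sh
```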
@@ -0,0 +1,34 @@
#!/usr/bin/env bash

# Environment defaults shared by the deployment scripts. Each `: ${VAR:=default}`
# line keeps a value already exported by the caller and otherwise falls back to a
# default derived from DEPLOYMENT_NAME.

if [[ `which gcloud` == "" ]]; then
  echo "Couldn't find 'gcloud' cli. Make sure Cloud SDK is installed. https://cloud.google.com/sdk/docs#install_the_latest_cloud_sdk_version"
  exit 1
fi

if [[ "${PROJECT_ID}" == "" ]]; then
  echo "PROJECT_ID is not set"
  exit 1
fi

if [[ "${DEPLOYMENT_NAME}" == "" ]]; then
  echo "DEPLOYMENT_NAME is not set. It is needed to name GCP resources."
  exit 1
fi

NAME=${DEPLOYMENT_NAME}

# Directory where service account keys are downloaded
: ${KEYS_DIR:=`pwd`/${NAME}-keys}

# GCS bucket used for Dataflow templates, staging, and temp files
: ${BUCKET_NAME:=${NAME}-hedera-etl}

# PubSub topic and subscription
: ${PUBSUB_TOPIC_NAME:=${NAME}-transactions-topic}
: ${PUBSUB_SUBSCRIPTION_ETL_BIGQUERY:=${NAME}-etl-bigquery}

# BigQuery dataset and tables
: ${BQ_DATASET:=${NAME}}
: ${BQ_TRANSACTIONS_TABLE:=${PROJECT_ID}:${BQ_DATASET}.transactions}
: ${BQ_ERRORS_TABLE:=${PROJECT_ID}:${BQ_DATASET}.errors}
: ${BQ_DEDUPE_STATE_TABLE:=${PROJECT_ID}:${BQ_DATASET}.dedupe_state}

# Service account names
: ${SA_ETL_BIGQUERY:=${NAME}-etl-bigquery}
: ${SA_DEDUPLICATION:=${NAME}-deduplication-bigquery}
: ${SA_IMPORTER:=${NAME}-importer}
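Because every setting uses the `: ${VAR:=default}` pattern, any of them can be overridden simply by exporting the variable before running one of the scripts. A minimal sketch, assuming the scripts live under `scripts/` and are run from the repository root (table and project names are placeholders; table-name overrides are the ones explicitly documented in create-tables.sh):

```bash
# Sketch: point the transactions table at a non-default location.
export PROJECT_ID=my-gcp-project
export DEPLOYMENT_NAME=testnet
export BQ_TRANSACTIONS_TABLE=my-gcp-project:my_dataset.transactions
./scripts/create-tables.sh
```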
@@ -0,0 +1,31 @@
#!/usr/bin/env bash

# Creates BigQuery tables needed by Dataflow and Deduplication jobs.
#
# Usage: PROJECT_ID=... DEPLOYMENT_NAME=<testnet/mainnet/etc> create-tables.sh
# Optionally, table names can be set using BQ_TRANSACTIONS_TABLE, BQ_ERRORS_TABLE, BQ_DEDUPE_STATE_TABLE

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
. ${SCRIPT_DIR}/common.sh

bq mk \
  --table \
  --description "Hedera network transactions" \
  --time_partitioning_field consensusTimestampTruncated \
  --time_partitioning_type DAY \
  --clustering_fields transactionType \
  ${BQ_TRANSACTIONS_TABLE} \
  ${SCRIPT_DIR}/../hedera-etl-dataflow/src/main/resources/schema.json

bq mk \
  --table \
  --description "Hedera ETL Errors" \
  ${BQ_ERRORS_TABLE} \
  ${SCRIPT_DIR}/../hedera-etl-dataflow/src/main/resources/errors_schema.json

bq mk \
  --table \
  --description "BigQuery deduplication job state" \
  ${BQ_DEDUPE_STATE_TABLE} \
  ${SCRIPT_DIR}/../hedera-dedupe-bigquery/state-schema.json
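A quick way to confirm the tables were created is to list the dataset with the `bq` CLI. The project and dataset below are placeholders (with the defaults in the common script, the dataset name equals DEPLOYMENT_NAME):

```bash
# Sketch: list the tables in the deployment's dataset (placeholder names).
bq ls my-gcp-project:testnet
```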
@@ -0,0 +1,41 @@
#!/usr/bin/env bash

# Deploys ETL pipeline to GCP Dataflow
# This script assumes that resources have been allocated using setup-gcp-resources.sh
# Usage: PROJECT_ID=... DEPLOYMENT_NAME=<testnet/mainnet/etc> KEYS_DIR=... deploy-etl-pipeline.sh

if [[ "${KEYS_DIR}" == "" ]]; then
  echo "KEYS_DIR is not set"
  exit 1
fi

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
. ${SCRIPT_DIR}/common.sh

JSON_KEY=${KEYS_DIR}/${SA_ETL_BIGQUERY}.json
if [[ ! -f "${JSON_KEY}" ]]; then
  echo "Couldn't find ${JSON_KEY}. Make sure KEYS_DIR is correctly set up."
  exit 1
fi
export GOOGLE_APPLICATION_CREDENTIALS=${JSON_KEY}

cd ${SCRIPT_DIR}/../hedera-etl-dataflow

echo "Building and uploading pipeline templates to GCS"

PIPELINE_FOLDER=gs://${BUCKET_NAME}/pipelines/etl-bigquery
mvn clean compile exec:java \
  -Dexec.args=" \
  --project=${PROJECT_ID} \
  --stagingLocation=${PIPELINE_FOLDER}/staging \
  --tempLocation=${PIPELINE_FOLDER}/temp \
  --templateLocation=${PIPELINE_FOLDER}/template \
  --runner=DataflowRunner"

echo "Starting Dataflow job"

SUBSCRIPTION="projects/${PROJECT_ID}/subscriptions/${PUBSUB_SUBSCRIPTION_ETL_BIGQUERY}"
gcloud dataflow jobs run etl-pipeline-`date +"%Y%m%d-%H%M%S%z"` \
  --gcs-location=${PIPELINE_FOLDER}/template \
  --service-account-email=${SA_ETL_BIGQUERY}@${PROJECT_ID}.iam.gserviceaccount.com \
  --parameters "inputSubscription=${SUBSCRIPTION},outputTransactionsTable=${BQ_TRANSACTIONS_TABLE},outputErrorsTable=${BQ_ERRORS_TABLE}"
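Once the job has been submitted, it should appear in the project's Dataflow job list. A minimal check, with a placeholder project ID:

```bash
# Sketch: confirm the newly started etl-pipeline-* job is running (placeholder project ID).
gcloud dataflow jobs list --project=my-gcp-project --status=active
```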
@@ -0,0 +1,84 @@
#!/usr/bin/env bash
# Creates GCP resources like Service Accounts, GCS buckets, BigQuery datasets and tables, etc. for the various
# components of hedera-etl.
# For more details, refer to docs/deployment.md.
#
# Usage: PROJECT_ID=... DEPLOYMENT_NAME=<testnet/mainnet/etc> setup-gcp-resources.sh
# Optionally, KEYS_DIR can be set to specify the directory where service accounts' keys will be downloaded. Defaults
# to './${DEPLOYMENT_NAME}-keys'

set -e

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
. ${SCRIPT_DIR}/common.sh

#### Functions ####
create_service_account_with_roles()
{
  local sa_name=$1
  local roles=$2
  local description="$3"

  # Create service account
  gcloud iam service-accounts create ${sa_name} \
    --project=${PROJECT_ID} \
    --description="${description}"

  # Assign roles to the service account
  for role in ${roles}; do
    gcloud projects add-iam-policy-binding ${PROJECT_ID} \
      --member serviceAccount:${sa_name}@${PROJECT_ID}.iam.gserviceaccount.com \
      --role ${role} > /dev/null # Project's complete IAM policy is dumped to console otherwise
    echo "Assigned role ${role} to ${sa_name}"
  done
}

create_service_account_key()
{
  local sa_name=$1
  local key_filename=${KEYS_DIR}/${sa_name}.json
  # Download service account's key
  gcloud iam service-accounts keys create ${key_filename} \
    --iam-account=${sa_name}@${PROJECT_ID}.iam.gserviceaccount.com
}

#### Base resources ####
mkdir -p ${KEYS_DIR}

# Create BigQuery dataset and tables
bq mk --project_id=${PROJECT_ID} ${NAME}
DATASET_NAME=${BQ_DATASET} ${SCRIPT_DIR}/create-tables.sh

# Create PubSub topic for transactions
gcloud pubsub topics create ${PUBSUB_TOPIC_NAME} --project=${PROJECT_ID}

# Create GCS bucket for dataflow pipelines
gsutil mb -p ${PROJECT_ID} gs://${BUCKET_NAME}

#### Resources for ETL to BigQuery ####
gcloud pubsub subscriptions create ${PUBSUB_SUBSCRIPTION_ETL_BIGQUERY} \
  --project=${PROJECT_ID} \
  --topic=${PUBSUB_TOPIC_NAME} \
  --message-retention-duration=7d \
  --expiration-period=never

create_service_account_with_roles \
  ${SA_ETL_BIGQUERY} \
  "roles/bigquery.dataEditor roles/dataflow.worker roles/pubsub.subscriber roles/storage.admin" \
  "For pubsub --> bigquery dataflow controller"

create_service_account_key ${SA_ETL_BIGQUERY}

#### Resources for Deduplication task ####
create_service_account_with_roles \
  ${SA_DEDUPLICATION} \
  "roles/bigquery.dataEditor roles/bigquery.jobUser roles/monitoring.metricWriter" \
  "For BigQuery deduplication task"

create_service_account_key ${SA_DEDUPLICATION}

#### Resources for Hedera Mirror Importer ####
create_service_account_with_roles \
  ${SA_IMPORTER} "roles/pubsub.publisher" "For hedera mirror node importer (publishes to PubSub)"

create_service_account_key ${SA_IMPORTER}
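After a successful run, KEYS_DIR should contain one JSON key per service account created above, named after the SA_* defaults in the common script. A sketch with DEPLOYMENT_NAME=testnet (a placeholder); deploy-etl-pipeline.sh later looks up the ETL key at exactly this path:

```bash
# Expected key files for DEPLOYMENT_NAME=testnet (placeholder values).
ls ./testnet-keys
# testnet-etl-bigquery.json
# testnet-deduplication-bigquery.json
# testnet-importer.json
```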