Skip to content

Commit

Permalink
Merge pull request #1629 from gravitational/sasha/deploy2
Browse files Browse the repository at this point in the history
Improvements for AWS support deployments
  • Loading branch information
klizhentas authored Jan 24, 2018
2 parents 4986d02 + 6138f2c commit f60fba4
Show file tree
Hide file tree
Showing 18 changed files with 375 additions and 28 deletions.
6 changes: 6 additions & 0 deletions examples/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,9 @@

* `systemd` : Service file for systemd
* `upstart` : Start-up script for [upstart](https://en.wikipedia.org/wiki/Upstart)

## AWS examples

* `aws` : Examples of provisioning Teleport on AWS.


59 changes: 59 additions & 0 deletions examples/aws/README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,61 @@
# AWS provisioning examples

## Terraform provisioning example

The Terraform configuration provides an example provisioning setup
for Teleport auth, proxy and node servers in HA mode.

Use these examples as possible deployment patterns suggested
by Teleport developers.

The scripts set up Let's Encrypt certificates using the DNS-01 challenge.
This means users must control the DNS zone via Route 53.

Teleport join tokens are distributed using SSM parameter store,
and certificates are distributed using encrypted S3 bucket.

A couple of tricks based on DynamoDB locking make sure that
only one auth server node rotates the join tokens at a time,
but those could be easily replaced and are not critical for performance.

The important bits are that auth servers and proxies do not run as root
and are secured by exposing the absolute minimum of ports to the other parts.

```bash
# Set variables for Terraform

# This region should support EFS
export TF_VAR_region="us-west-2"

# Unique cluster name, preferably an FQDN, e.g. cluster.example.com
export TF_VAR_cluster_name=cluster.example.com

# Teleport version to install, e.g. 2.4.0
export TF_VAR_teleport_version="2.5.0-alpha.5"

# AWS SSH key name to provision in installed instances, should be available in the region
export TF_VAR_key_name="example"

# Full absolute path to the license file for Teleport enterprise or pro
export TF_VAR_license_path="/path/to/license"

# AMI name to use, could be public or private
export TF_VAR_ami_name="debian-stretch-hvm-x86_64-gp2-2018-01-06-16218-572488bb-fc09-4638-8628-e1e1d26436f4-ami-628ad918.4"

# Route 53 zone to use, should be the zone registered in AWS,
# e.g. example.com
export TF_VAR_route53_zone="example.com"

# Subdomain to set up in the zone above, e.g. cluster.example.com
# this will be used for internet access for users connecting to teleport proxy
export TF_VAR_route53_domain="cluster.example.com"

# Bucket name to store encrypted letsencrypt certificates.
export TF_VAR_s3_bucket_name="teleport.example.com"

# Email of your support org, used for the letsencrypt certificate registration process.
export TF_VAR_email="[email protected]"

# plan
make plan
```
2 changes: 1 addition & 1 deletion examples/aws/terraform/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ TF_VAR_teleport_version ?=
TF_VAR_key_name ?=
# Full absolute path to the license file for Teleport enterprise or pro
TF_VAR_license_path ?=
# AMI name to use, could be public or private, p
# AMI name to use, could be public or private
TF_VAR_ami_name ?=
# Route 53 zone to use, should be the zone registered in AWS,
# e.g. example.com
Expand Down
48 changes: 46 additions & 2 deletions examples/aws/terraform/auth-user-data.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,40 @@ systemctl enable teleport
systemctl start teleport


# Script that makes sure that only one auth server processes
# requests at a time, by using DynamoDB-backed locking.
# The lock is implemented as an item in a DynamoDB table: {"Lock": "lock1", "Expires": "time", "Process": "server1"}
# The auth server node either renews the lease if the lock's "Process" attribute names this server as the owner of the lock,
# or grabs the lock if the "Expires" attribute indicates that the lease was not renewed before the timeout.
# This pattern can be implemented in many different ways, e.g. using an ASG of size 1 as a separate process
# or in Kubernetes as a deployment of scale 1.
# Generate /usr/local/bin/teleport-lock: a helper that acquires or renews the
# DynamoDB lease for this auth server. The generated script exits 0 when the
# conditional write succeeds (lease held by this server) and 255 otherwise,
# so it can be used as a gate in front of other commands.
#
# Every continuation backslash below is preceded by a space: the unquoted
# heredoc removes each backslash-newline pair and joins the lines, so without
# the space two adjacent arguments would be glued together.
cat >/usr/local/bin/teleport-lock <<EOF
#!/bin/bash
set -x
# NOW is the current epoch time; TTL is the new lease expiry, 3660 seconds from now
NOW=\$$(date +%s)
TTL=\$$((\$$NOW+3660))
PROCESS="$${LOCAL_HOSTNAME}"
echo locking \$$PROCESS for \$$TTL
# Either renew the lease if agent still holds it, or grab the lease if it's expired
aws dynamodb put-item \
  --region $${EC2_REGION} \
  --table-name ${locks_table_name} \
  --item "{\"Lock\": {\"S\": \"/auth/servers\"}, \"Expires\": {\"S\": \"\$$TTL\"}, \"Process\": {\"S\": \"\$$PROCESS\"}}" \
  --condition-expression="(attribute_not_exists(Expires) OR Expires <= :timestamp) OR Process = :process" \
  --expression-attribute-values "{\":timestamp\":{\"S\":\"\$$NOW\"}, \":process\":{\"S\":\"\$$PROCESS\"}}"
if [ \$$? -eq 0 ]; then
  # The date call is escaped like the NOW assignment above so that it runs
  # inside the generated script, where TTL is defined
  echo "Renewed or locked the lease for \$$PROCESS until \$$(date -d @\$$TTL)"
else
  echo "Could not acquire or renew the lease, locked by other process"
  exit 255
fi
EOF
chmod 755 /usr/local/bin/teleport-lock

# Install a service that rotates teleport join tokens.
# Teleport join tokens are temporary authentication tokens
# that let nodes and proxies join the cluster. Notice that timer
Expand All @@ -115,13 +149,21 @@ cat >/usr/local/bin/teleport-ssm-publish-tokens <<EOF
set -e
set -o pipefail
# Proxy token authenticates proxies joining the cluster
PROXY_TOKEN=\$$(uuid)
tctl nodes add --roles=proxy --ttl=2h --token=\$${PROXY_TOKEN}
tctl nodes add --roles=proxy --ttl=4h --token=\$${PROXY_TOKEN}
aws ssm put-parameter --name /teleport/$${CLUSTER_NAME}/tokens/proxy --region $${EC2_REGION} --type="SecureString" --value="\$${PROXY_TOKEN}" --overwrite
# Node token authenticates nodes joining the cluster
NODE_TOKEN=\$$(uuid)
tctl nodes add --roles=node --ttl=2h --token=\$${NODE_TOKEN}
tctl nodes add --roles=node --ttl=4h --token=\$${NODE_TOKEN}
aws ssm put-parameter --name /teleport/$${CLUSTER_NAME}/tokens/node --region $${EC2_REGION} --type="SecureString" --value="\$${NODE_TOKEN}" --overwrite
# Export CA certificate to SSM parameter store
# so nodes and proxies can check the identity of the auth server they are connecting to
CERT=\$$(tctl auth export --type=tls)
aws ssm put-parameter --name /teleport/$${CLUSTER_NAME}/ca --region $${EC2_REGION} --type="String" --value="\$${CERT}" --overwrite
EOF
chmod 755 /usr/local/bin/teleport-ssm-publish-tokens

Expand All @@ -132,6 +174,7 @@ Description=Service rotating teleport tokens
[Service]
Type=oneshot
ExecStartPre=/usr/local/bin/teleport-lock
ExecStart=/usr/local/bin/teleport-ssm-publish-tokens
EOF

Expand Down Expand Up @@ -190,6 +233,7 @@ Description=Service getting teleport certificates
[Service]
Type=oneshot
ExecStartPre=/usr/local/bin/teleport-lock
ExecStart=/usr/local/bin/teleport-get-cert
EOF

Expand Down
3 changes: 2 additions & 1 deletion examples/aws/terraform/auth_asg.tf
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
// write certificates to encrypted S3 bucket.
resource "aws_autoscaling_group" "auth" {
name = "${var.cluster_name}-auth"
max_size = 2
max_size = 5
min_size = 1
health_check_grace_period = 300
health_check_type = "EC2"
Expand Down Expand Up @@ -33,6 +33,7 @@ data "template_file" "auth_user_data" {
template = "${file("auth-user-data.tpl")}"

vars {
locks_table_name = "${aws_dynamodb_table.locks.name}"
cluster_name = "${var.cluster_name}"
efs_mount_point = "${aws_efs_file_system.auth.id}.efs.${var.region}.amazonaws.com"
teleport_version = "${var.teleport_version}"
Expand Down
22 changes: 21 additions & 1 deletion examples/aws/terraform/auth_iam.tf
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ resource "aws_iam_role_policy" "auth_dynamo" {
"Version": "2012-10-17",
"Statement": [
{
"Sid": "AllAPIActionsOnBooks",
"Sid": "AllActionsOnTeleportDB",
"Effect": "Allow",
"Action": "dynamodb:*",
"Resource": "arn:aws:dynamodb:${data.aws_region.current.name}:${data.aws_caller_identity.current.account_id}:table/${aws_dynamodb_table.teleport.name}"
Expand All @@ -78,6 +78,26 @@ resource "aws_iam_role_policy" "auth_dynamo" {
EOF
}

// Allow auth servers to update locks.
// Inline policy attached to the auth server IAM role that allows every
// DynamoDB action ("dynamodb:*"), scoped to the locks table only.
resource "aws_iam_role_policy" "auth_locks" {
// Policy name is prefixed with the cluster name
name = "${var.cluster_name}-auth-locks"
// Attach the policy to the auth server role
role = "${aws_iam_role.auth.id}"

// The resource ARN is assembled from the current region, the caller's
// account id and the name of the locks table
policy = <<EOF
{
"Version": "2012-10-17",
"Statement": [
{
"Sid": "AllActionsOnLocks",
"Effect": "Allow",
"Action": "dynamodb:*",
"Resource": "arn:aws:dynamodb:${data.aws_region.current.name}:${data.aws_caller_identity.current.account_id}:table/${aws_dynamodb_table.locks.name}"
}
]
}
EOF
}

// S3 is used for letsencrypt: auth servers request certificates from letsencrypt
// and publish them to an encrypted S3 bucket. SSM is not used, because certificates and private keys
// are too big for SSM.
Expand Down
4 changes: 4 additions & 0 deletions examples/aws/terraform/dynamo.tf
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,10 @@ resource "aws_dynamodb_table" "teleport" {
hash_key = "HashKey"
range_key = "FullPath"

lifecycle {
ignore_changes = ["read_capacity", "write_capacity"]
}

attribute {
name = "HashKey"
type = "S"
Expand Down
30 changes: 30 additions & 0 deletions examples/aws/terraform/locks.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
// Locks is a DynamoDB table used as a distributed lock
// to make sure there is only one auth server doing
// letsencrypt certificate renewal. This is not critical for teleport
// and is purely for demonstration purposes.
//
// The table is keyed by "Lock" alone. The teleport-lock script performs a
// conditional put-item whose condition inspects the "Expires" and "Process"
// attributes of the existing item; DynamoDB evaluates that condition against
// the item with the same primary key. With "Process" as a range key every
// server would write its own item and the condition could never reject
// another server, so "Process" must be a plain (non-key) attribute.
resource "aws_dynamodb_table" "locks" {
  name           = "${var.cluster_name}-locks"
  read_capacity  = 10
  write_capacity = 10
  hash_key       = "Lock"

  // Only key attributes are declared here; "Process" and "Expires" are
  // regular item attributes written by the lock script.
  attribute {
    name = "Lock"
    type = "S"
  }

  // NOTE(review): DynamoDB TTL only acts on Number attributes, while the lock
  // script writes "Expires" as a String, so automatic expiry is presumably
  // inactive; the lock script itself checks "Expires" in its condition
  // expression - confirm.
  ttl {
    attribute_name = "Expires"
    enabled        = true
  }

  tags {
    TeleportCluster = "${var.cluster_name}"
  }
}
5 changes: 4 additions & 1 deletion examples/aws/terraform/node-user-data.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,11 @@ cat >/usr/local/bin/teleport-ssm-get-token <<EOF
set -e
set -o pipefail
aws ssm get-parameter --with-decryption --name /teleport/${cluster_name}/tokens/node --region ${region} --query Parameter.Value --output text | xargs echo -n > /var/lib/teleport/token
# Fetch token published by Auth server to SSM parameter store to join the cluster
aws ssm get-parameter --with-decryption --name /teleport/${cluster_name}/tokens/node --region ${region} --query Parameter.Value --output text > /var/lib/teleport/token
# Fetch Auth server CA certificate to validate the identity of the auth server
aws ssm get-parameter --name /teleport/${cluster_name}/ca --region=${region} --query=Parameter.Value --output text > /var/lib/teleport/ca.cert
EOF
chmod 755 /usr/local/bin/teleport-ssm-get-token

Expand Down
12 changes: 10 additions & 2 deletions examples/aws/terraform/node_iam.tf
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ EOF
// Note that nodes are only allowed to read node SSM path.
resource "aws_iam_instance_profile" "node" {
name = "${var.cluster_name}-node"
role = "${aws_iam_role.auth.name}"
role = "${aws_iam_role.node.name}"
depends_on = ["aws_iam_role_policy.node_ssm"]
}

Expand All @@ -36,13 +36,21 @@ resource "aws_iam_role_policy" "node_ssm" {
{
"Effect": "Allow",
"Action": [
"ssm:DescribeParameters",
"ssm:GetParameters",
"ssm:GetParametersByPath",
"ssm:GetParameter"
],
"Resource": "arn:aws:ssm:${data.aws_region.current.name}:${data.aws_caller_identity.current.account_id}:parameter/teleport/${var.cluster_name}/tokens/node"
},
{
"Effect": "Allow",
"Action": [
"ssm:GetParameters",
"ssm:GetParametersByPath",
"ssm:GetParameter"
],
"Resource": "arn:aws:ssm:${data.aws_region.current.name}:${data.aws_caller_identity.current.account_id}:parameter/teleport/${var.cluster_name}/ca"
},
{
"Effect":"Allow",
"Action":[
Expand Down
1 change: 1 addition & 0 deletions examples/aws/terraform/provider.tf
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,5 @@ variable "aws_max_retries" {

// AWS provider configuration: the provider plugin version is constrained
// to "~> 1.7" and the provider operates in the region given by var.region.
provider "aws" {
version = "~> 1.7"
region = "${var.region}"
}
6 changes: 5 additions & 1 deletion examples/aws/terraform/proxy-user-data.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,11 @@ cat >/usr/local/bin/teleport-ssm-get-token <<EOF
set -e
set -o pipefail
aws ssm get-parameter --with-decryption --name /teleport/${cluster_name}/tokens/proxy --region ${region} --query Parameter.Value --output text | xargs echo -n > /var/lib/teleport/token
# Fetch token published by Auth server to SSM parameter store to join the cluster
aws ssm get-parameter --with-decryption --name /teleport/${cluster_name}/tokens/proxy --region ${region} --query Parameter.Value --output text > /var/lib/teleport/token
# Fetch Auth server CA certificate to validate the identity of the auth server
aws ssm get-parameter --name /teleport/${cluster_name}/ca --region=${region} --query=Parameter.Value --output text > /var/lib/teleport/ca.cert
EOF
chmod 755 /usr/local/bin/teleport-ssm-get-token
Expand Down
2 changes: 1 addition & 1 deletion examples/aws/terraform/proxy_asg.tf
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
// accepting traffic from the internet.
resource "aws_autoscaling_group" "proxy" {
name = "${var.cluster_name}-proxy"
max_size = 2
max_size = 5
min_size = 1
health_check_grace_period = 300
health_check_type = "EC2"
Expand Down
12 changes: 10 additions & 2 deletions examples/aws/terraform/proxy_iam.tf
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ EOF
// only on auth servers.
resource "aws_iam_instance_profile" "proxy" {
name = "${var.cluster_name}-proxy"
role = "${aws_iam_role.auth.name}"
role = "${aws_iam_role.proxy.name}"
depends_on = ["aws_iam_role_policy.proxy_ssm"]
}

Expand Down Expand Up @@ -64,13 +64,21 @@ resource "aws_iam_role_policy" "proxy_ssm" {
{
"Effect": "Allow",
"Action": [
"ssm:DescribeParameters",
"ssm:GetParameters",
"ssm:GetParametersByPath",
"ssm:GetParameter"
],
"Resource": "arn:aws:ssm:${data.aws_region.current.name}:${data.aws_caller_identity.current.account_id}:parameter/teleport/${var.cluster_name}/tokens/proxy"
},
{
"Effect": "Allow",
"Action": [
"ssm:GetParameters",
"ssm:GetParametersByPath",
"ssm:GetParameter"
],
"Resource": "arn:aws:ssm:${data.aws_region.current.name}:${data.aws_caller_identity.current.account_id}:parameter/teleport/${var.cluster_name}/ca"
},
{
"Effect":"Allow",
"Action":[
Expand Down
4 changes: 0 additions & 4 deletions examples/aws/terraform/vars.tf
Original file line number Diff line number Diff line change
@@ -1,7 +1,3 @@
provider "aws" {
region = "${var.region}"
}

// Region is AWS region, the region should support EFS
variable "region" {
type = "string"
Expand Down
Loading

0 comments on commit f60fba4

Please sign in to comment.