This repository has been archived by the owner on Jan 12, 2024. It is now read-only.

Commit

Merge pull request #50 from Nike-Inc/feature/command_to_roll_cms_instances

Feature: Command to Reboot EC2 instances
sdford authored Jun 6, 2017
2 parents 1209662 + c30afb4 commit 1343720
Showing 13 changed files with 529 additions and 36 deletions.
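
For reviewers, a rough usage sketch of the new command (the executable name and global options below are assumptions about typical lifecycle-CLI usage, not something this diff specifies; how the --stack-name value is matched against the StackName enum depends on the CLI's JCommander converter, which is not shown here):

    cerberus <global options> rolling-reboot-with-health-check --stack-name cms
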
2 changes: 1 addition & 1 deletion gradle.properties
@@ -16,4 +16,4 @@

group=com.nike
artifactId=cerberus-lifecycle-cli
version=1.5.0
version=1.6.0
2 changes: 2 additions & 0 deletions src/main/java/com/nike/cerberus/cli/CerberusRunner.java
@@ -30,6 +30,7 @@
import com.nike.cerberus.command.cms.CreateCmsClusterCommand;
import com.nike.cerberus.command.cms.CreateCmsConfigCommand;
import com.nike.cerberus.command.cms.UpdateCmsConfigCommand;
import com.nike.cerberus.command.core.RollingRebootWithHealthCheckCommand;
import com.nike.cerberus.command.core.ViewConfigCommand;
import com.nike.cerberus.command.consul.CreateConsulClusterCommand;
import com.nike.cerberus.command.consul.CreateConsulConfigCommand;
@@ -198,6 +199,7 @@ private void registerAllCommands() {
registerCommand(new RestoreCompleteCerberusDataFromS3BackupCommand());
registerCommand(new ViewConfigCommand());
registerCommand(new UpdateCmsConfigCommand());
registerCommand(new RollingRebootWithHealthCheckCommand());
}

/**
@@ -16,7 +16,6 @@

package com.nike.cerberus.client;

import com.fasterxml.jackson.databind.JsonNode;
import com.nike.vault.client.UrlResolver;
import com.nike.vault.client.VaultAdminClient;
import com.nike.vault.client.VaultClientException;
@@ -26,7 +26,7 @@
import java.util.HashMap;
import java.util.Map;

import static com.nike.cerberus.command.cms.CreateCmsClusterCommand.COMMAND_NAME;
import static com.nike.cerberus.command.cms.UpdateCmsConfigCommand.COMMAND_NAME;

/**
* Command to create the CMS cluster.
40 changes: 40 additions & 0 deletions src/main/java/com/nike/cerberus/command/core/RollingRebootWithHealthCheckCommand.java
@@ -0,0 +1,40 @@
package com.nike.cerberus.command.core;

import com.beust.jcommander.Parameter;
import com.beust.jcommander.Parameters;
import com.nike.cerberus.command.Command;
import com.nike.cerberus.domain.environment.StackName;
import com.nike.cerberus.operation.Operation;
import com.nike.cerberus.operation.core.RollingRebootWithHealthCheckOperation;

import static com.nike.cerberus.command.core.RollingRebootWithHealthCheckCommand.COMMAND_NAME;

/**
* Command to reboot the CMS cluster.
*/
@Parameters(
commandNames = COMMAND_NAME,
commandDescription = "Performs a safe rolling reboot on instances in the given cluster, checking that " +
"the previous instance is healthy before rebooting the next one."
)
public class RollingRebootWithHealthCheckCommand implements Command {

public static final String COMMAND_NAME = "rolling-reboot-with-health-check";

@Parameter(names = {"--stack-name"}, required = true, description = "The stack name to reboot.")
private StackName stackName = StackName.CMS;

public StackName getStackName() {
return stackName;
}

@Override
public String getCommandName() {
return COMMAND_NAME;
}

@Override
public Class<? extends Operation<?>> getOperationClass() {
return RollingRebootWithHealthCheckOperation.class;
}
}
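
The command class above only carries metadata and the parsed --stack-name; the work happens in the operation it points at. A minimal dispatch sketch, not part of this commit (the real wiring lives in CerberusRunner and its Guice injector, and the helper class below is hypothetical):

    import com.google.inject.Injector;
    import com.nike.cerberus.command.Command;
    import com.nike.cerberus.operation.Operation;

    class CommandDispatchSketch {
        // Illustrative only: resolve the operation class declared by the parsed command,
        // check that it can run in the current environment, then run it.
        @SuppressWarnings("unchecked")
        static void dispatch(Injector injector, Command command) {
            Operation<Command> operation =
                    (Operation<Command>) injector.getInstance(command.getOperationClass());
            if (operation.isRunnable(command)) {
                operation.run(command);
            }
        }
    }
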
@@ -20,14 +20,14 @@
import com.beust.jcommander.Parameters;
import com.nike.cerberus.command.Command;
import com.nike.cerberus.operation.Operation;
import com.nike.cerberus.operation.cms.ViewConfigOperation;
import com.nike.cerberus.operation.core.ViewConfigOperation;

import static com.nike.cerberus.command.cms.CreateCmsClusterCommand.COMMAND_NAME;

/**
* Command to create the CMS cluster.
* Command to view configuration files in S3.
*/
@Parameters(commandNames = COMMAND_NAME, commandDescription = "Shows the CMS config.")
@Parameters(commandNames = COMMAND_NAME, commandDescription = "Shows configuration files from S3.")
public class ViewConfigCommand implements Command {

public static final String COMMAND_NAME = "view-config";
229 changes: 229 additions & 0 deletions src/main/java/com/nike/cerberus/operation/core/RollingRebootWithHealthCheckOperation.java
@@ -0,0 +1,229 @@
package com.nike.cerberus.operation.core;

import com.amazonaws.services.ec2.model.Filter;
import com.amazonaws.services.ec2.model.Instance;
import com.github.tomaslanger.chalk.Chalk;
import com.google.common.collect.ImmutableMap;
import com.google.inject.Inject;
import com.nike.cerberus.command.core.RollingRebootWithHealthCheckCommand;
import com.nike.cerberus.domain.environment.StackName;
import com.nike.cerberus.operation.Operation;
import com.nike.cerberus.service.AutoScalingService;
import com.nike.cerberus.service.CloudFormationService;
import com.nike.cerberus.service.Ec2Service;
import com.nike.cerberus.store.ConfigStore;
import com.nike.vault.client.http.HttpStatus;
import okhttp3.Call;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.Response;
import org.apache.http.conn.ssl.NoopHostnameVerifier;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.net.Proxy;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;

import static com.nike.cerberus.service.CloudFormationService.MIN_INSTANCES_STACK_PARAMETER_KEY;
import static com.nike.cerberus.service.Ec2Service.EC2_ASG_GROUP_NAME_TAG_KEY;
import static com.nike.cerberus.service.Ec2Service.INSTANCE_STATE_FILTER_NAME;
import static com.nike.cerberus.service.Ec2Service.INSTANCE_STATE_RUNNING_FILTER_VALUE;

/**
* Reboots all EC2 instances in the given cluster.
*/
public class RollingRebootWithHealthCheckOperation implements Operation<RollingRebootWithHealthCheckCommand> {

private final Logger logger = LoggerFactory.getLogger(getClass());

private final static ImmutableMap<String, String> HEALTH_CHECK_MAP = ImmutableMap.of(
StackName.CMS.getName(), "http://%s:8080/healthcheck",
StackName.GATEWAY.getName(), "https://%s:443/sys/health"
// TODO: Test that command works with remaining stacks
// StackName.VAULT.getName(), "https://%s:8200/v1/sys/health?standbyok",
// StackName.CONSUL.getName(), "https://%s:8500/v1/sys/health"
);

private final static int DEFAULT_HTTP_TIMEOUT = 15;

private final static TimeUnit DEFAULT_HTTP_TIMEOUT_UNIT = TimeUnit.SECONDS;

private final static int NUM_SECS_BETWEEN_HEALTH_CHECKS = 5;

private final static int EXPECTED_NUM_SUCCESSES_AFTER_REBOOT = 10;

private final static int EXPECTED_NUM_FAILURES_AFTER_REBOOT = 3;

private final static int HEALTH_CHECK_FAILED_CODE = -1;

private final ConfigStore configStore;

private final CloudFormationService cloudFormationService;

private final Ec2Service ec2Service;

private final AutoScalingService autoScalingService;

private final Proxy proxy;

@Inject
public RollingRebootWithHealthCheckOperation(final ConfigStore configStore,
final CloudFormationService cloudFormationService,
final Ec2Service ec2Service,
final AutoScalingService autoScalingService,
final Proxy proxy) {
this.configStore = configStore;
this.cloudFormationService = cloudFormationService;
this.ec2Service = ec2Service;
this.autoScalingService = autoScalingService;
this.proxy = proxy;
}

@Override
public void run(final RollingRebootWithHealthCheckCommand command) {

logger.warn(Chalk.on(
"If this command fails: the minimum instance size may need to be increased and an EC2 instance" +
" may need to be set to 'in-service' state on the auto scaling group").yellow().toString());

final StackName stackName = command.getStackName();
final String stackId = configStore.getStackId(stackName);
final Map<String, String> stackOutputs = cloudFormationService.getStackOutputs(stackId);

final Map<String, String> stackParameters = cloudFormationService.getStackParameters(stackId);
final int minInstances = Integer.parseInt(stackParameters.get(MIN_INSTANCES_STACK_PARAMETER_KEY));

final String autoScalingGroupId = stackOutputs.get(CloudFormationService.AUTO_SCALING_GROUP_LOGICAL_ID_OUTPUT_KEY);
logger.debug("Found auto scaling group id for stack: {}", stackId);

final Filter isRunningFilter = new Filter(INSTANCE_STATE_FILTER_NAME).withValues(INSTANCE_STATE_RUNNING_FILTER_VALUE);
final List<Instance> instances = ec2Service.getInstancesByTag(EC2_ASG_GROUP_NAME_TAG_KEY, autoScalingGroupId, isRunningFilter);
logger.debug("Found {} instances by tag: '{}:{}'", instances.size(), EC2_ASG_GROUP_NAME_TAG_KEY, autoScalingGroupId);

logger.info("Temporarily decreasing min instances for ASG: {}", autoScalingGroupId);
autoScalingService.updateMinInstancesForAutoScalingGroup(autoScalingGroupId, minInstances - 1);

instances.forEach(instance -> {
rebootInstance(stackName, autoScalingGroupId, instance);
});

logger.info("Increasing min instances for ASG: {}", autoScalingGroupId);
autoScalingService.updateMinInstancesForAutoScalingGroup(autoScalingGroupId, minInstances);
}

/**
* Reboot an instance and make sure it comes back healthy
*/
private void rebootInstance(StackName stackName, String autoScalingGroupId, Instance instance) {
final String instanceId = instance.getInstanceId();
logger.info("Setting instance state to standby: {}", instanceId);
autoScalingService.setInstanceStateToStandby(autoScalingGroupId, instanceId);

logger.info("Rebooting instance: {}", instanceId);
ec2Service.rebootEc2Instance(instanceId);

final String healthCheckUrlTmpl = HEALTH_CHECK_MAP.get(stackName.getName());
final String healthCheckUrl = String.format(healthCheckUrlTmpl, instance.getPublicDnsName());

// wait for health check fail to confirm box reboot
logger.info("Waiting for health check failure to confirm reboot...");
waitForHealthCheckStatusCode(healthCheckUrl, HEALTH_CHECK_FAILED_CODE, EXPECTED_NUM_FAILURES_AFTER_REBOOT);

// wait for health check pass to confirm instance is healthy after reboot
logger.warn(Chalk.on(
"If a proxy is required to talk to the EC2 instance, then make sure it is set up." +
" Otherwise this command will never succeed.").yellow().toString());
logger.info("Waiting for health check to pass again to confirm instance is healthy...");
waitForHealthCheckStatusCode(healthCheckUrl, HttpStatus.OK, EXPECTED_NUM_SUCCESSES_AFTER_REBOOT);

logger.info("Setting instance state to in-service: {}", instanceId);
autoScalingService.setInstanceStateToInService(autoScalingGroupId, instanceId);
}

/**
* Poll the health check until it returns the expected status code the required number of consecutive times
* @param healthCheckUrl - The health check URL
* @param expectedStatusCode - The status code to wait for
* @param numConsecutiveResponsesExpected - The number of consecutive matching responses required
*/
private void waitForHealthCheckStatusCode(final String healthCheckUrl,
final long expectedStatusCode,
final int numConsecutiveResponsesExpected) {

int responseCode;
int consecutiveResponses = 0;
while (consecutiveResponses < numConsecutiveResponsesExpected) {

responseCode = executeHealthCheck(healthCheckUrl);

if (responseCode == expectedStatusCode) {
consecutiveResponses++;
} else if (consecutiveResponses > 0) {
final String message = Chalk.on("Instance health check did not repeat response code ({}), {} times").red().bold().toString();
logger.debug(message, expectedStatusCode, numConsecutiveResponsesExpected);
consecutiveResponses = 0;
}

try {
TimeUnit.SECONDS.sleep(NUM_SECS_BETWEEN_HEALTH_CHECKS);
} catch (InterruptedException ie) {
logger.error(Chalk.on("Timeout between health checks has been interrupted").red().bold().toString());
return;
}
}
}

/**
* Execute the given health check
* @param healthCheckUrl - The health check URL to call
* @return - Response code of the health check
*/
private int executeHealthCheck(final String healthCheckUrl) {

final OkHttpClient okHttpClient = new OkHttpClient.Builder()
.hostnameVerifier(new NoopHostnameVerifier())
.proxy(proxy)
.connectTimeout(DEFAULT_HTTP_TIMEOUT, DEFAULT_HTTP_TIMEOUT_UNIT)
.writeTimeout(DEFAULT_HTTP_TIMEOUT, DEFAULT_HTTP_TIMEOUT_UNIT)
.readTimeout(DEFAULT_HTTP_TIMEOUT, DEFAULT_HTTP_TIMEOUT_UNIT)
.build();

final Request requestBuilder = new Request.Builder()
.url(healthCheckUrl)
.get()
.build();

final Call healthCheckCall = okHttpClient.newCall(requestBuilder);

try(final Response response = healthCheckCall.execute()) {
logger.debug("Health check returned status: {}, URL: {}", response.code(), healthCheckUrl);
return response.code();
} catch (IOException ioe) {
final String message = Chalk.on("Health check failed, Cause: {}, URL: {}").red().toString();
logger.debug(message, ioe.getMessage(), healthCheckUrl);
}

return HEALTH_CHECK_FAILED_CODE;
}

@Override
public boolean isRunnable(final RollingRebootWithHealthCheckCommand command) {

final StackName stackName = command.getStackName();
final String stackNameStr = stackName.getName();
final String stackId = configStore.getStackId(stackName);
final Map<String, String> stackParameters = cloudFormationService.getStackParameters(stackId);

if (! HEALTH_CHECK_MAP.containsKey(stackNameStr)) {
logger.error("Cannot reboot cluster: {}. Allowed stacks: {}", stackName, HEALTH_CHECK_MAP.keySet());
return false;
} else if (! stackParameters.containsKey(MIN_INSTANCES_STACK_PARAMETER_KEY)) {
logger.error("Could not find parameter 'minInstances' on stack: {}", stackId);
return false;
} else {
return true;
}
}
}
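
A rough timing note derived from the constants in this class: confirming a single instance's reboot takes at least (EXPECTED_NUM_FAILURES_AFTER_REBOOT + EXPECTED_NUM_SUCCESSES_AFTER_REBOOT) × NUM_SECS_BETWEEN_HEALTH_CHECKS = (3 + 10) × 5 = 65 seconds of polling sleeps per instance, on top of the reboot itself and the up-to-15-second HTTP timeouts that each failed health check call can add.
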
@@ -14,7 +14,7 @@
* limitations under the License.
*/

package com.nike.cerberus.operation.cms;
package com.nike.cerberus.operation.core;

import com.nike.cerberus.command.core.ViewConfigCommand;
import com.nike.cerberus.operation.Operation;
@@ -26,7 +26,7 @@
import java.util.Optional;

/**
* Gathers all of the CMS environment configuration and puts it in the config bucket.
* Displays the given configuration file from S3.
*/
public class ViewConfigOperation implements Operation<ViewConfigCommand> {

Loading
