Skip to content

Commit

Permalink
Get rid of recursion for tail retries (cloudtools#663)
Browse files Browse the repository at this point in the history
* Get rid of recursion for tail retries

Also, extend the way we retry/timeout, that should work around cloudtools#515.

Not sure of a great way to test this unfortunately.

* Add some tests
  • Loading branch information
phobologic authored Oct 13, 2018
1 parent 5061480 commit 3038512
Show file tree
Hide file tree
Showing 2 changed files with 133 additions and 30 deletions.
59 changes: 35 additions & 24 deletions stacker/providers/aws/default.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,14 @@
# https://github.com/boto/botocore/blob/1.6.1/botocore/data/_retry.json#L97-L121
MAX_ATTEMPTS = 10

MAX_TAIL_RETRIES = 5
# Updated this to 15 retries with a 1 second sleep between retries. This is
# only used when a call to `get_events` fails due to the stack not being
# found. This is often the case because Cloudformation is taking too long
# to create the stack. 15 seconds should, hopefully, be plenty of time for
# the stack to start showing up in the API.
MAX_TAIL_RETRIES = 15
TAIL_RETRY_SLEEP = 1
GET_EVENTS_SLEEP = 1
DEFAULT_CAPABILITIES = ["CAPABILITY_NAMED_IAM", ]


Expand Down Expand Up @@ -581,56 +588,60 @@ def is_stack_rolling_back(self, stack, **kwargs):
def is_stack_failed(self, stack, **kwargs):
return self.get_stack_status(stack) in self.FAILED_STATUSES

def tail_stack(self, stack, cancel, retries=0, **kwargs):
def log_func(e):
def tail_stack(self, stack, cancel, log_func=None, **kwargs):
def _log_func(e):
event_args = [e['ResourceStatus'], e['ResourceType'],
e.get('ResourceStatusReason', None)]
# filter out any values that are empty
event_args = [arg for arg in event_args if arg]
template = " ".join(["[%s]"] + ["%s" for _ in event_args])
logger.info(template, *([stack.fqn] + event_args))

if not retries:
logger.info("Tailing stack: %s", stack.fqn)
log_func = log_func or _log_func

try:
self.tail(stack.fqn,
cancel=cancel,
log_func=log_func,
include_initial=False)
except botocore.exceptions.ClientError as e:
if "does not exist" in str(e) and retries < MAX_TAIL_RETRIES:
# stack might be in the process of launching, wait for a second
# and try again
time.sleep(1)
self.tail_stack(stack, cancel, retries=retries + 1, **kwargs)
else:
raise
logger.info("Tailing stack: %s", stack.fqn)

attempts = 0
while True:
attempts += 1
try:
self.tail(stack.fqn, cancel=cancel, log_func=log_func,
include_initial=False)
break
except botocore.exceptions.ClientError as e:
if "does not exist" in str(e) and attempts < MAX_TAIL_RETRIES:
# stack might be in the process of launching, wait for a
# second and try again
if cancel.wait(TAIL_RETRY_SLEEP):
return
continue
else:
raise

@staticmethod
def _tail_print(e):
print("%s %s %s" % (e['ResourceStatus'],
e['ResourceType'],
e['EventId']))

def get_events(self, stackname):
def get_events(self, stack_name):
"""Get the events in batches and return in chronological order"""
next_token = None
event_list = []
while 1:
while True:
if next_token is not None:
events = self.cloudformation.describe_stack_events(
StackName=stackname, NextToken=next_token
StackName=stack_name, NextToken=next_token
)
else:
events = self.cloudformation.describe_stack_events(
StackName=stackname
StackName=stack_name
)
event_list.append(events['StackEvents'])
next_token = events.get('NextToken', None)
if next_token is None:
break
time.sleep(1)
time.sleep(GET_EVENTS_SLEEP)
return reversed(sum(event_list, []))

def tail(self, stack_name, cancel, log_func=_tail_print, sleep_time=5,
Expand All @@ -646,7 +657,7 @@ def tail(self, stack_name, cancel, log_func=_tail_print, sleep_time=5,
seen.add(e['EventId'])

# Now keep looping through and dump the new events
while 1:
while True:
events = self.get_events(stack_name)
for e in events:
if e['EventId'] not in seen:
Expand Down
104 changes: 98 additions & 6 deletions stacker/tests/providers/aws/test_default.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,19 +6,24 @@
from datetime import datetime
import random
import string
import threading
import unittest

from mock import patch
from mock import patch, MagicMock
from botocore.stub import Stubber
from botocore.exceptions import ClientError, UnStubbedResponseError
import boto3

from ....actions.diff import DictValue
from stacker.actions.diff import DictValue

from ....providers.base import Template
from ....session_cache import get_session
from stacker.providers.base import Template
from stacker.session_cache import get_session

from ....providers.aws.default import (
from stacker.providers.aws import default

from stacker.providers.aws.default import (
DEFAULT_CAPABILITIES,
MAX_TAIL_RETRIES,
Provider,
requires_replacement,
ask_for_approval,
Expand All @@ -28,7 +33,9 @@
generate_cloudformation_args,
)

from .... import exceptions
from stacker import exceptions

from stacker.stack import Stack


def random_string(length=12):
Expand Down Expand Up @@ -558,6 +565,91 @@ def test_noninteractive_changeset_update_with_stack_policy(self):
parameters=[], stack_policy=Template(body="{}"), tags=[],
)

def test_tail_stack_retry_on_missing_stack(self):
stack_name = "SlowToCreateStack"
stack = MagicMock(spec=Stack)
stack.fqn = "my-namespace-{}".format(stack_name)

default.TAIL_RETRY_SLEEP = .01

# Ensure the stack never appears before we run out of retries
for i in range(MAX_TAIL_RETRIES + 5):
self.stubber.add_client_error(
"describe_stack_events",
service_error_code="ValidationError",
service_message="Stack [{}] does not exist".format(stack_name),
http_status_code=400,
response_meta={"attempt": i + 1},
)

with self.stubber:
try:
self.provider.tail_stack(stack, threading.Event())
except ClientError as exc:
self.assertEqual(
exc.response["ResponseMetadata"]["attempt"],
MAX_TAIL_RETRIES
)

def test_tail_stack_retry_on_missing_stack_eventual_success(self):
stack_name = "SlowToCreateStack"
stack = MagicMock(spec=Stack)
stack.fqn = "my-namespace-{}".format(stack_name)

default.TAIL_RETRY_SLEEP = .01
default.GET_EVENTS_SLEEP = .01

rcvd_events = []

def mock_log_func(e):
rcvd_events.append(e)

def valid_event_response(stack, event_id):
return {
"StackEvents": [
{
"StackId": stack.fqn + "12345",
"EventId": event_id,
"StackName": stack.fqn,
"Timestamp": datetime.now()
},
]
}

# Ensure the stack never appears before we run out of retries
for i in range(3):
self.stubber.add_client_error(
"describe_stack_events",
service_error_code="ValidationError",
service_message="Stack [{}] does not exist".format(stack_name),
http_status_code=400,
response_meta={"attempt": i + 1},
)

self.stubber.add_response(
"describe_stack_events",
valid_event_response(stack, "InitialEvents")
)

self.stubber.add_response(
"describe_stack_events",
valid_event_response(stack, "Event1")
)

with self.stubber:
try:
self.provider.tail_stack(stack, threading.Event(),
log_func=mock_log_func)
except UnStubbedResponseError:
# Eventually we run out of responses - could not happen in
# regular execution
# normally this would just be dealt with when the threads were
# shutdown, but doing so here is a little difficult because
# we can't control the `tail_stack` loop
pass

self.assertEqual(rcvd_events[0]["EventId"], "Event1")


class TestProviderInteractiveMode(unittest.TestCase):
def setUp(self):
Expand Down

0 comments on commit 3038512

Please sign in to comment.