From b1371b111079d60db07e18caf8b5cf2eaf8dbe42 Mon Sep 17 00:00:00 2001
From: Jacob Oaks <joaks@uber.com>
Date: Tue, 21 May 2024 10:01:41 -0400
Subject: [PATCH] fxtest: Enforceable Timeouts on Lifecycle

In Fx, if starting/stopping the application takes longer than the context's timeout,
the fx.App will bail early, regardless of the status of the currently running hooks.

This prevents stalling an application when hooks (accidentally) block forever.

In order to test hook behavior, Fx provides fxtest.Lifecycle to interact with.
This Lifecycle is a simple wrapper around the actual fx Lifecycle type,
meaning it does check for timeout and bail early like fx.App does.
This is an issue because:
* It allows for long-running hooks (which should be considered bugs) to pass tests.
* It allows for tests to completely stall for hooks that block forever.

This PR adds an option that can be passed to `fxtest.NewLifecycle`
to cause it to immediately fail when context expires, similar to fx.App.

Implementing in this way allows it to be a non-breaking change.

This PR doesn't provide a way to test timeouts using `RequireStart` and
`RequireStop`. However, since those don't take contexts anyways,
my assumption is that usage of those APIs represents an intentional decision
to not care about timeouts by the test writer.

However, if people feel differently, we can instead expose two new APIs
`RequireStartWithContext(ctx)` and `RequireStopWithContext(ctx)` or something
(names obviously up for discussion).
---
 fxtest/lifecycle.go      | 62 +++++++++++++++++++++++++++--
 fxtest/lifecycle_test.go | 84 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 142 insertions(+), 4 deletions(-)

diff --git a/fxtest/lifecycle.go b/fxtest/lifecycle.go
index 111840727..ef7a21070 100644
--- a/fxtest/lifecycle.go
+++ b/fxtest/lifecycle.go
@@ -66,18 +66,45 @@ func (t *panicT) FailNow() {
 	panic("test lifecycle failed")
 }
 
+// LifecycleOption modifies the behavior of the [Lifecycle]
+// when passed to [NewLifecycle].
+type LifecycleOption interface {
+	apply(*Lifecycle)
+}
+
+// EnforceTimeout will cause the [Lifecycle]'s Start and Stop methods
+// to return an error as soon as context expires,
+// regardless of whether specific hooks respect the timeout.
+func EnforceTimeout(enforce bool) LifecycleOption {
+	return &enforceTimeout{
+		enforce: enforce,
+	}
+}
+
+type enforceTimeout struct {
+	enforce bool
+}
+
+func (e *enforceTimeout) apply(lc *Lifecycle) {
+	lc.enforceTimeout = e.enforce
+}
+
+var _ LifecycleOption = (*enforceTimeout)(nil)
+
 // Lifecycle is a testing spy for fx.Lifecycle. It exposes Start and Stop
 // methods (and some test-specific helpers) so that unit tests can exercise
 // hooks.
 type Lifecycle struct {
 	t  TB
 	lc *lifecycle.Lifecycle
+
+	enforceTimeout bool
 }
 
 var _ fx.Lifecycle = (*Lifecycle)(nil)
 
 // NewLifecycle creates a new test lifecycle.
-func NewLifecycle(t TB) *Lifecycle {
+func NewLifecycle(t TB, opts ...LifecycleOption) *Lifecycle {
 	var w io.Writer
 	if t != nil {
 		w = testutil.WriteSyncer{T: t}
@@ -85,15 +112,40 @@ func NewLifecycle(t TB) *Lifecycle {
 		w = os.Stderr
 		t = &panicT{W: os.Stderr}
 	}
-	return &Lifecycle{
+	lc := &Lifecycle{
 		lc: lifecycle.New(fxlog.DefaultLogger(w), fxclock.System),
 		t:  t,
 	}
+	for _, opt := range opts {
+		opt.apply(lc)
+	}
+	return lc
+}
+
+func (l *Lifecycle) withTimeout(ctx context.Context, fn func(context.Context) error) error {
+	if !l.enforceTimeout {
+		return fn(ctx)
+	}
+
+	c := make(chan error, 1)
+	go func() {
+		c <- fn(ctx)
+	}()
+
+	var err error
+	select {
+	case err = <-c:
+	case <-ctx.Done():
+		err = ctx.Err()
+	}
+	return err
 }
 
 // Start executes all registered OnStart hooks in order, halting at the first
 // hook that doesn't succeed.
-func (l *Lifecycle) Start(ctx context.Context) error { return l.lc.Start(ctx) }
+func (l *Lifecycle) Start(ctx context.Context) error {
+	return l.withTimeout(ctx, l.lc.Start)
+}
 
 // RequireStart calls Start with context.Background(), failing the test if an
 // error is encountered.
@@ -114,7 +166,9 @@ func (l *Lifecycle) RequireStart() *Lifecycle {
 // If any hook returns an error, execution continues for a best-effort
 // cleanup. Any errors encountered are collected into a single error and
 // returned.
-func (l *Lifecycle) Stop(ctx context.Context) error { return l.lc.Stop(ctx) }
+func (l *Lifecycle) Stop(ctx context.Context) error {
+	return l.withTimeout(ctx, l.lc.Stop)
+}
 
 // RequireStop calls Stop with context.Background(), failing the test if an error
 // is encountered.
diff --git a/fxtest/lifecycle_test.go b/fxtest/lifecycle_test.go
index a5746b291..2312589ff 100644
--- a/fxtest/lifecycle_test.go
+++ b/fxtest/lifecycle_test.go
@@ -26,6 +26,7 @@ import (
 	"errors"
 	"fmt"
 	"testing"
+	"time"
 
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
@@ -117,6 +118,89 @@ func TestLifecycle(t *testing.T) {
 	})
 }
 
+func TestEnforceTimeout(t *testing.T) {
+	// These tests directly call Start and Stop
+	// rather than RequireStart and RequireStop
+	// because EnforceTimeout does not apply to those.
+
+	t.Run("StartHookTimeout", func(t *testing.T) {
+		t.Parallel()
+
+		wait := make(chan struct{}, 1)
+		defer close(wait)
+
+		spy := newTB()
+		lc := NewLifecycle(spy, EnforceTimeout(true))
+		lc.Append(fx.Hook{
+			OnStart: func(context.Context) error {
+				<-wait
+				return nil
+			},
+			OnStop: func(context.Context) error {
+				return nil
+			},
+		})
+
+		ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
+		defer cancel()
+		assert.ErrorIs(t, lc.Start(ctx), context.DeadlineExceeded)
+	})
+
+	t.Run("StopHookTimeout", func(t *testing.T) {
+		t.Parallel()
+
+		wait := make(chan struct{}, 1)
+		defer close(wait)
+
+		spy := newTB()
+		lc := NewLifecycle(spy, EnforceTimeout(true))
+		lc.Append(fx.Hook{
+			OnStart: func(context.Context) error {
+				return nil
+			},
+			OnStop: func(context.Context) error {
+				<-wait
+				return nil
+			},
+		})
+
+		require.NoError(t, lc.Start(context.Background()))
+		ctx, cancel := context.WithTimeout(context.Background(), 100*time.Millisecond)
+		defer cancel()
+		assert.ErrorIs(t, lc.Stop(ctx), context.DeadlineExceeded)
+	})
+
+	t.Run("NoTimeout", func(t *testing.T) {
+		t.Parallel()
+
+		var (
+			started bool
+			stopped bool
+		)
+
+		spy := newTB()
+		lc := NewLifecycle(spy, EnforceTimeout(true))
+		lc.Append(fx.Hook{
+			OnStart: func(context.Context) error {
+				started = true
+				return nil
+			},
+			OnStop: func(context.Context) error {
+				stopped = true
+				return nil
+			},
+		})
+
+		ctx, cancel := context.WithTimeout(context.Background(), time.Hour)
+		defer cancel()
+		require.NoError(t, lc.Start(ctx))
+		require.NoError(t, lc.Stop(ctx))
+		assert.Zero(t, spy.failures)
+		assert.True(t, started)
+		assert.True(t, stopped)
+	})
+}
+
 func TestLifecycle_OptionalT(t *testing.T) {
 	t.Parallel()