Skip to content

Commit dfe8efc

Browse files
authored
fix: use background context for inmem provisionerd (#11545)
This test case fails with an error log, showing "context canceled" when trying to send an acquired job to an in-mem provisionerd. https://github.com/coder/coder/runs/20331469006 In this case, we don't want to supress this error, since it could mean that we acquired a job, locked it in the database, then failed to send it to a provisioner. (We also don't want to mark the job as failed because we don't know whether the job made it to the provisionerd or not --- in the failed test you can see that the job is actually processed just fine). The reason we got context canceled is because the API was shutting down --- we don't want provisionerdserver to abruptly stop processing job stuff as the API shuts down as this will leave jobs in a bad state. This PR fixes up the use of contexts with provisionerdserver and the associated drpc service calls.
1 parent c125206 commit dfe8efc

File tree

3 files changed

+22
-10
lines changed

3 files changed

+22
-10
lines changed

cli/server.go

+2-2
Original file line numberDiff line numberDiff line change
@@ -1371,10 +1371,10 @@ func newProvisionerDaemon(
13711371
connector[string(database.ProvisionerTypeTerraform)] = sdkproto.NewDRPCProvisionerClient(terraformClient)
13721372
}
13731373

1374-
return provisionerd.New(func(ctx context.Context) (proto.DRPCProvisionerDaemonClient, error) {
1374+
return provisionerd.New(func(dialCtx context.Context) (proto.DRPCProvisionerDaemonClient, error) {
13751375
// This debounces calls to listen every second. Read the comment
13761376
// in provisionerdserver.go to learn more!
1377-
return coderAPI.CreateInMemoryProvisionerDaemon(ctx, name)
1377+
return coderAPI.CreateInMemoryProvisionerDaemon(dialCtx, name)
13781378
}, &provisionerd.Options{
13791379
Logger: logger.Named(fmt.Sprintf("provisionerd-%s", name)),
13801380
UpdateInterval: time.Second,

coderd/coderd.go

+18-6
Original file line numberDiff line numberDiff line change
@@ -1174,7 +1174,7 @@ func compressHandler(h http.Handler) http.Handler {
11741174

11751175
// CreateInMemoryProvisionerDaemon is an in-memory connection to a provisionerd.
11761176
// Useful when starting coderd and provisionerd in the same process.
1177-
func (api *API) CreateInMemoryProvisionerDaemon(ctx context.Context, name string) (client proto.DRPCProvisionerDaemonClient, err error) {
1177+
func (api *API) CreateInMemoryProvisionerDaemon(dialCtx context.Context, name string) (client proto.DRPCProvisionerDaemonClient, err error) {
11781178
tracer := api.TracerProvider.Tracer(tracing.TracerName)
11791179
clientSession, serverSession := drpc.MemTransportPipe()
11801180
defer func() {
@@ -1185,7 +1185,7 @@ func (api *API) CreateInMemoryProvisionerDaemon(ctx context.Context, name string
11851185
}()
11861186

11871187
//nolint:gocritic // in-memory provisioners are owned by system
1188-
daemon, err := api.Database.UpsertProvisionerDaemon(dbauthz.AsSystemRestricted(ctx), database.UpsertProvisionerDaemonParams{
1188+
daemon, err := api.Database.UpsertProvisionerDaemon(dbauthz.AsSystemRestricted(dialCtx), database.UpsertProvisionerDaemonParams{
11891189
Name: name,
11901190
CreatedAt: dbtime.Now(),
11911191
Provisioners: []database.ProvisionerType{
@@ -1201,7 +1201,7 @@ func (api *API) CreateInMemoryProvisionerDaemon(ctx context.Context, name string
12011201
}
12021202

12031203
mux := drpcmux.New()
1204-
api.Logger.Info(ctx, "starting in-memory provisioner daemon", slog.F("name", name))
1204+
api.Logger.Info(dialCtx, "starting in-memory provisioner daemon", slog.F("name", name))
12051205
logger := api.Logger.Named(fmt.Sprintf("inmem-provisionerd-%s", name))
12061206
srv, err := provisionerdserver.NewServer(
12071207
api.ctx, // use the same ctx as the API
@@ -1238,13 +1238,25 @@ func (api *API) CreateInMemoryProvisionerDaemon(ctx context.Context, name string
12381238
if xerrors.Is(err, io.EOF) {
12391239
return
12401240
}
1241-
logger.Debug(ctx, "drpc server error", slog.Error(err))
1241+
logger.Debug(dialCtx, "drpc server error", slog.Error(err))
12421242
},
12431243
},
12441244
)
1245+
// in-mem pipes aren't technically "websockets" but they have the same properties as far as the
1246+
// API is concerned: they are long-lived connections that we need to close before completing
1247+
// shutdown of the API.
1248+
api.WebsocketWaitMutex.Lock()
1249+
api.WebsocketWaitGroup.Add(1)
1250+
api.WebsocketWaitMutex.Unlock()
12451251
go func() {
1246-
err := server.Serve(ctx, serverSession)
1247-
logger.Info(ctx, "provisioner daemon disconnected", slog.Error(err))
1252+
defer api.WebsocketWaitGroup.Done()
1253+
// here we pass the background context, since we want the server to keep serving until the
1254+
// client hangs up. If we, say, pass the API context, then when it is canceled, we could
1255+
// drop a job that we locked in the database but never passed to the provisionerd. The
1256+
// provisionerd is local, in-mem, so there isn't a danger of losing contact with it and
1257+
// having a dead connection we don't know the status of.
1258+
err := server.Serve(context.Background(), serverSession)
1259+
logger.Info(dialCtx, "provisioner daemon disconnected", slog.Error(err))
12481260
// close the sessions, so we don't leak goroutines serving them.
12491261
_ = clientSession.Close()
12501262
_ = serverSession.Close()

coderd/coderdtest/coderdtest.go

+2-2
Original file line numberDiff line numberDiff line change
@@ -532,8 +532,8 @@ func NewProvisionerDaemon(t testing.TB, coderAPI *coderd.API) io.Closer {
532532
assert.NoError(t, err)
533533
}()
534534

535-
daemon := provisionerd.New(func(ctx context.Context) (provisionerdproto.DRPCProvisionerDaemonClient, error) {
536-
return coderAPI.CreateInMemoryProvisionerDaemon(ctx, "test")
535+
daemon := provisionerd.New(func(dialCtx context.Context) (provisionerdproto.DRPCProvisionerDaemonClient, error) {
536+
return coderAPI.CreateInMemoryProvisionerDaemon(dialCtx, "test")
537537
}, &provisionerd.Options{
538538
Logger: coderAPI.Logger.Named("provisionerd").Leveled(slog.LevelDebug),
539539
UpdateInterval: 250 * time.Millisecond,

0 commit comments

Comments
 (0)