From 623dcd97dc832f3dd2e3d23cae1869ea74b1548f Mon Sep 17 00:00:00 2001 From: Hugo Dutka Date: Thu, 5 Jun 2025 15:54:13 +0200 Subject: [PATCH 01/24] fix(cli): fix flakes related to context cancellation when establishing pg connections (#18246) Since https://github.com/coder/coder/pull/18195 was merged, we started running CLI tests with postgres instead of just dbmem. This surfaced errors related to context cancellation while establishing postgres connections. This PR should fix https://github.com/coder/internal/issues/672. Related to https://github.com/coder/coder/issues/15109. --- cli/clitest/clitest.go | 6 ++++++ cli/server.go | 5 ++++- coderd/database/pubsub/pubsub.go | 15 +++++++-------- 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/cli/clitest/clitest.go b/cli/clitest/clitest.go index fbc913e7b81d3..8d1f5302ce7ba 100644 --- a/cli/clitest/clitest.go +++ b/cli/clitest/clitest.go @@ -168,6 +168,12 @@ func StartWithAssert(t *testing.T, inv *serpent.Invocation, assertCallback func( switch { case errors.Is(err, context.Canceled): return + case err != nil && strings.Contains(err.Error(), "driver: bad connection"): + // When we cancel the context on a query that's being executed within + // a transaction, sometimes, instead of a context.Canceled error we get + // a "driver: bad connection" error. + // https://github.com/lib/pq/issues/1137 + return default: assert.NoError(t, err) } diff --git a/cli/server.go b/cli/server.go index 9f55b63fc765c..d9badd02d9fbf 100644 --- a/cli/server.go +++ b/cli/server.go @@ -2359,6 +2359,10 @@ func ConnectToPostgres(ctx context.Context, logger slog.Logger, driver string, d if err != nil { return nil, xerrors.Errorf("get postgres version: %w", err) } + defer version.Close() + if version.Err() != nil { + return nil, xerrors.Errorf("version select: %w", version.Err()) + } if !version.Next() { return nil, xerrors.Errorf("no rows returned for version select") } @@ -2367,7 +2371,6 @@ func ConnectToPostgres(ctx context.Context, logger slog.Logger, driver string, d if err != nil { return nil, xerrors.Errorf("scan version: %w", err) } - _ = version.Close() if versionNum < 130000 { return nil, xerrors.Errorf("PostgreSQL version must be v13.0.0 or higher! Got: %d", versionNum) diff --git a/coderd/database/pubsub/pubsub.go b/coderd/database/pubsub/pubsub.go index 8019754e15bd9..c4b454abdfbda 100644 --- a/coderd/database/pubsub/pubsub.go +++ b/coderd/database/pubsub/pubsub.go @@ -552,15 +552,14 @@ func (p *PGPubsub) startListener(ctx context.Context, connectURL string) error { sentErrCh = true }), } - select { - case err = <-errCh: - if err != nil { - _ = p.pgListener.Close() - return xerrors.Errorf("create pq listener: %w", err) - } - case <-ctx.Done(): + // We don't respect context cancellation here. There's a bug in the pq library + // where if you close the listener before or while the connection is being + // established, the connection will be established anyway, and will not be + // closed. + // https://github.com/lib/pq/issues/1192 + if err := <-errCh; err != nil { _ = p.pgListener.Close() - return ctx.Err() + return xerrors.Errorf("create pq listener: %w", err) } return nil } From 04e4f2fac0a049595a9065072d10f6adc2525aa4 Mon Sep 17 00:00:00 2001 From: Mathias Fredriksson Date: Thu, 5 Jun 2025 16:58:18 +0300 Subject: [PATCH 02/24] chore(agent): update agent proto client (#18242) --- agent/agent.go | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/agent/agent.go b/agent/agent.go index a971c0e7987b6..74cf305c9434a 100644 --- a/agent/agent.go +++ b/agent/agent.go @@ -456,7 +456,7 @@ func (t *trySingleflight) Do(key string, fn func()) { fn() } -func (a *agent) reportMetadata(ctx context.Context, aAPI proto.DRPCAgentClient24) error { +func (a *agent) reportMetadata(ctx context.Context, aAPI proto.DRPCAgentClient26) error { tickerDone := make(chan struct{}) collectDone := make(chan struct{}) ctx, cancel := context.WithCancel(ctx) @@ -672,7 +672,7 @@ func (a *agent) reportMetadata(ctx context.Context, aAPI proto.DRPCAgentClient24 // reportLifecycle reports the current lifecycle state once. All state // changes are reported in order. -func (a *agent) reportLifecycle(ctx context.Context, aAPI proto.DRPCAgentClient24) error { +func (a *agent) reportLifecycle(ctx context.Context, aAPI proto.DRPCAgentClient26) error { for { select { case <-a.lifecycleUpdate: @@ -752,7 +752,7 @@ func (a *agent) setLifecycle(state codersdk.WorkspaceAgentLifecycle) { } // reportConnectionsLoop reports connections to the agent for auditing. -func (a *agent) reportConnectionsLoop(ctx context.Context, aAPI proto.DRPCAgentClient24) error { +func (a *agent) reportConnectionsLoop(ctx context.Context, aAPI proto.DRPCAgentClient26) error { for { select { case <-a.reportConnectionsUpdate: @@ -872,7 +872,7 @@ func (a *agent) reportConnection(id uuid.UUID, connectionType proto.Connection_T // fetchServiceBannerLoop fetches the service banner on an interval. It will // not be fetched immediately; the expectation is that it is primed elsewhere // (and must be done before the session actually starts). -func (a *agent) fetchServiceBannerLoop(ctx context.Context, aAPI proto.DRPCAgentClient24) error { +func (a *agent) fetchServiceBannerLoop(ctx context.Context, aAPI proto.DRPCAgentClient26) error { ticker := time.NewTicker(a.announcementBannersRefreshInterval) defer ticker.Stop() for { @@ -925,7 +925,7 @@ func (a *agent) run() (retErr error) { connMan := newAPIConnRoutineManager(a.gracefulCtx, a.hardCtx, a.logger, aAPI, tAPI) connMan.startAgentAPI("init notification banners", gracefulShutdownBehaviorStop, - func(ctx context.Context, aAPI proto.DRPCAgentClient24) error { + func(ctx context.Context, aAPI proto.DRPCAgentClient26) error { bannersProto, err := aAPI.GetAnnouncementBanners(ctx, &proto.GetAnnouncementBannersRequest{}) if err != nil { return xerrors.Errorf("fetch service banner: %w", err) @@ -942,7 +942,7 @@ func (a *agent) run() (retErr error) { // sending logs gets gracefulShutdownBehaviorRemain because we want to send logs generated by // shutdown scripts. connMan.startAgentAPI("send logs", gracefulShutdownBehaviorRemain, - func(ctx context.Context, aAPI proto.DRPCAgentClient24) error { + func(ctx context.Context, aAPI proto.DRPCAgentClient26) error { err := a.logSender.SendLoop(ctx, aAPI) if xerrors.Is(err, agentsdk.ErrLogLimitExceeded) { // we don't want this error to tear down the API connection and propagate to the @@ -961,7 +961,7 @@ func (a *agent) run() (retErr error) { connMan.startAgentAPI("report metadata", gracefulShutdownBehaviorStop, a.reportMetadata) // resources monitor can cease as soon as we start gracefully shutting down. - connMan.startAgentAPI("resources monitor", gracefulShutdownBehaviorStop, func(ctx context.Context, aAPI proto.DRPCAgentClient24) error { + connMan.startAgentAPI("resources monitor", gracefulShutdownBehaviorStop, func(ctx context.Context, aAPI proto.DRPCAgentClient26) error { logger := a.logger.Named("resources_monitor") clk := quartz.NewReal() config, err := aAPI.GetResourcesMonitoringConfiguration(ctx, &proto.GetResourcesMonitoringConfigurationRequest{}) @@ -1008,7 +1008,7 @@ func (a *agent) run() (retErr error) { connMan.startAgentAPI("handle manifest", gracefulShutdownBehaviorStop, a.handleManifest(manifestOK)) connMan.startAgentAPI("app health reporter", gracefulShutdownBehaviorStop, - func(ctx context.Context, aAPI proto.DRPCAgentClient24) error { + func(ctx context.Context, aAPI proto.DRPCAgentClient26) error { if err := manifestOK.wait(ctx); err != nil { return xerrors.Errorf("no manifest: %w", err) } @@ -1041,7 +1041,7 @@ func (a *agent) run() (retErr error) { connMan.startAgentAPI("fetch service banner loop", gracefulShutdownBehaviorStop, a.fetchServiceBannerLoop) - connMan.startAgentAPI("stats report loop", gracefulShutdownBehaviorStop, func(ctx context.Context, aAPI proto.DRPCAgentClient24) error { + connMan.startAgentAPI("stats report loop", gracefulShutdownBehaviorStop, func(ctx context.Context, aAPI proto.DRPCAgentClient26) error { if err := networkOK.wait(ctx); err != nil { return xerrors.Errorf("no network: %w", err) } @@ -1056,8 +1056,8 @@ func (a *agent) run() (retErr error) { } // handleManifest returns a function that fetches and processes the manifest -func (a *agent) handleManifest(manifestOK *checkpoint) func(ctx context.Context, aAPI proto.DRPCAgentClient24) error { - return func(ctx context.Context, aAPI proto.DRPCAgentClient24) error { +func (a *agent) handleManifest(manifestOK *checkpoint) func(ctx context.Context, aAPI proto.DRPCAgentClient26) error { + return func(ctx context.Context, aAPI proto.DRPCAgentClient26) error { var ( sentResult = false err error @@ -1187,8 +1187,8 @@ func (a *agent) handleManifest(manifestOK *checkpoint) func(ctx context.Context, // createOrUpdateNetwork waits for the manifest to be set using manifestOK, then creates or updates // the tailnet using the information in the manifest -func (a *agent) createOrUpdateNetwork(manifestOK, networkOK *checkpoint) func(context.Context, proto.DRPCAgentClient24) error { - return func(ctx context.Context, _ proto.DRPCAgentClient24) (retErr error) { +func (a *agent) createOrUpdateNetwork(manifestOK, networkOK *checkpoint) func(context.Context, proto.DRPCAgentClient26) error { + return func(ctx context.Context, _ proto.DRPCAgentClient26) (retErr error) { if err := manifestOK.wait(ctx); err != nil { return xerrors.Errorf("no manifest: %w", err) } @@ -1960,7 +1960,7 @@ const ( type apiConnRoutineManager struct { logger slog.Logger - aAPI proto.DRPCAgentClient24 + aAPI proto.DRPCAgentClient26 tAPI tailnetproto.DRPCTailnetClient24 eg *errgroup.Group stopCtx context.Context @@ -1969,7 +1969,7 @@ type apiConnRoutineManager struct { func newAPIConnRoutineManager( gracefulCtx, hardCtx context.Context, logger slog.Logger, - aAPI proto.DRPCAgentClient24, tAPI tailnetproto.DRPCTailnetClient24, + aAPI proto.DRPCAgentClient26, tAPI tailnetproto.DRPCTailnetClient24, ) *apiConnRoutineManager { // routines that remain in operation during graceful shutdown use the remainCtx. They'll still // exit if the errgroup hits an error, which usually means a problem with the conn. @@ -2002,7 +2002,7 @@ func newAPIConnRoutineManager( // but for Tailnet. func (a *apiConnRoutineManager) startAgentAPI( name string, behavior gracefulShutdownBehavior, - f func(context.Context, proto.DRPCAgentClient24) error, + f func(context.Context, proto.DRPCAgentClient26) error, ) { logger := a.logger.With(slog.F("name", name)) var ctx context.Context From c54ef0357b4e7fe901f4300fd3a5fa18c31f4686 Mon Sep 17 00:00:00 2001 From: "blink-so[bot]" <211532188+blink-so[bot]@users.noreply.github.com> Date: Thu, 5 Jun 2025 14:30:02 +0000 Subject: [PATCH 03/24] docs: update Smarter Device Manager link to new GitHub repository (#18248) The Smarter Device Manager project has moved from GitLab to GitHub. This PR updates the link in the Docker in Workspaces documentation to point to the new repository at https://github.com/smarter-project/smarter-device-manager. **Changes:** - Updated the link from `https://gitlab.com/arm-research/smarter/smarter-device-manager#enabling-access` to `https://github.com/smarter-project/smarter-device-manager` --------- Co-authored-by: blink-so[bot] <211532188+blink-so[bot]@users.noreply.github.com> Co-authored-by: Edward Angert --- .../admin/templates/extending-templates/docker-in-workspaces.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/admin/templates/extending-templates/docker-in-workspaces.md b/docs/admin/templates/extending-templates/docker-in-workspaces.md index 51b1634d20371..073049ba0ecdc 100644 --- a/docs/admin/templates/extending-templates/docker-in-workspaces.md +++ b/docs/admin/templates/extending-templates/docker-in-workspaces.md @@ -200,7 +200,7 @@ Before using Podman, please review the following documentation: - [Shortcomings of Rootless Podman](https://github.com/containers/podman/blob/main/rootless.md#shortcomings-of-rootless-podman) 1. Enable - [smart-device-manager](https://gitlab.com/arm-research/smarter/smarter-device-manager#enabling-access) + [smart-device-manager](https://github.com/smarter-project/smarter-device-manager#enabling-access) to securely expose a FUSE devices to pods. ```shell From 2c0c58af7163ce616f4466701b2335fff7f12f0d Mon Sep 17 00:00:00 2001 From: "blink-so[bot]" <211532188+blink-so[bot]@users.noreply.github.com> Date: Thu, 5 Jun 2025 10:35:17 -0400 Subject: [PATCH 04/24] fix: improve copy for tasks "starting" page (#18247) Update the copy on the task starting page to be more user-friendly: - Change "Building the workspace" to "Starting your workspace" - Change "Your task will run as soon as the workspace is ready" to "This should take a few minutes" The new copy provides clearer expectations about timing and uses more user-friendly language. Fixes #18164 Co-authored-by: blink-so[bot] <211532188+blink-so[bot]@users.noreply.github.com> --- site/src/pages/TaskPage/TaskPage.tsx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/site/src/pages/TaskPage/TaskPage.tsx b/site/src/pages/TaskPage/TaskPage.tsx index 287568852d209..ea32ea5d43f40 100644 --- a/site/src/pages/TaskPage/TaskPage.tsx +++ b/site/src/pages/TaskPage/TaskPage.tsx @@ -92,10 +92,10 @@ const TaskPage = () => {

- Building the workspace + Starting your workspace

- Your task will run as soon as the workspace is ready + This should take a few minutes
From 2f8b056e271df2ac57704585789b13079922174f Mon Sep 17 00:00:00 2001 From: Mathias Fredriksson Date: Thu, 5 Jun 2025 17:53:35 +0300 Subject: [PATCH 05/24] test(agent/agentcontainers): fix test data race due to list manipulation (#18250) Fixes coder/internal#675 --- agent/agentcontainers/api.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/agent/agentcontainers/api.go b/agent/agentcontainers/api.go index 349b85e3d269f..8fff664c0b0f7 100644 --- a/agent/agentcontainers/api.go +++ b/agent/agentcontainers/api.go @@ -378,6 +378,8 @@ func (api *API) updateContainers(ctx context.Context) error { return xerrors.Errorf("list containers failed: %w", err) } + // Clone to avoid test flakes due to data manipulation. + updated.Containers = slices.Clone(updated.Containers) api.mu.Lock() defer api.mu.Unlock() From 3f406c7c50148a2e71aab35b7b8348b6e63f9918 Mon Sep 17 00:00:00 2001 From: Hugo Dutka Date: Thu, 5 Jun 2025 18:58:54 +0200 Subject: [PATCH 06/24] fix: improve sidebar chat loading UX in tasks (#18254) Addresses https://github.com/coder/internal/issues/668 --- site/src/pages/TaskPage/TaskSidebar.tsx | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/site/src/pages/TaskPage/TaskSidebar.tsx b/site/src/pages/TaskPage/TaskSidebar.tsx index 872d64a60cbca..e1d31b8e6b33c 100644 --- a/site/src/pages/TaskPage/TaskSidebar.tsx +++ b/site/src/pages/TaskPage/TaskSidebar.tsx @@ -40,6 +40,8 @@ export const TaskSidebar: FC = ({ task }) => { .flatMap((r) => r.agents) .flatMap((a) => a?.apps) .find((a) => a?.slug === AI_APP_CHAT_SLUG); + const showChatApp = + chatApp && (chatApp.health === "disabled" || chatApp.health === "healthy"); return (