diff --git a/agent/agent_test.go b/agent/agent_test.go index 029fbb0f8ea32..6c0feca812e8b 100644 --- a/agent/agent_test.go +++ b/agent/agent_test.go @@ -2188,14 +2188,14 @@ func TestAgent_DevcontainerRecreate(t *testing.T) { t.Logf("Looking for container with label: devcontainer.local_folder=%s", workspaceFolder) - var container docker.APIContainers + var container codersdk.WorkspaceAgentContainer testutil.Eventually(ctx, t, func(context.Context) bool { - containers, err := pool.Client.ListContainers(docker.ListContainersOptions{All: true}) + resp, err := conn.ListContainers(ctx) if err != nil { t.Logf("Error listing containers: %v", err) return false } - for _, c := range containers { + for _, c := range resp.Containers { t.Logf("Found container: %s with labels: %v", c.ID[:12], c.Labels) if v, ok := c.Labels["devcontainer.local_folder"]; ok && v == workspaceFolder { t.Logf("Found matching container: %s", c.ID[:12]) @@ -2205,7 +2205,7 @@ func TestAgent_DevcontainerRecreate(t *testing.T) { } return false }, testutil.IntervalMedium, "no container with workspace folder label found") - defer func(container docker.APIContainers) { + defer func(container codersdk.WorkspaceAgentContainer) { // We can't rely on pool here because the container is not // managed by it (it is managed by @devcontainer/cli). err := pool.Client.RemoveContainer(docker.RemoveContainerOptions{ @@ -2225,7 +2225,7 @@ func TestAgent_DevcontainerRecreate(t *testing.T) { // Invoke recreate to trigger the destruction and recreation of the // devcontainer, we do it in a goroutine so we can process logs // concurrently. - go func(container docker.APIContainers) { + go func(container codersdk.WorkspaceAgentContainer) { err := conn.RecreateDevcontainer(ctx, container.ID) assert.NoError(t, err, "recreate devcontainer should succeed") }(container) @@ -2253,12 +2253,12 @@ waitForOutcomeLoop: // Make sure the container exists and isn't the same as the old one. testutil.Eventually(ctx, t, func(context.Context) bool { - containers, err := pool.Client.ListContainers(docker.ListContainersOptions{All: true}) + resp, err := conn.ListContainers(ctx) if err != nil { t.Logf("Error listing containers: %v", err) return false } - for _, c := range containers { + for _, c := range resp.Containers { t.Logf("Found container: %s with labels: %v", c.ID[:12], c.Labels) if v, ok := c.Labels["devcontainer.local_folder"]; ok && v == workspaceFolder { if c.ID == container.ID { @@ -2272,7 +2272,7 @@ waitForOutcomeLoop: } return false }, testutil.IntervalMedium, "new devcontainer not found") - defer func(container docker.APIContainers) { + defer func(container codersdk.WorkspaceAgentContainer) { // We can't rely on pool here because the container is not // managed by it (it is managed by @devcontainer/cli). err := pool.Client.RemoveContainer(docker.RemoveContainerOptions{ diff --git a/agent/agentcontainers/api.go b/agent/agentcontainers/api.go index 7fd42175db7d4..b28e39ad8c57b 100644 --- a/agent/agentcontainers/api.go +++ b/agent/agentcontainers/api.go @@ -50,13 +50,16 @@ type API struct { mu sync.RWMutex closed bool - containers codersdk.WorkspaceAgentListContainersResponse // Output from the last list operation. - containersErr error // Error from the last list operation. - devcontainerNames map[string]struct{} - knownDevcontainers []codersdk.WorkspaceAgentDevcontainer - configFileModifiedTimes map[string]time.Time - - devcontainerLogSourceIDs map[string]uuid.UUID // Track devcontainer log source IDs. + containers codersdk.WorkspaceAgentListContainersResponse // Output from the last list operation. + containersErr error // Error from the last list operation. + devcontainerNames map[string]bool // By devcontainer name. + knownDevcontainers map[string]codersdk.WorkspaceAgentDevcontainer // By workspace folder. + configFileModifiedTimes map[string]time.Time // By config file path. + recreateSuccessTimes map[string]time.Time // By workspace folder. + recreateErrorTimes map[string]time.Time // By workspace folder. + recreateWg sync.WaitGroup + + devcontainerLogSourceIDs map[string]uuid.UUID // By workspace folder. } // Option is a functional option for API. @@ -101,24 +104,26 @@ func WithDevcontainers(devcontainers []codersdk.WorkspaceAgentDevcontainer, scri if len(devcontainers) == 0 { return } - api.knownDevcontainers = slices.Clone(devcontainers) - api.devcontainerNames = make(map[string]struct{}, len(devcontainers)) + api.knownDevcontainers = make(map[string]codersdk.WorkspaceAgentDevcontainer, len(devcontainers)) + api.devcontainerNames = make(map[string]bool, len(devcontainers)) api.devcontainerLogSourceIDs = make(map[string]uuid.UUID) - for _, devcontainer := range devcontainers { - api.devcontainerNames[devcontainer.Name] = struct{}{} + for _, dc := range devcontainers { + api.knownDevcontainers[dc.WorkspaceFolder] = dc + api.devcontainerNames[dc.Name] = true for _, script := range scripts { // The devcontainer scripts match the devcontainer ID for // identification. - if script.ID == devcontainer.ID { - api.devcontainerLogSourceIDs[devcontainer.WorkspaceFolder] = script.LogSourceID + if script.ID == dc.ID { + api.devcontainerLogSourceIDs[dc.WorkspaceFolder] = script.LogSourceID break } } - if api.devcontainerLogSourceIDs[devcontainer.WorkspaceFolder] == uuid.Nil { + if api.devcontainerLogSourceIDs[dc.WorkspaceFolder] == uuid.Nil { api.logger.Error(api.ctx, "devcontainer log source ID not found for devcontainer", - slog.F("devcontainer", devcontainer.Name), - slog.F("workspace_folder", devcontainer.WorkspaceFolder), - slog.F("config_path", devcontainer.ConfigPath), + slog.F("devcontainer_id", dc.ID), + slog.F("devcontainer_name", dc.Name), + slog.F("workspace_folder", dc.WorkspaceFolder), + slog.F("config_path", dc.ConfigPath), ) } } @@ -169,9 +174,11 @@ func NewAPI(logger slog.Logger, options ...Option) *API { logger: logger, clock: quartz.NewReal(), execer: agentexec.DefaultExecer, - devcontainerNames: make(map[string]struct{}), - knownDevcontainers: []codersdk.WorkspaceAgentDevcontainer{}, + devcontainerNames: make(map[string]bool), + knownDevcontainers: make(map[string]codersdk.WorkspaceAgentDevcontainer), configFileModifiedTimes: make(map[string]time.Time), + recreateSuccessTimes: make(map[string]time.Time), + recreateErrorTimes: make(map[string]time.Time), scriptLogger: func(uuid.UUID) ScriptLogger { return noopScriptLogger{} }, } // The ctx and logger must be set before applying options to avoid @@ -223,7 +230,7 @@ func (api *API) watcherLoop() { continue } - now := api.clock.Now() + now := api.clock.Now("watcherLoop") switch { case event.Has(fsnotify.Create | fsnotify.Write): api.logger.Debug(api.ctx, "devcontainer config file changed", slog.F("file", event.Name)) @@ -386,20 +393,18 @@ func (api *API) updateContainers(ctx context.Context) error { // on the latest list of containers. This method assumes that api.mu is // held. func (api *API) processUpdatedContainersLocked(ctx context.Context, updated codersdk.WorkspaceAgentListContainersResponse) { - dirtyStates := make(map[string]bool) - // Reset all known devcontainers to not running. - for i := range api.knownDevcontainers { - api.knownDevcontainers[i].Running = false - api.knownDevcontainers[i].Container = nil - - // Preserve the dirty state and store in map for lookup. - dirtyStates[api.knownDevcontainers[i].WorkspaceFolder] = api.knownDevcontainers[i].Dirty + // Reset the container links in known devcontainers to detect if + // they still exist. + for _, dc := range api.knownDevcontainers { + dc.Container = nil + api.knownDevcontainers[dc.WorkspaceFolder] = dc } // Check if the container is running and update the known devcontainers. - updatedDevcontainers := make(map[string]bool) for i := range updated.Containers { - container := &updated.Containers[i] + container := &updated.Containers[i] // Grab a reference to the container to allow mutating it. + container.DevcontainerDirty = false // Reset dirty state for the container (updated later). + workspaceFolder := container.Labels[DevcontainerLocalFolderLabel] configFile := container.Labels[DevcontainerConfigFileLabel] @@ -407,89 +412,83 @@ func (api *API) processUpdatedContainersLocked(ctx context.Context, updated code continue } - if lastModified, hasModTime := api.configFileModifiedTimes[configFile]; !hasModTime || container.CreatedAt.Before(lastModified) { - api.logger.Debug(ctx, "container created before config modification, setting dirty state from devcontainer", - slog.F("container", container.ID), - slog.F("created_at", container.CreatedAt), - slog.F("config_modified_at", lastModified), - slog.F("file", configFile), - slog.F("workspace_folder", workspaceFolder), - slog.F("dirty", dirtyStates[workspaceFolder]), - ) - container.DevcontainerDirty = dirtyStates[workspaceFolder] - } - - // Check if this is already in our known list. - if knownIndex := slices.IndexFunc(api.knownDevcontainers, func(dc codersdk.WorkspaceAgentDevcontainer) bool { - return dc.WorkspaceFolder == workspaceFolder - }); knownIndex != -1 { - // Update existing entry with runtime information. - dc := &api.knownDevcontainers[knownIndex] - if configFile != "" && dc.ConfigPath == "" { + if dc, ok := api.knownDevcontainers[workspaceFolder]; ok { + // If no config path is set, this devcontainer was defined + // in Terraform without the optional config file. Assume the + // first container with the workspace folder label is the + // one we want to use. + if dc.ConfigPath == "" && configFile != "" { dc.ConfigPath = configFile if err := api.watcher.Add(configFile); err != nil { api.logger.Error(ctx, "watch devcontainer config file failed", slog.Error(err), slog.F("file", configFile)) } } - dc.Running = container.Running + dc.Container = container - dc.Dirty = container.DevcontainerDirty - updatedDevcontainers[workspaceFolder] = true + api.knownDevcontainers[dc.WorkspaceFolder] = dc continue } // NOTE(mafredri): This name impl. may change to accommodate devcontainer agents RFC. // If not in our known list, add as a runtime detected entry. name := path.Base(workspaceFolder) - if _, ok := api.devcontainerNames[name]; ok { + if api.devcontainerNames[name] { // Try to find a unique name by appending a number. for i := 2; ; i++ { newName := fmt.Sprintf("%s-%d", name, i) - if _, ok := api.devcontainerNames[newName]; !ok { + if !api.devcontainerNames[newName] { name = newName break } } } - api.devcontainerNames[name] = struct{}{} + api.devcontainerNames[name] = true if configFile != "" { if err := api.watcher.Add(configFile); err != nil { api.logger.Error(ctx, "watch devcontainer config file failed", slog.Error(err), slog.F("file", configFile)) } } - api.knownDevcontainers = append(api.knownDevcontainers, codersdk.WorkspaceAgentDevcontainer{ + api.knownDevcontainers[workspaceFolder] = codersdk.WorkspaceAgentDevcontainer{ ID: uuid.New(), Name: name, WorkspaceFolder: workspaceFolder, ConfigPath: configFile, - Running: container.Running, - Dirty: container.DevcontainerDirty, + Status: "", // Updated later based on container state. + Dirty: false, // Updated later based on config file changes. Container: container, - }) - updatedDevcontainers[workspaceFolder] = true + } } - for i := range api.knownDevcontainers { - if _, ok := updatedDevcontainers[api.knownDevcontainers[i].WorkspaceFolder]; ok { - continue - } + // Iterate through all known devcontainers and update their status + // based on the current state of the containers. + for _, dc := range api.knownDevcontainers { + switch { + case dc.Status == codersdk.WorkspaceAgentDevcontainerStatusStarting: + continue // This state is handled by the recreation routine. - dc := &api.knownDevcontainers[i] + case dc.Status == codersdk.WorkspaceAgentDevcontainerStatusError && (dc.Container == nil || dc.Container.CreatedAt.Before(api.recreateErrorTimes[dc.WorkspaceFolder])): + continue // The devcontainer needs to be recreated. - if !dc.Running && !dc.Dirty && dc.Container == nil { - // Already marked as not running, skip. - continue + case dc.Container != nil: + dc.Status = codersdk.WorkspaceAgentDevcontainerStatusStopped + if dc.Container.Running { + dc.Status = codersdk.WorkspaceAgentDevcontainerStatusRunning + } + + dc.Dirty = false + if lastModified, hasModTime := api.configFileModifiedTimes[dc.ConfigPath]; hasModTime && dc.Container.CreatedAt.Before(lastModified) { + dc.Dirty = true + } + dc.Container.DevcontainerDirty = dc.Dirty + + case dc.Container == nil: + dc.Status = codersdk.WorkspaceAgentDevcontainerStatusStopped + dc.Dirty = false } - api.logger.Debug(ctx, "devcontainer is not running anymore, marking as not running", - slog.F("workspace_folder", dc.WorkspaceFolder), - slog.F("config_path", dc.ConfigPath), - slog.F("name", dc.Name), - ) - dc.Running = false - dc.Dirty = false - dc.Container = nil + delete(api.recreateErrorTimes, dc.WorkspaceFolder) + api.knownDevcontainers[dc.WorkspaceFolder] = dc } api.containers = updated @@ -559,9 +558,7 @@ func (api *API) handleDevcontainerRecreate(w http.ResponseWriter, r *http.Reques return } - containerIdx := slices.IndexFunc(containers.Containers, func(c codersdk.WorkspaceAgentContainer) bool { - return c.Match(containerID) - }) + containerIdx := slices.IndexFunc(containers.Containers, func(c codersdk.WorkspaceAgentContainer) bool { return c.Match(containerID) }) if containerIdx == -1 { httpapi.Write(ctx, w, http.StatusNotFound, codersdk.Response{ Message: "Container not found", @@ -584,18 +581,85 @@ func (api *API) handleDevcontainerRecreate(w http.ResponseWriter, r *http.Reques return } + api.mu.Lock() + + dc, ok := api.knownDevcontainers[workspaceFolder] + switch { + case !ok: + api.mu.Unlock() + + // This case should not happen if the container is a valid devcontainer. + api.logger.Error(ctx, "devcontainer not found for workspace folder", slog.F("workspace_folder", workspaceFolder)) + httpapi.Write(ctx, w, http.StatusInternalServerError, codersdk.Response{ + Message: "Devcontainer not found.", + Detail: fmt.Sprintf("Could not find devcontainer for workspace folder: %q", workspaceFolder), + }) + return + case dc.Status == codersdk.WorkspaceAgentDevcontainerStatusStarting: + api.mu.Unlock() + + httpapi.Write(ctx, w, http.StatusConflict, codersdk.Response{ + Message: "Devcontainer recreation already in progress", + Detail: fmt.Sprintf("Recreation for workspace folder %q is already underway.", dc.WorkspaceFolder), + }) + return + } + + // Update the status so that we don't try to recreate the + // devcontainer multiple times in parallel. + dc.Status = codersdk.WorkspaceAgentDevcontainerStatusStarting + api.knownDevcontainers[dc.WorkspaceFolder] = dc + api.recreateWg.Add(1) + go api.recreateDevcontainer(dc, configPath) + + api.mu.Unlock() + + httpapi.Write(ctx, w, http.StatusAccepted, codersdk.Response{ + Message: "Devcontainer recreation initiated", + Detail: fmt.Sprintf("Recreation process for workspace folder %q has started.", dc.WorkspaceFolder), + }) +} + +// recreateDevcontainer should run in its own goroutine and is responsible for +// recreating a devcontainer based on the provided devcontainer configuration. +// It updates the devcontainer status and logs the process. The configPath is +// passed as a parameter for the odd chance that the container being recreated +// has a different config file than the one stored in the devcontainer state. +// The devcontainer state must be set to starting and the recreateWg must be +// incremented before calling this function. +func (api *API) recreateDevcontainer(dc codersdk.WorkspaceAgentDevcontainer, configPath string) { + defer api.recreateWg.Done() + + var ( + err error + ctx = api.ctx + logger = api.logger.With( + slog.F("devcontainer_id", dc.ID), + slog.F("devcontainer_name", dc.Name), + slog.F("workspace_folder", dc.WorkspaceFolder), + slog.F("config_path", configPath), + ) + ) + + if dc.ConfigPath != configPath { + logger.Warn(ctx, "devcontainer config path mismatch", + slog.F("config_path_param", configPath), + ) + } + // Send logs via agent logging facilities. - logSourceID := api.devcontainerLogSourceIDs[workspaceFolder] + logSourceID := api.devcontainerLogSourceIDs[dc.WorkspaceFolder] if logSourceID == uuid.Nil { // Fallback to the external log source ID if not found. logSourceID = agentsdk.ExternalLogSourceID } + scriptLogger := api.scriptLogger(logSourceID) defer func() { flushCtx, cancel := context.WithTimeout(api.ctx, 5*time.Second) defer cancel() if err := scriptLogger.Flush(flushCtx); err != nil { - api.logger.Error(flushCtx, "flush devcontainer logs failed", slog.Error(err)) + logger.Error(flushCtx, "flush devcontainer logs failed during recreation", slog.Error(err)) } }() infoW := agentsdk.LogsWriter(ctx, scriptLogger.Send, logSourceID, codersdk.LogLevelInfo) @@ -603,21 +667,49 @@ func (api *API) handleDevcontainerRecreate(w http.ResponseWriter, r *http.Reques errW := agentsdk.LogsWriter(ctx, scriptLogger.Send, logSourceID, codersdk.LogLevelError) defer errW.Close() - _, err = api.dccli.Up(ctx, workspaceFolder, configPath, WithOutput(infoW, errW), WithRemoveExistingContainer()) + logger.Debug(ctx, "starting devcontainer recreation") + + _, err = api.dccli.Up(ctx, dc.WorkspaceFolder, configPath, WithOutput(infoW, errW), WithRemoveExistingContainer()) if err != nil { - httpapi.Write(ctx, w, http.StatusInternalServerError, codersdk.Response{ - Message: "Could not recreate devcontainer", - Detail: err.Error(), - }) + // No need to log if the API is closing (context canceled), as this + // is expected behavior when the API is shutting down. + if !errors.Is(err, context.Canceled) { + logger.Error(ctx, "devcontainer recreation failed", slog.Error(err)) + } + + api.mu.Lock() + dc = api.knownDevcontainers[dc.WorkspaceFolder] + dc.Status = codersdk.WorkspaceAgentDevcontainerStatusError + api.knownDevcontainers[dc.WorkspaceFolder] = dc + api.recreateErrorTimes[dc.WorkspaceFolder] = api.clock.Now("recreate", "errorTimes") + api.mu.Unlock() return } - // NOTE(mafredri): This won't be needed once recreation is done async. - if err := api.refreshContainers(r.Context()); err != nil { - api.logger.Error(ctx, "failed to trigger immediate refresh after devcontainer recreation", slog.Error(err)) + logger.Info(ctx, "devcontainer recreated successfully") + + api.mu.Lock() + dc = api.knownDevcontainers[dc.WorkspaceFolder] + // Update the devcontainer status to Running or Stopped based on the + // current state of the container, changing the status to !starting + // allows the update routine to update the devcontainer status, but + // to minimize the time between API consistency, we guess the status + // based on the container state. + if dc.Container != nil && dc.Container.Running { + dc.Status = codersdk.WorkspaceAgentDevcontainerStatusRunning + } else { + dc.Status = codersdk.WorkspaceAgentDevcontainerStatusStopped } + dc.Dirty = false + api.recreateSuccessTimes[dc.WorkspaceFolder] = api.clock.Now("recreate", "successTimes") + api.knownDevcontainers[dc.WorkspaceFolder] = dc + api.mu.Unlock() - w.WriteHeader(http.StatusNoContent) + // Ensure an immediate refresh to accurately reflect the + // devcontainer state after recreation. + if err := api.refreshContainers(ctx); err != nil { + logger.Error(ctx, "failed to trigger immediate refresh after devcontainer recreation", slog.Error(err)) + } } // handleDevcontainersList handles the HTTP request to list known devcontainers. @@ -626,7 +718,10 @@ func (api *API) handleDevcontainersList(w http.ResponseWriter, r *http.Request) api.mu.RLock() err := api.containersErr - devcontainers := slices.Clone(api.knownDevcontainers) + devcontainers := make([]codersdk.WorkspaceAgentDevcontainer, 0, len(api.knownDevcontainers)) + for _, dc := range api.knownDevcontainers { + devcontainers = append(devcontainers, dc) + } api.mu.RUnlock() if err != nil { httpapi.Write(ctx, w, http.StatusInternalServerError, codersdk.Response{ @@ -659,16 +754,16 @@ func (api *API) markDevcontainerDirty(configPath string, modifiedAt time.Time) { // Record the timestamp of when this configuration file was modified. api.configFileModifiedTimes[configPath] = modifiedAt - for i := range api.knownDevcontainers { - dc := &api.knownDevcontainers[i] + for _, dc := range api.knownDevcontainers { if dc.ConfigPath != configPath { continue } logger := api.logger.With( - slog.F("file", configPath), - slog.F("name", dc.Name), + slog.F("devcontainer_id", dc.ID), + slog.F("devcontainer_name", dc.Name), slog.F("workspace_folder", dc.WorkspaceFolder), + slog.F("file", configPath), slog.F("modified_at", modifiedAt), ) @@ -683,6 +778,8 @@ func (api *API) markDevcontainerDirty(configPath string, modifiedAt time.Time) { logger.Info(api.ctx, "marking devcontainer container as dirty") dc.Container.DevcontainerDirty = true } + + api.knownDevcontainers[dc.WorkspaceFolder] = dc } } @@ -692,17 +789,21 @@ func (api *API) Close() error { api.mu.Unlock() return nil } - api.closed = true - api.logger.Debug(api.ctx, "closing API") - defer api.logger.Debug(api.ctx, "closed API") + api.closed = true + api.cancel() // Interrupt all routines. + api.mu.Unlock() // Release lock before waiting for goroutines. - api.cancel() + // Close the watcher to ensure its loop finishes. err := api.watcher.Close() - api.mu.Unlock() + // Wait for loops to finish. <-api.watcherDone <-api.updaterDone + // Wait for all devcontainer recreation tasks to complete. + api.recreateWg.Wait() + + api.logger.Debug(api.ctx, "closed API") return err } diff --git a/agent/agentcontainers/api_test.go b/agent/agentcontainers/api_test.go index a687cb8c001f8..826e7a5030f23 100644 --- a/agent/agentcontainers/api_test.go +++ b/agent/agentcontainers/api_test.go @@ -42,11 +42,19 @@ func (f *fakeLister) List(_ context.Context) (codersdk.WorkspaceAgentListContain // fakeDevcontainerCLI implements the agentcontainers.DevcontainerCLI // interface for testing. type fakeDevcontainerCLI struct { - id string - err error + id string + err error + continueUp chan struct{} } -func (f *fakeDevcontainerCLI) Up(_ context.Context, _, _ string, _ ...agentcontainers.DevcontainerCLIUpOptions) (string, error) { +func (f *fakeDevcontainerCLI) Up(ctx context.Context, _, _ string, _ ...agentcontainers.DevcontainerCLIUpOptions) (string, error) { + if f.continueUp != nil { + select { + case <-ctx.Done(): + return "", xerrors.New("test timeout") + case <-f.continueUp: + } + } return f.id, f.err } @@ -302,6 +310,7 @@ func TestAPI(t *testing.T) { validContainer := codersdk.WorkspaceAgentContainer{ ID: "container-id", FriendlyName: "container-name", + Running: true, Labels: map[string]string{ agentcontainers.DevcontainerLocalFolderLabel: "/workspace", agentcontainers.DevcontainerConfigFileLabel: "/workspace/.devcontainer/devcontainer.json", @@ -319,16 +328,16 @@ func TestAPI(t *testing.T) { containerID string lister *fakeLister devcontainerCLI *fakeDevcontainerCLI - wantStatus int - wantBody string + wantStatus []int + wantBody []string }{ { name: "Missing container ID", containerID: "", lister: &fakeLister{}, devcontainerCLI: &fakeDevcontainerCLI{}, - wantStatus: http.StatusBadRequest, - wantBody: "Missing container ID or name", + wantStatus: []int{http.StatusBadRequest}, + wantBody: []string{"Missing container ID or name"}, }, { name: "List error", @@ -337,8 +346,8 @@ func TestAPI(t *testing.T) { err: xerrors.New("list error"), }, devcontainerCLI: &fakeDevcontainerCLI{}, - wantStatus: http.StatusInternalServerError, - wantBody: "Could not list containers", + wantStatus: []int{http.StatusInternalServerError}, + wantBody: []string{"Could not list containers"}, }, { name: "Container not found", @@ -349,8 +358,8 @@ func TestAPI(t *testing.T) { }, }, devcontainerCLI: &fakeDevcontainerCLI{}, - wantStatus: http.StatusNotFound, - wantBody: "Container not found", + wantStatus: []int{http.StatusNotFound}, + wantBody: []string{"Container not found"}, }, { name: "Missing workspace folder label", @@ -361,8 +370,8 @@ func TestAPI(t *testing.T) { }, }, devcontainerCLI: &fakeDevcontainerCLI{}, - wantStatus: http.StatusBadRequest, - wantBody: "Missing workspace folder label", + wantStatus: []int{http.StatusBadRequest}, + wantBody: []string{"Missing workspace folder label"}, }, { name: "Devcontainer CLI error", @@ -375,8 +384,8 @@ func TestAPI(t *testing.T) { devcontainerCLI: &fakeDevcontainerCLI{ err: xerrors.New("devcontainer CLI error"), }, - wantStatus: http.StatusInternalServerError, - wantBody: "Could not recreate devcontainer", + wantStatus: []int{http.StatusAccepted, http.StatusConflict}, + wantBody: []string{"Devcontainer recreation initiated", "Devcontainer recreation already in progress"}, }, { name: "OK", @@ -387,21 +396,33 @@ func TestAPI(t *testing.T) { }, }, devcontainerCLI: &fakeDevcontainerCLI{}, - wantStatus: http.StatusNoContent, - wantBody: "", + wantStatus: []int{http.StatusAccepted, http.StatusConflict}, + wantBody: []string{"Devcontainer recreation initiated", "Devcontainer recreation already in progress"}, }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { t.Parallel() + require.GreaterOrEqual(t, len(tt.wantStatus), 1, "developer error: at least one status code expected") + require.Len(t, tt.wantStatus, len(tt.wantBody), "developer error: status and body length mismatch") + + ctx := testutil.Context(t, testutil.WaitShort) logger := slogtest.Make(t, &slogtest.Options{IgnoreErrors: true}).Leveled(slog.LevelDebug) + mClock := quartz.NewMock(t) + mClock.Set(time.Now()).MustWait(ctx) + tickerTrap := mClock.Trap().TickerFunc("updaterLoop") + nowRecreateErrorTrap := mClock.Trap().Now("recreate", "errorTimes") + nowRecreateSuccessTrap := mClock.Trap().Now("recreate", "successTimes") + + tt.devcontainerCLI.continueUp = make(chan struct{}) // Setup router with the handler under test. r := chi.NewRouter() api := agentcontainers.NewAPI( logger, + agentcontainers.WithClock(mClock), agentcontainers.WithLister(tt.lister), agentcontainers.WithDevcontainerCLI(tt.devcontainerCLI), agentcontainers.WithWatcher(watcher.NewNoop()), @@ -409,21 +430,102 @@ func TestAPI(t *testing.T) { defer api.Close() r.Mount("/", api.Routes()) - ctx := testutil.Context(t, testutil.WaitShort) + // Make sure the ticker function has been registered + // before advancing the clock. + tickerTrap.MustWait(ctx).Release() + tickerTrap.Close() - // Simulate HTTP request to the recreate endpoint. - req := httptest.NewRequest(http.MethodPost, "/devcontainers/container/"+tt.containerID+"/recreate", nil). + for i := range tt.wantStatus { + // Simulate HTTP request to the recreate endpoint. + req := httptest.NewRequest(http.MethodPost, "/devcontainers/container/"+tt.containerID+"/recreate", nil). + WithContext(ctx) + rec := httptest.NewRecorder() + r.ServeHTTP(rec, req) + + // Check the response status code and body. + require.Equal(t, tt.wantStatus[i], rec.Code, "status code mismatch") + if tt.wantBody[i] != "" { + assert.Contains(t, rec.Body.String(), tt.wantBody[i], "response body mismatch") + } + } + + // Error tests are simple, but the remainder of this test is a + // bit more involved, closer to an integration test. That is + // because we must check what state the devcontainer ends up in + // after the recreation process is initiated and finished. + if tt.wantStatus[0] != http.StatusAccepted { + close(tt.devcontainerCLI.continueUp) + nowRecreateSuccessTrap.Close() + nowRecreateErrorTrap.Close() + return + } + + _, aw := mClock.AdvanceNext() + aw.MustWait(ctx) + + // Verify the devcontainer is in starting state after recreation + // request is made. + req := httptest.NewRequest(http.MethodGet, "/devcontainers", nil). WithContext(ctx) rec := httptest.NewRecorder() r.ServeHTTP(rec, req) - // Check the response status code and body. - require.Equal(t, tt.wantStatus, rec.Code, "status code mismatch") - if tt.wantBody != "" { - assert.Contains(t, rec.Body.String(), tt.wantBody, "response body mismatch") - } else if tt.wantStatus == http.StatusNoContent { - assert.Empty(t, rec.Body.String(), "expected empty response body") + require.Equal(t, http.StatusOK, rec.Code, "status code mismatch") + var resp codersdk.WorkspaceAgentDevcontainersResponse + t.Log(rec.Body.String()) + err := json.NewDecoder(rec.Body).Decode(&resp) + require.NoError(t, err, "unmarshal response failed") + require.Len(t, resp.Devcontainers, 1, "expected one devcontainer in response") + assert.Equal(t, codersdk.WorkspaceAgentDevcontainerStatusStarting, resp.Devcontainers[0].Status, "devcontainer is not starting") + + // Allow the devcontainer CLI to continue the up process. + close(tt.devcontainerCLI.continueUp) + + // Ensure the devcontainer ends up in error state if the up call fails. + if tt.devcontainerCLI.err != nil { + nowRecreateSuccessTrap.Close() + // The timestamp for the error will be stored, which gives + // us a good anchor point to know when to do our request. + nowRecreateErrorTrap.MustWait(ctx).Release() + nowRecreateErrorTrap.Close() + + // Advance the clock to run the devcontainer state update routine. + _, aw = mClock.AdvanceNext() + aw.MustWait(ctx) + + req = httptest.NewRequest(http.MethodGet, "/devcontainers", nil). + WithContext(ctx) + rec = httptest.NewRecorder() + r.ServeHTTP(rec, req) + + require.Equal(t, http.StatusOK, rec.Code, "status code mismatch after error") + err = json.NewDecoder(rec.Body).Decode(&resp) + require.NoError(t, err, "unmarshal response failed after error") + require.Len(t, resp.Devcontainers, 1, "expected one devcontainer in response after error") + assert.Equal(t, codersdk.WorkspaceAgentDevcontainerStatusError, resp.Devcontainers[0].Status, "devcontainer is not in an error state after up failure") + return } + + // Ensure the devcontainer ends up in success state. + nowRecreateSuccessTrap.MustWait(ctx).Release() + nowRecreateSuccessTrap.Close() + + // Advance the clock to run the devcontainer state update routine. + _, aw = mClock.AdvanceNext() + aw.MustWait(ctx) + + req = httptest.NewRequest(http.MethodGet, "/devcontainers", nil). + WithContext(ctx) + rec = httptest.NewRecorder() + r.ServeHTTP(rec, req) + + // Check the response status code and body after recreation. + require.Equal(t, http.StatusOK, rec.Code, "status code mismatch after recreation") + t.Log(rec.Body.String()) + err = json.NewDecoder(rec.Body).Decode(&resp) + require.NoError(t, err, "unmarshal response failed after recreation") + require.Len(t, resp.Devcontainers, 1, "expected one devcontainer in response after recreation") + assert.Equal(t, codersdk.WorkspaceAgentDevcontainerStatusRunning, resp.Devcontainers[0].Status, "devcontainer is not stopped after recreation") }) } }) @@ -482,7 +584,7 @@ func TestAPI(t *testing.T) { wantCount: 2, verify: func(t *testing.T, devcontainers []codersdk.WorkspaceAgentDevcontainer) { for _, dc := range devcontainers { - assert.False(t, dc.Running, "devcontainer should not be running") + assert.Equal(t, codersdk.WorkspaceAgentDevcontainerStatusStopped, dc.Status, "devcontainer should be stopped") assert.Nil(t, dc.Container, "devcontainer should not have container reference") } }, @@ -515,7 +617,7 @@ func TestAPI(t *testing.T) { verify: func(t *testing.T, devcontainers []codersdk.WorkspaceAgentDevcontainer) { dc := devcontainers[0] assert.Equal(t, "/workspace/runtime1", dc.WorkspaceFolder) - assert.True(t, dc.Running) + assert.Equal(t, codersdk.WorkspaceAgentDevcontainerStatusRunning, dc.Status) require.NotNil(t, dc.Container) assert.Equal(t, "runtime-container-1", dc.Container.ID) }, @@ -554,9 +656,9 @@ func TestAPI(t *testing.T) { known2 := mustFindDevcontainerByPath(t, devcontainers, "/workspace/known2") runtime1 := mustFindDevcontainerByPath(t, devcontainers, "/workspace/runtime1") - assert.True(t, known1.Running) - assert.False(t, known2.Running) - assert.True(t, runtime1.Running) + assert.Equal(t, codersdk.WorkspaceAgentDevcontainerStatusRunning, known1.Status) + assert.Equal(t, codersdk.WorkspaceAgentDevcontainerStatusStopped, known2.Status) + assert.Equal(t, codersdk.WorkspaceAgentDevcontainerStatusRunning, runtime1.Status) require.NotNil(t, known1.Container) assert.Nil(t, known2.Container) @@ -598,8 +700,8 @@ func TestAPI(t *testing.T) { running := mustFindDevcontainerByPath(t, devcontainers, "/workspace/running") nonRunning := mustFindDevcontainerByPath(t, devcontainers, "/workspace/non-running") - assert.True(t, running.Running) - assert.False(t, nonRunning.Running) + assert.Equal(t, codersdk.WorkspaceAgentDevcontainerStatusRunning, running.Status) + assert.Equal(t, codersdk.WorkspaceAgentDevcontainerStatusStopped, nonRunning.Status) require.NotNil(t, running.Container, "running container should have container reference") require.NotNil(t, nonRunning.Container, "non-running container should have container reference") @@ -637,7 +739,7 @@ func TestAPI(t *testing.T) { } } require.NotNil(t, dc2, "missing devcontainer with ID %s", knownDevcontainerID2) - assert.True(t, dc2.Running) + assert.Equal(t, codersdk.WorkspaceAgentDevcontainerStatusRunning, dc2.Status) assert.NotEmpty(t, dc2.ConfigPath) require.NotNil(t, dc2.Container) assert.Equal(t, "known-container-2", dc2.Container.ID) @@ -780,6 +882,7 @@ func TestAPI(t *testing.T) { Name: "test-devcontainer", WorkspaceFolder: "/home/coder/project", ConfigPath: "/home/coder/project/.devcontainer/devcontainer.json", + Status: codersdk.WorkspaceAgentDevcontainerStatusRunning, // Corrected enum } ctx := testutil.Context(t, testutil.WaitShort) @@ -831,7 +934,7 @@ func TestAPI(t *testing.T) { err := json.NewDecoder(rec.Body).Decode(&resp1) require.NoError(t, err) require.Len(t, resp1.Devcontainers, 1) - require.True(t, resp1.Devcontainers[0].Running, "devcontainer should be running initially") + require.Equal(t, codersdk.WorkspaceAgentDevcontainerStatusRunning, resp1.Devcontainers[0].Status, "devcontainer should be running initially") require.True(t, resp1.Devcontainers[0].Dirty, "devcontainer should be dirty initially") require.NotNil(t, resp1.Devcontainers[0].Container, "devcontainer should have a container initially") @@ -855,7 +958,7 @@ func TestAPI(t *testing.T) { err = json.NewDecoder(rec.Body).Decode(&resp2) require.NoError(t, err) require.Len(t, resp2.Devcontainers, 1) - require.False(t, resp2.Devcontainers[0].Running, "devcontainer should not be running after empty list") + require.Equal(t, codersdk.WorkspaceAgentDevcontainerStatusStopped, resp2.Devcontainers[0].Status, "devcontainer should not be running after empty list") require.False(t, resp2.Devcontainers[0].Dirty, "devcontainer should not be dirty after empty list") require.Nil(t, resp2.Devcontainers[0].Container, "devcontainer should not have a container after empty list") }) @@ -921,6 +1024,7 @@ func TestAPI(t *testing.T) { require.Len(t, response.Devcontainers, 1) assert.False(t, response.Devcontainers[0].Dirty, "devcontainer should not be marked as dirty initially") + assert.Equal(t, codersdk.WorkspaceAgentDevcontainerStatusRunning, response.Devcontainers[0].Status, "devcontainer should be running initially") require.NotNil(t, response.Devcontainers[0].Container, "container should not be nil") assert.False(t, response.Devcontainers[0].Container.DevcontainerDirty, "container should not be marked as dirty initially") @@ -955,6 +1059,7 @@ func TestAPI(t *testing.T) { require.Len(t, response.Devcontainers, 1) assert.True(t, response.Devcontainers[0].Dirty, "container should be marked as dirty after config file was modified") + assert.Equal(t, codersdk.WorkspaceAgentDevcontainerStatusRunning, response.Devcontainers[0].Status, "devcontainer should be running after config file was modified") require.NotNil(t, response.Devcontainers[0].Container, "container should not be nil") assert.True(t, response.Devcontainers[0].Container.DevcontainerDirty, "container should be marked as dirty after config file was modified") @@ -980,6 +1085,7 @@ func TestAPI(t *testing.T) { require.Len(t, response.Devcontainers, 1) assert.False(t, response.Devcontainers[0].Dirty, "dirty flag should be cleared on the devcontainer after container recreation") + assert.Equal(t, codersdk.WorkspaceAgentDevcontainerStatusRunning, response.Devcontainers[0].Status, "devcontainer should be running after recreation") require.NotNil(t, response.Devcontainers[0].Container, "container should not be nil") assert.False(t, response.Devcontainers[0].Container.DevcontainerDirty, "dirty flag should be cleared on the container after container recreation") diff --git a/coderd/apidoc/docs.go b/coderd/apidoc/docs.go index 7cee63e183e7e..b6a00051bba77 100644 --- a/coderd/apidoc/docs.go +++ b/coderd/apidoc/docs.go @@ -8613,6 +8613,9 @@ const docTemplate = `{ "CoderSessionToken": [] } ], + "produces": [ + "application/json" + ], "tags": [ "Agents" ], @@ -8636,8 +8639,11 @@ const docTemplate = `{ } ], "responses": { - "204": { - "description": "No Content" + "202": { + "description": "Accepted", + "schema": { + "$ref": "#/definitions/codersdk.Response" + } } } } diff --git a/coderd/apidoc/swagger.json b/coderd/apidoc/swagger.json index 89a582091496f..e5fdca7025089 100644 --- a/coderd/apidoc/swagger.json +++ b/coderd/apidoc/swagger.json @@ -7612,6 +7612,7 @@ "CoderSessionToken": [] } ], + "produces": ["application/json"], "tags": ["Agents"], "summary": "Recreate devcontainer for workspace agent", "operationId": "recreate-devcontainer-for-workspace-agent", @@ -7633,8 +7634,11 @@ } ], "responses": { - "204": { - "description": "No Content" + "202": { + "description": "Accepted", + "schema": { + "$ref": "#/definitions/codersdk.Response" + } } } } diff --git a/coderd/workspaceagents.go b/coderd/workspaceagents.go index 8b94566e75715..5a8adab6132c5 100644 --- a/coderd/workspaceagents.go +++ b/coderd/workspaceagents.go @@ -898,9 +898,10 @@ func (api *API) workspaceAgentListContainers(rw http.ResponseWriter, r *http.Req // @ID recreate-devcontainer-for-workspace-agent // @Security CoderSessionToken // @Tags Agents +// @Produce json // @Param workspaceagent path string true "Workspace agent ID" format(uuid) // @Param container path string true "Container ID or name" -// @Success 204 +// @Success 202 {object} codersdk.Response // @Router /workspaceagents/{workspaceagent}/containers/devcontainers/container/{container}/recreate [post] func (api *API) workspaceAgentRecreateDevcontainer(rw http.ResponseWriter, r *http.Request) { ctx := r.Context() diff --git a/codersdk/workspaceagents.go b/codersdk/workspaceagents.go index 37048c6c4fcfe..0c5aaddf913da 100644 --- a/codersdk/workspaceagents.go +++ b/codersdk/workspaceagents.go @@ -399,6 +399,17 @@ type WorkspaceAgentDevcontainersResponse struct { Devcontainers []WorkspaceAgentDevcontainer `json:"devcontainers"` } +// WorkspaceAgentDevcontainerStatus is the status of a devcontainer. +type WorkspaceAgentDevcontainerStatus string + +// WorkspaceAgentDevcontainerStatus enums. +const ( + WorkspaceAgentDevcontainerStatusRunning WorkspaceAgentDevcontainerStatus = "running" + WorkspaceAgentDevcontainerStatusStopped WorkspaceAgentDevcontainerStatus = "stopped" + WorkspaceAgentDevcontainerStatusStarting WorkspaceAgentDevcontainerStatus = "starting" + WorkspaceAgentDevcontainerStatusError WorkspaceAgentDevcontainerStatus = "error" +) + // WorkspaceAgentDevcontainer defines the location of a devcontainer // configuration in a workspace that is visible to the workspace agent. type WorkspaceAgentDevcontainer struct { @@ -408,9 +419,9 @@ type WorkspaceAgentDevcontainer struct { ConfigPath string `json:"config_path,omitempty"` // Additional runtime fields. - Running bool `json:"running"` - Dirty bool `json:"dirty"` - Container *WorkspaceAgentContainer `json:"container,omitempty"` + Status WorkspaceAgentDevcontainerStatus `json:"status"` + Dirty bool `json:"dirty"` + Container *WorkspaceAgentContainer `json:"container,omitempty"` } // WorkspaceAgentContainer describes a devcontainer of some sort diff --git a/codersdk/workspacesdk/agentconn.go b/codersdk/workspacesdk/agentconn.go index f3c68d38b5575..c9e9824e2950f 100644 --- a/codersdk/workspacesdk/agentconn.go +++ b/codersdk/workspacesdk/agentconn.go @@ -397,7 +397,7 @@ func (c *AgentConn) RecreateDevcontainer(ctx context.Context, containerIDOrName return xerrors.Errorf("do request: %w", err) } defer res.Body.Close() - if res.StatusCode != http.StatusNoContent { + if res.StatusCode != http.StatusAccepted { return codersdk.ReadBodyAsError(res) } return nil diff --git a/docs/reference/api/agents.md b/docs/reference/api/agents.md index f126fec59978c..fd0cd38d355e0 100644 --- a/docs/reference/api/agents.md +++ b/docs/reference/api/agents.md @@ -821,6 +821,7 @@ To perform this operation, you must be authenticated. [Learn more](authenticatio ```shell # Example request using curl curl -X POST http://coder-server:8080/api/v2/workspaceagents/{workspaceagent}/containers/devcontainers/container/{container}/recreate \ + -H 'Accept: application/json' \ -H 'Coder-Session-Token: API_KEY' ``` @@ -833,11 +834,28 @@ curl -X POST http://coder-server:8080/api/v2/workspaceagents/{workspaceagent}/co | `workspaceagent` | path | string(uuid) | true | Workspace agent ID | | `container` | path | string | true | Container ID or name | +### Example responses + +> 202 Response + +```json +{ + "detail": "string", + "message": "string", + "validations": [ + { + "detail": "string", + "field": "string" + } + ] +} +``` + ### Responses -| Status | Meaning | Description | Schema | -|--------|-----------------------------------------------------------------|-------------|--------| -| 204 | [No Content](https://tools.ietf.org/html/rfc7231#section-6.3.5) | No Content | | +| Status | Meaning | Description | Schema | +|--------|---------------------------------------------------------------|-------------|--------------------------------------------------| +| 202 | [Accepted](https://tools.ietf.org/html/rfc7231#section-6.3.3) | Accepted | [codersdk.Response](schemas.md#codersdkresponse) | To perform this operation, you must be authenticated. [Learn more](authentication.md). diff --git a/enterprise/coderd/prebuilds/metricscollector.go b/enterprise/coderd/prebuilds/metricscollector.go index 7a7734b6f8093..90257c26dd580 100644 --- a/enterprise/coderd/prebuilds/metricscollector.go +++ b/enterprise/coderd/prebuilds/metricscollector.go @@ -27,6 +27,7 @@ const ( MetricDesiredGauge = namespace + "desired" MetricRunningGauge = namespace + "running" MetricEligibleGauge = namespace + "eligible" + MetricPresetHardLimitedGauge = namespace + "preset_hard_limited" MetricLastUpdatedGauge = namespace + "metrics_last_updated" ) @@ -82,6 +83,12 @@ var ( labels, nil, ) + presetHardLimitedDesc = prometheus.NewDesc( + MetricPresetHardLimitedGauge, + "Indicates whether a given preset has reached the hard failure limit (1 = hard-limited). Metric is omitted otherwise.", + labels, + nil, + ) lastUpdateDesc = prometheus.NewDesc( MetricLastUpdatedGauge, "The unix timestamp when the metrics related to prebuilt workspaces were last updated; these metrics are cached.", @@ -104,17 +111,22 @@ type MetricsCollector struct { replacementsCounter map[replacementKey]float64 replacementsCounterMu sync.Mutex + + isPresetHardLimited map[hardLimitedPresetKey]bool + isPresetHardLimitedMu sync.Mutex } var _ prometheus.Collector = new(MetricsCollector) func NewMetricsCollector(db database.Store, logger slog.Logger, snapshotter prebuilds.StateSnapshotter) *MetricsCollector { log := logger.Named("prebuilds_metrics_collector") + return &MetricsCollector{ database: db, logger: log, snapshotter: snapshotter, replacementsCounter: make(map[replacementKey]float64), + isPresetHardLimited: make(map[hardLimitedPresetKey]bool), } } @@ -126,6 +138,7 @@ func (*MetricsCollector) Describe(descCh chan<- *prometheus.Desc) { descCh <- desiredPrebuildsDesc descCh <- runningPrebuildsDesc descCh <- eligiblePrebuildsDesc + descCh <- presetHardLimitedDesc descCh <- lastUpdateDesc } @@ -173,6 +186,17 @@ func (mc *MetricsCollector) Collect(metricsCh chan<- prometheus.Metric) { metricsCh <- prometheus.MustNewConstMetric(eligiblePrebuildsDesc, prometheus.GaugeValue, float64(state.Eligible), preset.TemplateName, preset.Name, preset.OrganizationName) } + mc.isPresetHardLimitedMu.Lock() + for key, isHardLimited := range mc.isPresetHardLimited { + var val float64 + if isHardLimited { + val = 1 + } + + metricsCh <- prometheus.MustNewConstMetric(presetHardLimitedDesc, prometheus.GaugeValue, val, key.templateName, key.presetName, key.orgName) + } + mc.isPresetHardLimitedMu.Unlock() + metricsCh <- prometheus.MustNewConstMetric(lastUpdateDesc, prometheus.GaugeValue, float64(currentState.createdAt.Unix())) } @@ -247,3 +271,25 @@ func (mc *MetricsCollector) trackResourceReplacement(orgName, templateName, pres // cause an issue (or indeed if either would), so we just track the replacement. mc.replacementsCounter[key]++ } + +type hardLimitedPresetKey struct { + orgName, templateName, presetName string +} + +func (k hardLimitedPresetKey) String() string { + return fmt.Sprintf("%s:%s:%s", k.orgName, k.templateName, k.presetName) +} + +// nolint:revive // isHardLimited determines if the preset should be reported as hard-limited in Prometheus. +func (mc *MetricsCollector) trackHardLimitedStatus(orgName, templateName, presetName string, isHardLimited bool) { + mc.isPresetHardLimitedMu.Lock() + defer mc.isPresetHardLimitedMu.Unlock() + + key := hardLimitedPresetKey{orgName: orgName, templateName: templateName, presetName: presetName} + + if isHardLimited { + mc.isPresetHardLimited[key] = true + } else { + delete(mc.isPresetHardLimited, key) + } +} diff --git a/enterprise/coderd/prebuilds/reconcile.go b/enterprise/coderd/prebuilds/reconcile.go index 7796e43777951..603c0da3588a6 100644 --- a/enterprise/coderd/prebuilds/reconcile.go +++ b/enterprise/coderd/prebuilds/reconcile.go @@ -361,17 +361,22 @@ func (c *StoreReconciler) ReconcilePreset(ctx context.Context, ps prebuilds.Pres slog.F("preset_name", ps.Preset.Name), ) - // If the preset was previously hard-limited, log it and exit early. - if ps.Preset.PrebuildStatus == database.PrebuildStatusHardLimited { - logger.Warn(ctx, "skipping hard limited preset") - return nil - } + // Report a preset as hard-limited only if all the following conditions are met: + // - The preset is marked as hard-limited + // - The preset is using the active version of its template, and the template has not been deleted + // + // The second condition is important because a hard-limited preset that has become outdated is no longer relevant. + // Its associated prebuilt workspaces were likely deleted, and it's not meaningful to continue reporting it + // as hard-limited to the admin. + reportAsHardLimited := ps.IsHardLimited && ps.Preset.UsingActiveVersion && !ps.Preset.Deleted + c.metrics.trackHardLimitedStatus(ps.Preset.OrganizationName, ps.Preset.TemplateName, ps.Preset.Name, reportAsHardLimited) // If the preset reached the hard failure limit for the first time during this iteration: // - Mark it as hard-limited in the database // - Send notifications to template admins - if ps.IsHardLimited { - logger.Warn(ctx, "skipping hard limited preset") + // - Continue execution, we disallow only creation operation for hard-limited presets. Deletion is allowed. + if ps.Preset.PrebuildStatus != database.PrebuildStatusHardLimited && ps.IsHardLimited { + logger.Warn(ctx, "preset is hard limited, notifying template admins") err := c.store.UpdatePresetPrebuildStatus(ctx, database.UpdatePresetPrebuildStatusParams{ Status: database.PrebuildStatusHardLimited, @@ -384,10 +389,7 @@ func (c *StoreReconciler) ReconcilePreset(ctx context.Context, ps prebuilds.Pres err = c.notifyPrebuildFailureLimitReached(ctx, ps) if err != nil { logger.Error(ctx, "failed to notify that number of prebuild failures reached the limit", slog.Error(err)) - return nil } - - return nil } state := ps.CalculateState() @@ -452,6 +454,13 @@ func (c *StoreReconciler) ReconcilePreset(ctx context.Context, ps prebuilds.Pres actions.Create = desired } + // If preset is hard-limited, and it's a create operation, log it and exit early. + // Creation operation is disallowed for hard-limited preset. + if ps.IsHardLimited && actions.Create > 0 { + logger.Warn(ctx, "skipping hard limited preset for create operation") + return nil + } + var multiErr multierror.Error for range actions.Create { diff --git a/enterprise/coderd/prebuilds/reconcile_test.go b/enterprise/coderd/prebuilds/reconcile_test.go index f52a77ca500b9..b7f5974197150 100644 --- a/enterprise/coderd/prebuilds/reconcile_test.go +++ b/enterprise/coderd/prebuilds/reconcile_test.go @@ -696,7 +696,8 @@ func TestSkippingHardLimitedPresets(t *testing.T) { ).Leveled(slog.LevelDebug) db, pubSub := dbtestutil.NewDB(t) fakeEnqueuer := newFakeEnqueuer() - controller := prebuilds.NewStoreReconciler(db, pubSub, cfg, logger, clock, prometheus.NewRegistry(), fakeEnqueuer) + registry := prometheus.NewRegistry() + controller := prebuilds.NewStoreReconciler(db, pubSub, cfg, logger, clock, registry, fakeEnqueuer) // Template admin to receive a notification. templateAdmin := dbgen.User(t, db, database.User{ @@ -732,6 +733,17 @@ func TestSkippingHardLimitedPresets(t *testing.T) { workspaceCount := len(workspaces) require.Equal(t, 1, workspaceCount) + // Verify initial state: metric is not set - meaning preset is not hard limited. + require.NoError(t, controller.ForceMetricsUpdate(ctx)) + mf, err := registry.Gather() + require.NoError(t, err) + metric := findMetric(mf, prebuilds.MetricPresetHardLimitedGauge, map[string]string{ + "template_name": template.Name, + "preset_name": preset.Name, + "org_name": org.Name, + }) + require.Nil(t, metric) + // We simulate a failed prebuild in the test; Consequently, the backoff mechanism is triggered when ReconcileAll is called. // Even though ReconciliationBackoffInterval is set to zero, we still need to advance the clock by at least one nanosecond. clock.Advance(time.Nanosecond).MustWait(ctx) @@ -755,6 +767,16 @@ func TestSkippingHardLimitedPresets(t *testing.T) { // When hard limit is not reached, a new workspace should be created. require.Equal(t, 2, len(workspaces)) require.Equal(t, database.PrebuildStatusHealthy, updatedPreset.PrebuildStatus) + + // When hard limit is not reached, metric is not set. + mf, err = registry.Gather() + require.NoError(t, err) + metric = findMetric(mf, prebuilds.MetricPresetHardLimitedGauge, map[string]string{ + "template_name": template.Name, + "preset_name": preset.Name, + "org_name": org.Name, + }) + require.Nil(t, metric) return } @@ -775,6 +797,252 @@ func TestSkippingHardLimitedPresets(t *testing.T) { return true }) require.Len(t, matching, 1) + + // When hard limit is reached, metric is set to 1. + mf, err = registry.Gather() + require.NoError(t, err) + metric = findMetric(mf, prebuilds.MetricPresetHardLimitedGauge, map[string]string{ + "template_name": template.Name, + "preset_name": preset.Name, + "org_name": org.Name, + }) + require.NotNil(t, metric) + require.NotNil(t, metric.GetGauge()) + require.EqualValues(t, 1, metric.GetGauge().GetValue()) + }) + } +} + +func TestHardLimitedPresetShouldNotBlockDeletion(t *testing.T) { + t.Parallel() + + if !dbtestutil.WillUsePostgres() { + t.Skip("This test requires postgres") + } + + // Test cases verify the behavior of prebuild creation depending on configured failure limits. + testCases := []struct { + name string + hardLimit int64 + createNewTemplateVersion bool + deleteTemplate bool + }{ + { + // hard limit is hit - but we allow deletion of prebuilt workspace because it's outdated (new template version was created) + name: "new template version is created", + hardLimit: 1, + createNewTemplateVersion: true, + deleteTemplate: false, + }, + { + // hard limit is hit - but we allow deletion of prebuilt workspace because template is deleted + name: "template is deleted", + hardLimit: 1, + createNewTemplateVersion: false, + deleteTemplate: true, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + + clock := quartz.NewMock(t) + ctx := testutil.Context(t, testutil.WaitShort) + cfg := codersdk.PrebuildsConfig{ + FailureHardLimit: serpent.Int64(tc.hardLimit), + ReconciliationBackoffInterval: 0, + } + logger := slogtest.Make( + t, &slogtest.Options{IgnoreErrors: true}, + ).Leveled(slog.LevelDebug) + db, pubSub := dbtestutil.NewDB(t) + fakeEnqueuer := newFakeEnqueuer() + registry := prometheus.NewRegistry() + controller := prebuilds.NewStoreReconciler(db, pubSub, cfg, logger, clock, registry, fakeEnqueuer) + + // Template admin to receive a notification. + templateAdmin := dbgen.User(t, db, database.User{ + RBACRoles: []string{codersdk.RoleTemplateAdmin}, + }) + + // Set up test environment with a template, version, and preset. + ownerID := uuid.New() + dbgen.User(t, db, database.User{ + ID: ownerID, + }) + org, template := setupTestDBTemplate(t, db, ownerID, false) + templateVersionID := setupTestDBTemplateVersion(ctx, t, clock, db, pubSub, org.ID, ownerID, template.ID) + preset := setupTestDBPreset(t, db, templateVersionID, 2, uuid.New().String()) + + // Create a successful prebuilt workspace. + successfulWorkspace, _ := setupTestDBPrebuild( + t, + clock, + db, + pubSub, + database.WorkspaceTransitionStart, + database.ProvisionerJobStatusSucceeded, + org.ID, + preset, + template.ID, + templateVersionID, + ) + + // Make sure that prebuilt workspaces created in such order: [successful, failed]. + clock.Advance(time.Second).MustWait(ctx) + + // Create a failed prebuilt workspace that counts toward the hard failure limit. + setupTestDBPrebuild( + t, + clock, + db, + pubSub, + database.WorkspaceTransitionStart, + database.ProvisionerJobStatusFailed, + org.ID, + preset, + template.ID, + templateVersionID, + ) + + getJobStatusMap := func(workspaces []database.WorkspaceTable) map[database.ProvisionerJobStatus]int { + jobStatusMap := make(map[database.ProvisionerJobStatus]int) + for _, workspace := range workspaces { + workspaceBuilds, err := db.GetWorkspaceBuildsByWorkspaceID(ctx, database.GetWorkspaceBuildsByWorkspaceIDParams{ + WorkspaceID: workspace.ID, + }) + require.NoError(t, err) + + for _, workspaceBuild := range workspaceBuilds { + job, err := db.GetProvisionerJobByID(ctx, workspaceBuild.JobID) + require.NoError(t, err) + jobStatusMap[job.JobStatus]++ + } + } + return jobStatusMap + } + + // Verify initial state: two workspaces exist, one successful, one failed. + workspaces, err := db.GetWorkspacesByTemplateID(ctx, template.ID) + require.NoError(t, err) + require.Equal(t, 2, len(workspaces)) + jobStatusMap := getJobStatusMap(workspaces) + require.Len(t, jobStatusMap, 2) + require.Equal(t, 1, jobStatusMap[database.ProvisionerJobStatusSucceeded]) + require.Equal(t, 1, jobStatusMap[database.ProvisionerJobStatusFailed]) + + // Verify initial state: metric is not set - meaning preset is not hard limited. + require.NoError(t, controller.ForceMetricsUpdate(ctx)) + mf, err := registry.Gather() + require.NoError(t, err) + metric := findMetric(mf, prebuilds.MetricPresetHardLimitedGauge, map[string]string{ + "template_name": template.Name, + "preset_name": preset.Name, + "org_name": org.Name, + }) + require.Nil(t, metric) + + // We simulate a failed prebuild in the test; Consequently, the backoff mechanism is triggered when ReconcileAll is called. + // Even though ReconciliationBackoffInterval is set to zero, we still need to advance the clock by at least one nanosecond. + clock.Advance(time.Nanosecond).MustWait(ctx) + + // Trigger reconciliation to attempt creating a new prebuild. + // The outcome depends on whether the hard limit has been reached. + require.NoError(t, controller.ReconcileAll(ctx)) + + // These two additional calls to ReconcileAll should not trigger any notifications. + // A notification is only sent once. + require.NoError(t, controller.ReconcileAll(ctx)) + require.NoError(t, controller.ReconcileAll(ctx)) + + // Verify the final state after reconciliation. + // When hard limit is reached, no new workspace should be created. + workspaces, err = db.GetWorkspacesByTemplateID(ctx, template.ID) + require.NoError(t, err) + require.Equal(t, 2, len(workspaces)) + jobStatusMap = getJobStatusMap(workspaces) + require.Len(t, jobStatusMap, 2) + require.Equal(t, 1, jobStatusMap[database.ProvisionerJobStatusSucceeded]) + require.Equal(t, 1, jobStatusMap[database.ProvisionerJobStatusFailed]) + + updatedPreset, err := db.GetPresetByID(ctx, preset.ID) + require.NoError(t, err) + require.Equal(t, database.PrebuildStatusHardLimited, updatedPreset.PrebuildStatus) + + // When hard limit is reached, a notification should be sent. + matching := fakeEnqueuer.Sent(func(notification *notificationstest.FakeNotification) bool { + if !assert.Equal(t, notifications.PrebuildFailureLimitReached, notification.TemplateID, "unexpected template") { + return false + } + + if !assert.Equal(t, templateAdmin.ID, notification.UserID, "unexpected receiver") { + return false + } + + return true + }) + require.Len(t, matching, 1) + + // When hard limit is reached, metric is set to 1. + mf, err = registry.Gather() + require.NoError(t, err) + metric = findMetric(mf, prebuilds.MetricPresetHardLimitedGauge, map[string]string{ + "template_name": template.Name, + "preset_name": preset.Name, + "org_name": org.Name, + }) + require.NotNil(t, metric) + require.NotNil(t, metric.GetGauge()) + require.EqualValues(t, 1, metric.GetGauge().GetValue()) + + if tc.createNewTemplateVersion { + // Create a new template version and mark it as active + // This marks the template version that we care about as inactive + setupTestDBTemplateVersion(ctx, t, clock, db, pubSub, org.ID, ownerID, template.ID) + } + + if tc.deleteTemplate { + require.NoError(t, db.UpdateTemplateDeletedByID(ctx, database.UpdateTemplateDeletedByIDParams{ + ID: template.ID, + Deleted: true, + UpdatedAt: dbtime.Now(), + })) + } + + // Trigger reconciliation to make sure that successful, but outdated prebuilt workspace will be deleted. + require.NoError(t, controller.ReconcileAll(ctx)) + + workspaces, err = db.GetWorkspacesByTemplateID(ctx, template.ID) + require.NoError(t, err) + require.Equal(t, 2, len(workspaces)) + + jobStatusMap = getJobStatusMap(workspaces) + require.Len(t, jobStatusMap, 3) + require.Equal(t, 1, jobStatusMap[database.ProvisionerJobStatusSucceeded]) + require.Equal(t, 1, jobStatusMap[database.ProvisionerJobStatusFailed]) + // Pending job should be the job that deletes successful, but outdated prebuilt workspace. + // Prebuilt workspace MUST be deleted, despite the fact that preset is marked as hard limited. + require.Equal(t, 1, jobStatusMap[database.ProvisionerJobStatusPending]) + + workspaceBuilds, err := db.GetWorkspaceBuildsByWorkspaceID(ctx, database.GetWorkspaceBuildsByWorkspaceIDParams{ + WorkspaceID: successfulWorkspace.ID, + }) + require.NoError(t, err) + require.Equal(t, 2, len(workspaceBuilds)) + // Make sure that successfully created, but outdated prebuilt workspace was scheduled for deletion. + require.Equal(t, database.WorkspaceTransitionDelete, workspaceBuilds[0].Transition) + require.Equal(t, database.WorkspaceTransitionStart, workspaceBuilds[1].Transition) + + // Metric is deleted after preset became outdated. + mf, err = registry.Gather() + require.NoError(t, err) + metric = findMetric(mf, prebuilds.MetricPresetHardLimitedGauge, map[string]string{ + "template_name": template.Name, + "preset_name": preset.Name, + "org_name": org.Name, + }) + require.Nil(t, metric) }) } } diff --git a/site/src/api/typesGenerated.ts b/site/src/api/typesGenerated.ts index 74631c2be32fd..5125a554cacc1 100644 --- a/site/src/api/typesGenerated.ts +++ b/site/src/api/typesGenerated.ts @@ -3368,11 +3368,21 @@ export interface WorkspaceAgentDevcontainer { readonly name: string; readonly workspace_folder: string; readonly config_path?: string; - readonly running: boolean; + readonly status: WorkspaceAgentDevcontainerStatus; readonly dirty: boolean; readonly container?: WorkspaceAgentContainer; } +// From codersdk/workspaceagents.go +export type WorkspaceAgentDevcontainerStatus = + | "error" + | "running" + | "starting" + | "stopped"; + +export const WorkspaceAgentDevcontainerStatuses: WorkspaceAgentDevcontainerStatus[] = + ["error", "running", "starting", "stopped"]; + // From codersdk/workspaceagents.go export interface WorkspaceAgentDevcontainersResponse { readonly devcontainers: readonly WorkspaceAgentDevcontainer[];