From 8d4e67d3a07ff3e40d62139fcd71f8488aa31884 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Mon, 3 Apr 2023 21:43:18 +0200 Subject: [PATCH 01/40] WIP --- cli/server.go | 6 ++ coderd/prometheusmetrics/prometheusmetrics.go | 75 +++++++++++++++++++ 2 files changed, 81 insertions(+) diff --git a/cli/server.go b/cli/server.go index b6fa7c31b647c..5ce53af185e1d 100644 --- a/cli/server.go +++ b/cli/server.go @@ -868,6 +868,12 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd. } defer closeWorkspacesFunc() + closeAgentsFunc, err := prometheusmetrics.Agents(ctx, options.PrometheusRegistry, options.Database, 0) + if err != nil { + return xerrors.Errorf("register agents prometheus metric: %w", err) + } + defer closeAgentsFunc() + //nolint:revive defer serveHandler(ctx, logger, promhttp.InstrumentMetricHandler( options.PrometheusRegistry, promhttp.HandlerFor(options.PrometheusRegistry, promhttp.HandlerOpts{}), diff --git a/coderd/prometheusmetrics/prometheusmetrics.go b/coderd/prometheusmetrics/prometheusmetrics.go index 536522bf73e04..2c1c36dddbc34 100644 --- a/coderd/prometheusmetrics/prometheusmetrics.go +++ b/coderd/prometheusmetrics/prometheusmetrics.go @@ -106,3 +106,78 @@ func Workspaces(ctx context.Context, registerer prometheus.Registerer, db databa }() return cancelFunc, nil } + +// Agents tracks the total number of workspaces with labels on status. +func Agents(ctx context.Context, registerer prometheus.Registerer, db database.Store, duration time.Duration) (context.CancelFunc, error) { + if duration == 0 { + duration = 15 * time.Second // TODO 5 * time.Minute + } + + agentsConnectionGauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: "coderd", + Subsystem: "agents", + Name: "connection", + Help: "The agent connection with a status.", + }, []string{"agent_name", "workspace_name", "status"}) + err := registerer.Register(agentsConnectionGauge) + if err != nil { + return nil, err + } + + agentsUserLatenciesHistogram := prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Namespace: "coderd", + Subsystem: "agents", + Name: "user_latencies_seconds", + Help: "The user's agent latency in seconds.", + Buckets: []float64{0.001, 0.005, 0.010, 0.025, 0.050, 0.100, 0.500, 1, 5, 10, 30}, + }, []string{"agent_id", "workspace", "connection_type", "ide"}) + err = registerer.Register(agentsUserLatenciesHistogram) + if err != nil { + return nil, err + } + + ctx, cancelFunc := context.WithCancel(ctx) + ticker := time.NewTicker(duration) + go func() { + defer ticker.Stop() + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + } + + // FIXME Optimize this routine: SQL db calls + + builds, err := db.GetLatestWorkspaceBuilds(ctx) + if err != nil { + continue + } + + agentsConnectionGauge.Reset() + for _, build := range builds { + workspace, err := db.GetWorkspaceByID(ctx, build.WorkspaceID) + if err != nil { + continue + } + + agents, err := db.GetWorkspaceAgentsInLatestBuildByWorkspaceID(ctx, build.WorkspaceID) + if err != nil { + continue + } + + if len(agents) == 0 { + continue + } + + for _, agent := range agents { + connectionStatus := agent.Status(6 * time.Second) + + // FIXME AgentInactiveDisconnectTimeout + agentsConnectionGauge.WithLabelValues(agent.Name, workspace.Name, string(connectionStatus.Status)).Set(1) + } + } + } + }() + return cancelFunc, nil +} From 9ad09b20f8e830c346eb3e41c4eaf4c9e7d29475 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Tue, 4 Apr 2023 19:47:27 +0200 Subject: [PATCH 02/40] WIP --- cli/server.go | 20 ++++--- coderd/prometheusmetrics/prometheusmetrics.go | 60 +++++++++++++++++-- 2 files changed, 66 insertions(+), 14 deletions(-) diff --git a/cli/server.go b/cli/server.go index 30ceb06aefc46..b3961b52e4e53 100644 --- a/cli/server.go +++ b/cli/server.go @@ -849,6 +849,16 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd. defer options.Telemetry.Close() } + databaseStoreWithoutAuth := options.Database + + // We use a separate coderAPICloser so the Enterprise API + // can have it's own close functions. This is cleaner + // than abstracting the Coder API itself. + coderAPI, coderAPICloser, err := newAPI(ctx, options) + if err != nil { + return xerrors.Errorf("create coder API: %w", err) + } + // This prevents the pprof import from being accidentally deleted. _ = pprof.Handler if cfg.Pprof.Enable { @@ -871,7 +881,7 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd. } defer closeWorkspacesFunc() - closeAgentsFunc, err := prometheusmetrics.Agents(ctx, options.PrometheusRegistry, options.Database, 0) + closeAgentsFunc, err := prometheusmetrics.Agents(ctx, options.PrometheusRegistry, databaseStoreWithoutAuth, &coderAPI.TailnetCoordinator, options.DERPMap, 0) if err != nil { return xerrors.Errorf("register agents prometheus metric: %w", err) } @@ -887,14 +897,6 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd. options.SwaggerEndpoint = cfg.Swagger.Enable.Value() } - // We use a separate coderAPICloser so the Enterprise API - // can have it's own close functions. This is cleaner - // than abstracting the Coder API itself. - coderAPI, coderAPICloser, err := newAPI(ctx, options) - if err != nil { - return xerrors.Errorf("create coder API: %w", err) - } - client := codersdk.New(localURL) if localURL.Scheme == "https" && isLocalhost(localURL.Hostname()) { // The certificate will likely be self-signed or for a different diff --git a/coderd/prometheusmetrics/prometheusmetrics.go b/coderd/prometheusmetrics/prometheusmetrics.go index 2c1c36dddbc34..5ce6fbd51f6c5 100644 --- a/coderd/prometheusmetrics/prometheusmetrics.go +++ b/coderd/prometheusmetrics/prometheusmetrics.go @@ -2,13 +2,20 @@ package prometheusmetrics import ( "context" + "fmt" + "log" + "strconv" + "strings" + "sync/atomic" "time" "github.com/google/uuid" "github.com/prometheus/client_golang/prometheus" + "tailscale.com/tailcfg" "github.com/coder/coder/coderd" "github.com/coder/coder/coderd/database" + "github.com/coder/coder/tailnet" ) // ActiveUsers tracks the number of users that have authenticated within the past hour. @@ -108,7 +115,7 @@ func Workspaces(ctx context.Context, registerer prometheus.Registerer, db databa } // Agents tracks the total number of workspaces with labels on status. -func Agents(ctx context.Context, registerer prometheus.Registerer, db database.Store, duration time.Duration) (context.CancelFunc, error) { +func Agents(ctx context.Context, registerer prometheus.Registerer, db database.Store, coordinator *atomic.Pointer[tailnet.Coordinator], derpMap *tailcfg.DERPMap, duration time.Duration) (context.CancelFunc, error) { if duration == 0 { duration = 15 * time.Second // TODO 5 * time.Minute } @@ -124,23 +131,26 @@ func Agents(ctx context.Context, registerer prometheus.Registerer, db database.S return nil, err } - agentsUserLatenciesHistogram := prometheus.NewHistogramVec(prometheus.HistogramOpts{ + agentsUserLatenciesGauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: "coderd", Subsystem: "agents", Name: "user_latencies_seconds", Help: "The user's agent latency in seconds.", - Buckets: []float64{0.001, 0.005, 0.010, 0.025, 0.050, 0.100, 0.500, 1, 5, 10, 30}, - }, []string{"agent_id", "workspace", "connection_type", "ide"}) - err = registerer.Register(agentsUserLatenciesHistogram) + }, []string{"agent_id", "workspace_name", "derp_region", "preferred"}) + err = registerer.Register(agentsUserLatenciesGauge) if err != nil { return nil, err } + // FIXME connection_type ide + ctx, cancelFunc := context.WithCancel(ctx) ticker := time.NewTicker(duration) go func() { defer ticker.Stop() for { + log.Println("Agents!!!") + select { case <-ctx.Done(): return @@ -151,18 +161,22 @@ func Agents(ctx context.Context, registerer prometheus.Registerer, db database.S builds, err := db.GetLatestWorkspaceBuilds(ctx) if err != nil { + log.Println("1", err) continue } agentsConnectionGauge.Reset() + agentsUserLatenciesGauge.Reset() for _, build := range builds { workspace, err := db.GetWorkspaceByID(ctx, build.WorkspaceID) if err != nil { + log.Println("2", err) continue } agents, err := db.GetWorkspaceAgentsInLatestBuildByWorkspaceID(ctx, build.WorkspaceID) if err != nil { + log.Println("3", err) continue } @@ -170,11 +184,47 @@ func Agents(ctx context.Context, registerer prometheus.Registerer, db database.S continue } + // FIXME publish workspace even if no agents + for _, agent := range agents { connectionStatus := agent.Status(6 * time.Second) // FIXME AgentInactiveDisconnectTimeout + log.Println("with value " + agent.Name) agentsConnectionGauge.WithLabelValues(agent.Name, workspace.Name, string(connectionStatus.Status)).Set(1) + + node := (*coordinator.Load()).Node(agent.ID) + if node != nil { + log.Println("coordinator") + + for rawRegion, latency := range node.DERPLatency { + log.Println(rawRegion, latency) + + regionParts := strings.SplitN(rawRegion, "-", 2) + regionID, err := strconv.Atoi(regionParts[0]) + if err != nil { + continue // xerrors.Errorf("convert derp region id %q: %w", rawRegion, err) + } + region, found := derpMap.Regions[regionID] + if !found { + // It's possible that a workspace agent is using an old DERPMap + // and reports regions that do not exist. If that's the case, + // report the region as unknown! + region = &tailcfg.DERPRegion{ + RegionID: regionID, + RegionName: fmt.Sprintf("Unnamed %d", regionID), + } + } + + log.Println(region, latency) + agentsUserLatenciesGauge.WithLabelValues(agent.Name, workspace.Name, region.RegionName, fmt.Sprintf("%v", node.PreferredDERP == regionID)).Set(latency) + } + } else { + log.Println("node is null") + } + + // FIXME publish agent even if DERP is missing + // FIXME IDE? } } } From 440657c16195948c17a72751193043ce44a15531 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Wed, 5 Apr 2023 10:43:30 +0200 Subject: [PATCH 03/40] WIP --- coderd/prometheusmetrics/prometheusmetrics.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/coderd/prometheusmetrics/prometheusmetrics.go b/coderd/prometheusmetrics/prometheusmetrics.go index 5ce6fbd51f6c5..d1af83a4a4439 100644 --- a/coderd/prometheusmetrics/prometheusmetrics.go +++ b/coderd/prometheusmetrics/prometheusmetrics.go @@ -190,6 +190,8 @@ func Agents(ctx context.Context, registerer prometheus.Registerer, db database.S connectionStatus := agent.Status(6 * time.Second) // FIXME AgentInactiveDisconnectTimeout + // ? connection_timeout_seconds + // obok latency lifecycle_state log.Println("with value " + agent.Name) agentsConnectionGauge.WithLabelValues(agent.Name, workspace.Name, string(connectionStatus.Status)).Set(1) @@ -225,6 +227,7 @@ func Agents(ctx context.Context, registerer prometheus.Registerer, db database.S // FIXME publish agent even if DERP is missing // FIXME IDE? + // FIXME agent connection zero } } } From 8764f8975d75ebb45d9e5996fac9b7509c953e33 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Wed, 5 Apr 2023 12:08:27 +0200 Subject: [PATCH 04/40] Agents --- cli/server.go | 33 ++--- coderd/prometheusmetrics/prometheusmetrics.go | 133 ++++++++++-------- 2 files changed, 92 insertions(+), 74 deletions(-) diff --git a/cli/server.go b/cli/server.go index b3961b52e4e53..c93064f34c8ef 100644 --- a/cli/server.go +++ b/cli/server.go @@ -849,16 +849,6 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd. defer options.Telemetry.Close() } - databaseStoreWithoutAuth := options.Database - - // We use a separate coderAPICloser so the Enterprise API - // can have it's own close functions. This is cleaner - // than abstracting the Coder API itself. - coderAPI, coderAPICloser, err := newAPI(ctx, options) - if err != nil { - return xerrors.Errorf("create coder API: %w", err) - } - // This prevents the pprof import from being accidentally deleted. _ = pprof.Handler if cfg.Pprof.Enable { @@ -881,12 +871,6 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd. } defer closeWorkspacesFunc() - closeAgentsFunc, err := prometheusmetrics.Agents(ctx, options.PrometheusRegistry, databaseStoreWithoutAuth, &coderAPI.TailnetCoordinator, options.DERPMap, 0) - if err != nil { - return xerrors.Errorf("register agents prometheus metric: %w", err) - } - defer closeAgentsFunc() - //nolint:revive defer serveHandler(ctx, logger, promhttp.InstrumentMetricHandler( options.PrometheusRegistry, promhttp.HandlerFor(options.PrometheusRegistry, promhttp.HandlerOpts{}), @@ -897,6 +881,23 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd. options.SwaggerEndpoint = cfg.Swagger.Enable.Value() } + // We use a separate coderAPICloser so the Enterprise API + // can have it's own close functions. This is cleaner + // than abstracting the Coder API itself. + coderAPI, coderAPICloser, err := newAPI(ctx, options) + if err != nil { + return xerrors.Errorf("create coder API: %w", err) + } + + if cfg.Prometheus.Enable { + // Agent metrics require reference to the tailnet coordinator, so must be initiated after Coder API. + closeAgentsFunc, err := prometheusmetrics.Agents(ctx, logger, options.PrometheusRegistry, coderAPI.Database, &coderAPI.TailnetCoordinator, options.DERPMap, coderAPI.Options.AgentInactiveDisconnectTimeout, 0) + if err != nil { + return xerrors.Errorf("register agents prometheus metric: %w", err) + } + defer closeAgentsFunc() + } + client := codersdk.New(localURL) if localURL.Scheme == "https" && isLocalhost(localURL.Hostname()) { // The certificate will likely be self-signed or for a different diff --git a/coderd/prometheusmetrics/prometheusmetrics.go b/coderd/prometheusmetrics/prometheusmetrics.go index d1af83a4a4439..31a0b712a1511 100644 --- a/coderd/prometheusmetrics/prometheusmetrics.go +++ b/coderd/prometheusmetrics/prometheusmetrics.go @@ -3,7 +3,6 @@ package prometheusmetrics import ( "context" "fmt" - "log" "strconv" "strings" "sync/atomic" @@ -13,8 +12,11 @@ import ( "github.com/prometheus/client_golang/prometheus" "tailscale.com/tailcfg" + "cdr.dev/slog" + "github.com/coder/coder/coderd" "github.com/coder/coder/coderd/database" + "github.com/coder/coder/coderd/database/dbauthz" "github.com/coder/coder/tailnet" ) @@ -115,119 +117,134 @@ func Workspaces(ctx context.Context, registerer prometheus.Registerer, db databa } // Agents tracks the total number of workspaces with labels on status. -func Agents(ctx context.Context, registerer prometheus.Registerer, db database.Store, coordinator *atomic.Pointer[tailnet.Coordinator], derpMap *tailcfg.DERPMap, duration time.Duration) (context.CancelFunc, error) { +func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, coordinator *atomic.Pointer[tailnet.Coordinator], derpMap *tailcfg.DERPMap, agentInactiveDisconnectTimeout, duration time.Duration) (context.CancelFunc, error) { if duration == 0 { duration = 15 * time.Second // TODO 5 * time.Minute } - agentsConnectionGauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{ + workspaceAgentsGauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: "coderd", Subsystem: "agents", - Name: "connection", - Help: "The agent connection with a status.", - }, []string{"agent_name", "workspace_name", "status"}) - err := registerer.Register(agentsConnectionGauge) + Name: "up", + Help: "The number of active agents per workspace.", + }, []string{"username", "workspace_name"}) + err := registerer.Register(workspaceAgentsGauge) if err != nil { return nil, err } - agentsUserLatenciesGauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{ + agentsConnectionGauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: "coderd", Subsystem: "agents", - Name: "user_latencies_seconds", - Help: "The user's agent latency in seconds.", - }, []string{"agent_id", "workspace_name", "derp_region", "preferred"}) - err = registerer.Register(agentsUserLatenciesGauge) + Name: "connections", + Help: "Agent connections with statuses.", + }, []string{"agent_name", "username", "workspace_name", "status", "lifecycle_state", "tailnet_node"}) + err = registerer.Register(agentsConnectionGauge) if err != nil { return nil, err } - // FIXME connection_type ide + agentsConnectionLatenciesGauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: "coderd", + Subsystem: "agents", + Name: "connection_latencies_seconds", + Help: "Agent connection latencies in seconds.", + }, []string{"agent_id", "username", "workspace_name", "derp_region", "preferred"}) + err = registerer.Register(agentsConnectionLatenciesGauge) + if err != nil { + return nil, err + } - ctx, cancelFunc := context.WithCancel(ctx) + // nolint:gocritic // Prometheus must collect metrics for all Coder users. + ctx, cancelFunc := context.WithCancel(dbauthz.AsSystemRestricted(ctx)) ticker := time.NewTicker(duration) go func() { defer ticker.Stop() for { - log.Println("Agents!!!") - select { case <-ctx.Done(): return case <-ticker.C: } - // FIXME Optimize this routine: SQL db calls + logger.Info(ctx, "Collect agent metrics now") - builds, err := db.GetLatestWorkspaceBuilds(ctx) + workspaceRows, err := db.GetWorkspaces(ctx, database.GetWorkspacesParams{ + AgentInactiveDisconnectTimeoutSeconds: int64(agentInactiveDisconnectTimeout.Seconds()), + }) if err != nil { - log.Println("1", err) + logger.Error(ctx, "can't get workspace rows", slog.Error(err)) continue } + workspaceAgentsGauge.Reset() agentsConnectionGauge.Reset() - agentsUserLatenciesGauge.Reset() - for _, build := range builds { - workspace, err := db.GetWorkspaceByID(ctx, build.WorkspaceID) + agentsConnectionLatenciesGauge.Reset() + + for _, workspace := range workspaceRows { + user, err := db.GetUserByID(ctx, workspace.OwnerID) if err != nil { - log.Println("2", err) + logger.Error(ctx, "can't get user", slog.Error(err), slog.F("user_id", workspace.OwnerID)) + workspaceAgentsGauge.WithLabelValues(user.Username, workspace.Name).Add(0) continue } - agents, err := db.GetWorkspaceAgentsInLatestBuildByWorkspaceID(ctx, build.WorkspaceID) + agents, err := db.GetWorkspaceAgentsInLatestBuildByWorkspaceID(ctx, workspace.ID) if err != nil { - log.Println("3", err) + logger.Error(ctx, "can't get workspace agents", slog.F("workspace_name", workspace.Name), slog.Error(err)) + workspaceAgentsGauge.WithLabelValues(user.Username, workspace.Name).Add(0) continue } if len(agents) == 0 { + logger.Info(ctx, "workspace agents are unavailable", slog.F("workspace_name", workspace.Name)) + workspaceAgentsGauge.WithLabelValues(user.Username, workspace.Name).Add(0) continue } - // FIXME publish workspace even if no agents - for _, agent := range agents { - connectionStatus := agent.Status(6 * time.Second) - - // FIXME AgentInactiveDisconnectTimeout - // ? connection_timeout_seconds - // obok latency lifecycle_state - log.Println("with value " + agent.Name) - agentsConnectionGauge.WithLabelValues(agent.Name, workspace.Name, string(connectionStatus.Status)).Set(1) + // Collect information about agents + workspaceAgentsGauge.WithLabelValues(user.Username, workspace.Name).Add(1) + connectionStatus := agent.Status(agentInactiveDisconnectTimeout) node := (*coordinator.Load()).Node(agent.ID) + + tailnetNode := "unknown" if node != nil { - log.Println("coordinator") + tailnetNode = node.ID.String() + } - for rawRegion, latency := range node.DERPLatency { - log.Println(rawRegion, latency) + agentsConnectionGauge.WithLabelValues(agent.Name, user.Username, workspace.Name, string(connectionStatus.Status), string(agent.LifecycleState), tailnetNode).Set(1) - regionParts := strings.SplitN(rawRegion, "-", 2) - regionID, err := strconv.Atoi(regionParts[0]) - if err != nil { - continue // xerrors.Errorf("convert derp region id %q: %w", rawRegion, err) - } - region, found := derpMap.Regions[regionID] - if !found { - // It's possible that a workspace agent is using an old DERPMap - // and reports regions that do not exist. If that's the case, - // report the region as unknown! - region = &tailcfg.DERPRegion{ - RegionID: regionID, - RegionName: fmt.Sprintf("Unnamed %d", regionID), - } - } + if node == nil { + logger.Info(ctx, "can't read in-memory node for agent", slog.F("workspace_name", workspace.Name), slog.F("agent_name", agent.Name)) + continue + } - log.Println(region, latency) - agentsUserLatenciesGauge.WithLabelValues(agent.Name, workspace.Name, region.RegionName, fmt.Sprintf("%v", node.PreferredDERP == regionID)).Set(latency) + // Collect information about connection latencies + for rawRegion, latency := range node.DERPLatency { + regionParts := strings.SplitN(rawRegion, "-", 2) + regionID, err := strconv.Atoi(regionParts[0]) + if err != nil { + logger.Error(ctx, "can't convert DERP region", slog.Error(err), slog.F("agent_name", agent.Name), slog.F("raw_region", rawRegion)) + continue } - } else { - log.Println("node is null") + region, found := derpMap.Regions[regionID] + if !found { + // It's possible that a workspace agent is using an old DERPMap + // and reports regions that do not exist. If that's the case, + // report the region as unknown! + region = &tailcfg.DERPRegion{ + RegionID: regionID, + RegionName: fmt.Sprintf("Unnamed %d", regionID), + } + } + + agentsConnectionLatenciesGauge.WithLabelValues(agent.Name, user.Username, workspace.Name, region.RegionName, fmt.Sprintf("%v", node.PreferredDERP == regionID)).Set(latency) } - // FIXME publish agent even if DERP is missing // FIXME IDE? - // FIXME agent connection zero + // FIXME connection_type ide } } } From 663b5d5d7a78fce6f6d99943fc30759684a36618 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Wed, 5 Apr 2023 12:31:06 +0200 Subject: [PATCH 05/40] fix --- coderd/prometheusmetrics/prometheusmetrics.go | 93 ++++++++++++------- 1 file changed, 57 insertions(+), 36 deletions(-) diff --git a/coderd/prometheusmetrics/prometheusmetrics.go b/coderd/prometheusmetrics/prometheusmetrics.go index 31a0b712a1511..4e4d828981bc6 100644 --- a/coderd/prometheusmetrics/prometheusmetrics.go +++ b/coderd/prometheusmetrics/prometheusmetrics.go @@ -122,24 +122,24 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis duration = 15 * time.Second // TODO 5 * time.Minute } - workspaceAgentsGauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{ + agentsGauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: "coderd", Subsystem: "agents", Name: "up", Help: "The number of active agents per workspace.", }, []string{"username", "workspace_name"}) - err := registerer.Register(workspaceAgentsGauge) + err := registerer.Register(agentsGauge) if err != nil { return nil, err } - agentsConnectionGauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{ + agentsConnectionsGauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: "coderd", Subsystem: "agents", Name: "connections", Help: "Agent connections with statuses.", }, []string{"agent_name", "username", "workspace_name", "status", "lifecycle_state", "tailnet_node"}) - err = registerer.Register(agentsConnectionGauge) + err = registerer.Register(agentsConnectionsGauge) if err != nil { return nil, err } @@ -155,6 +155,17 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis return nil, err } + agentsAppsGauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: "coderd", + Subsystem: "agents", + Name: "apps", + Help: "Agent applications with statuses.", + }, []string{"agent_name", "username", "workspace_name", "app_name", "health"}) + err = registerer.Register(agentsAppsGauge) + if err != nil { + return nil, err + } + // nolint:gocritic // Prometheus must collect metrics for all Coder users. ctx, cancelFunc := context.WithCancel(dbauthz.AsSystemRestricted(ctx)) ticker := time.NewTicker(duration) @@ -167,7 +178,7 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis case <-ticker.C: } - logger.Info(ctx, "Collect agent metrics now") + logger.Debug(ctx, "Collect agent metrics now") workspaceRows, err := db.GetWorkspaces(ctx, database.GetWorkspacesParams{ AgentInactiveDisconnectTimeoutSeconds: int64(agentInactiveDisconnectTimeout.Seconds()), @@ -177,34 +188,35 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis continue } - workspaceAgentsGauge.Reset() - agentsConnectionGauge.Reset() + agentsGauge.Reset() + agentsConnectionsGauge.Reset() agentsConnectionLatenciesGauge.Reset() + agentsAppsGauge.Reset() for _, workspace := range workspaceRows { user, err := db.GetUserByID(ctx, workspace.OwnerID) if err != nil { - logger.Error(ctx, "can't get user", slog.Error(err), slog.F("user_id", workspace.OwnerID)) - workspaceAgentsGauge.WithLabelValues(user.Username, workspace.Name).Add(0) + logger.Error(ctx, "can't get user", slog.F("user_id", workspace.OwnerID), slog.Error(err)) + agentsGauge.WithLabelValues(user.Username, workspace.Name).Add(0) continue } agents, err := db.GetWorkspaceAgentsInLatestBuildByWorkspaceID(ctx, workspace.ID) if err != nil { - logger.Error(ctx, "can't get workspace agents", slog.F("workspace_name", workspace.Name), slog.Error(err)) - workspaceAgentsGauge.WithLabelValues(user.Username, workspace.Name).Add(0) + logger.Error(ctx, "can't get workspace agents", slog.F("workspace_id", workspace.ID), slog.Error(err)) + agentsGauge.WithLabelValues(user.Username, workspace.Name).Add(0) continue } if len(agents) == 0 { - logger.Info(ctx, "workspace agents are unavailable", slog.F("workspace_name", workspace.Name)) - workspaceAgentsGauge.WithLabelValues(user.Username, workspace.Name).Add(0) + logger.Debug(ctx, "workspace agents are unavailable", slog.F("workspace_id", workspace.ID)) + agentsGauge.WithLabelValues(user.Username, workspace.Name).Add(0) continue } for _, agent := range agents { // Collect information about agents - workspaceAgentsGauge.WithLabelValues(user.Username, workspace.Name).Add(1) + agentsGauge.WithLabelValues(user.Username, workspace.Name).Add(1) connectionStatus := agent.Status(agentInactiveDisconnectTimeout) node := (*coordinator.Load()).Node(agent.ID) @@ -214,37 +226,46 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis tailnetNode = node.ID.String() } - agentsConnectionGauge.WithLabelValues(agent.Name, user.Username, workspace.Name, string(connectionStatus.Status), string(agent.LifecycleState), tailnetNode).Set(1) + agentsConnectionsGauge.WithLabelValues(agent.Name, user.Username, workspace.Name, string(connectionStatus.Status), string(agent.LifecycleState), tailnetNode).Set(1) if node == nil { - logger.Info(ctx, "can't read in-memory node for agent", slog.F("workspace_name", workspace.Name), slog.F("agent_name", agent.Name)) + logger.Debug(ctx, "can't read in-memory node for agent", slog.F("agent_id", agent.ID)) continue - } + } else { + // Collect information about connection latencies + for rawRegion, latency := range node.DERPLatency { + regionParts := strings.SplitN(rawRegion, "-", 2) + regionID, err := strconv.Atoi(regionParts[0]) + if err != nil { + logger.Error(ctx, "can't convert DERP region", slog.F("agent_id", agent.ID), slog.F("raw_region", rawRegion), slog.Error(err)) + continue + } - // Collect information about connection latencies - for rawRegion, latency := range node.DERPLatency { - regionParts := strings.SplitN(rawRegion, "-", 2) - regionID, err := strconv.Atoi(regionParts[0]) - if err != nil { - logger.Error(ctx, "can't convert DERP region", slog.Error(err), slog.F("agent_name", agent.Name), slog.F("raw_region", rawRegion)) - continue - } - region, found := derpMap.Regions[regionID] - if !found { - // It's possible that a workspace agent is using an old DERPMap - // and reports regions that do not exist. If that's the case, - // report the region as unknown! - region = &tailcfg.DERPRegion{ - RegionID: regionID, - RegionName: fmt.Sprintf("Unnamed %d", regionID), + region, found := derpMap.Regions[regionID] + if !found { + // It's possible that a workspace agent is using an old DERPMap + // and reports regions that do not exist. If that's the case, + // report the region as unknown! + region = &tailcfg.DERPRegion{ + RegionID: regionID, + RegionName: fmt.Sprintf("Unnamed %d", regionID), + } } + + agentsConnectionLatenciesGauge.WithLabelValues(agent.Name, user.Username, workspace.Name, region.RegionName, fmt.Sprintf("%v", node.PreferredDERP == regionID)).Set(latency) } + } - agentsConnectionLatenciesGauge.WithLabelValues(agent.Name, user.Username, workspace.Name, region.RegionName, fmt.Sprintf("%v", node.PreferredDERP == regionID)).Set(latency) + // Collect information about registered applications + apps, err := db.GetWorkspaceAppsByAgentID(ctx, agent.ID) + if err != nil { + logger.Error(ctx, "can't get workspace apps", slog.F("agent_id", agent.ID), slog.Error(err)) + continue } - // FIXME IDE? - // FIXME connection_type ide + for _, app := range apps { + agentsAppsGauge.WithLabelValues(agent.Name, user.Username, workspace.Name, app.DisplayName, string(app.Health)).Add(1) + } } } } From 63aff5ebf2f2c0695c1289fa557b9bd5fbfdb80f Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Wed, 5 Apr 2023 12:32:49 +0200 Subject: [PATCH 06/40] 1min --- coderd/prometheusmetrics/prometheusmetrics.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/coderd/prometheusmetrics/prometheusmetrics.go b/coderd/prometheusmetrics/prometheusmetrics.go index 4e4d828981bc6..a9dc1933addc1 100644 --- a/coderd/prometheusmetrics/prometheusmetrics.go +++ b/coderd/prometheusmetrics/prometheusmetrics.go @@ -119,7 +119,7 @@ func Workspaces(ctx context.Context, registerer prometheus.Registerer, db databa // Agents tracks the total number of workspaces with labels on status. func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, coordinator *atomic.Pointer[tailnet.Coordinator], derpMap *tailcfg.DERPMap, agentInactiveDisconnectTimeout, duration time.Duration) (context.CancelFunc, error) { if duration == 0 { - duration = 15 * time.Second // TODO 5 * time.Minute + duration = 1 * time.Minute } agentsGauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{ From 390548159b6fc8cf9d9f10b02e276384ff974b7d Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Wed, 5 Apr 2023 12:45:16 +0200 Subject: [PATCH 07/40] fix --- coderd/prometheusmetrics/prometheusmetrics.go | 1 - 1 file changed, 1 deletion(-) diff --git a/coderd/prometheusmetrics/prometheusmetrics.go b/coderd/prometheusmetrics/prometheusmetrics.go index a9dc1933addc1..5dddd4d20e388 100644 --- a/coderd/prometheusmetrics/prometheusmetrics.go +++ b/coderd/prometheusmetrics/prometheusmetrics.go @@ -230,7 +230,6 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis if node == nil { logger.Debug(ctx, "can't read in-memory node for agent", slog.F("agent_id", agent.ID)) - continue } else { // Collect information about connection latencies for rawRegion, latency := range node.DERPLatency { From f8d6f464fe93d3f7799463be9c6e0c9c6c6f5135 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Wed, 5 Apr 2023 13:35:45 +0200 Subject: [PATCH 08/40] WIP --- .../prometheusmetrics_test.go | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/coderd/prometheusmetrics/prometheusmetrics_test.go b/coderd/prometheusmetrics/prometheusmetrics_test.go index a0b375ccf8622..7b828e0514fd8 100644 --- a/coderd/prometheusmetrics/prometheusmetrics_test.go +++ b/coderd/prometheusmetrics/prometheusmetrics_test.go @@ -6,16 +6,22 @@ import ( "testing" "time" + "sync/atomic" + "github.com/google/uuid" "github.com/prometheus/client_golang/prometheus" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "cdr.dev/slog/sloggers/slogtest" + "github.com/coder/coder/coderd/database" "github.com/coder/coder/coderd/database/dbfake" "github.com/coder/coder/coderd/database/dbgen" "github.com/coder/coder/coderd/prometheusmetrics" "github.com/coder/coder/codersdk" + "github.com/coder/coder/tailnet" + "github.com/coder/coder/tailnet/tailnettest" "github.com/coder/coder/testutil" ) @@ -239,3 +245,37 @@ func TestWorkspaces(t *testing.T) { }) } } + +func TestAgents(t *testing.T) { + t.Parallel() + + // given + db := dbfake.New() + + coordinator := tailnet.NewCoordinator() + coordinatorPtr := atomic.Pointer[tailnet.Coordinator]{} + coordinatorPtr.Store(&coordinator) + derpMap := tailnettest.RunDERPAndSTUN(t) + agentInactiveDisconnectTimeout := 1 * time.Hour + registry := prometheus.NewRegistry() + + // when + cancel, err := prometheusmetrics.Agents(context.Background(), slogtest.Make(t, nil), registry, db, &coordinatorPtr, derpMap, agentInactiveDisconnectTimeout, time.Millisecond) + t.Cleanup(cancel) + + // then + require.NoError(t, err) + require.Eventually(t, func() bool { + metrics, err := registry.Gather() + assert.NoError(t, err) + + if len(metrics) < 1 { + return false + } + + for _, metric := range metrics[0].Metric { + panic(metric) + } + return true + }, testutil.WaitShort, testutil.IntervalFast) +} From d487a77372e703abb9be64b38f4a8f8366bddf74 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Wed, 5 Apr 2023 14:18:06 +0200 Subject: [PATCH 09/40] Test --- coderd/prometheusmetrics/prometheusmetrics.go | 4 +- .../prometheusmetrics_test.go | 83 +++++++++++++++++-- 2 files changed, 80 insertions(+), 7 deletions(-) diff --git a/coderd/prometheusmetrics/prometheusmetrics.go b/coderd/prometheusmetrics/prometheusmetrics.go index 5dddd4d20e388..222809db00bf8 100644 --- a/coderd/prometheusmetrics/prometheusmetrics.go +++ b/coderd/prometheusmetrics/prometheusmetrics.go @@ -2,6 +2,8 @@ package prometheusmetrics import ( "context" + "database/sql" + "errors" "fmt" "strconv" "strings" @@ -257,7 +259,7 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis // Collect information about registered applications apps, err := db.GetWorkspaceAppsByAgentID(ctx, agent.ID) - if err != nil { + if err != nil && !errors.Is(err, sql.ErrNoRows) { logger.Error(ctx, "can't get workspace apps", slog.F("agent_id", agent.ID), slog.Error(err)) continue } diff --git a/coderd/prometheusmetrics/prometheusmetrics_test.go b/coderd/prometheusmetrics/prometheusmetrics_test.go index 7b828e0514fd8..e7829d8a34a18 100644 --- a/coderd/prometheusmetrics/prometheusmetrics_test.go +++ b/coderd/prometheusmetrics/prometheusmetrics_test.go @@ -15,11 +15,14 @@ import ( "cdr.dev/slog/sloggers/slogtest" + "github.com/coder/coder/coderd/coderdtest" "github.com/coder/coder/coderd/database" "github.com/coder/coder/coderd/database/dbfake" "github.com/coder/coder/coderd/database/dbgen" "github.com/coder/coder/coderd/prometheusmetrics" "github.com/coder/coder/codersdk" + "github.com/coder/coder/provisioner/echo" + "github.com/coder/coder/provisionersdk/proto" "github.com/coder/coder/tailnet" "github.com/coder/coder/tailnet/tailnettest" "github.com/coder/coder/testutil" @@ -249,14 +252,53 @@ func TestWorkspaces(t *testing.T) { func TestAgents(t *testing.T) { t.Parallel() - // given - db := dbfake.New() + // Build a sample workspace with test agent and fake application + client, _, api := coderdtest.NewWithAPI(t, &coderdtest.Options{IncludeProvisionerDaemon: true}) + db := api.Database + + user := coderdtest.CreateFirstUser(t, client) + version := coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, &echo.Responses{ + Parse: echo.ParseComplete, + ProvisionPlan: echo.ProvisionComplete, + ProvisionApply: []*proto.Provision_Response{{ + Type: &proto.Provision_Response_Complete{ + Complete: &proto.Provision_Complete{ + Resources: []*proto.Resource{{ + Name: "example", + Type: "aws_instance", + Agents: []*proto.Agent{{ + Id: uuid.NewString(), + Name: "testagent", + Directory: t.TempDir(), + Auth: &proto.Agent_Token{ + Token: uuid.NewString(), + }, + Apps: []*proto.App{ + { + Slug: "fake-app", + DisplayName: "Fake application", + SharingLevel: proto.AppSharingLevel_OWNER, + // Hopefully this IP and port doesn't exist. + Url: "http://127.1.0.1:65535", + }, + }, + }}, + }}, + }, + }, + }}, + }) + template := coderdtest.CreateTemplate(t, client, user.OrganizationID, version.ID) + coderdtest.AwaitTemplateVersionJob(t, client, version.ID) + workspace := coderdtest.CreateWorkspace(t, client, user.OrganizationID, template.ID) + coderdtest.AwaitWorkspaceBuildJob(t, client, workspace.LatestBuild.ID) + // given coordinator := tailnet.NewCoordinator() coordinatorPtr := atomic.Pointer[tailnet.Coordinator]{} coordinatorPtr.Store(&coordinator) derpMap := tailnettest.RunDERPAndSTUN(t) - agentInactiveDisconnectTimeout := 1 * time.Hour + agentInactiveDisconnectTimeout := 1 * time.Hour // don't need to focus on this value in tests registry := prometheus.NewRegistry() // when @@ -265,6 +307,10 @@ func TestAgents(t *testing.T) { // then require.NoError(t, err) + + var agentsUp bool + var agentsConnections bool + var agentsApps bool require.Eventually(t, func() bool { metrics, err := registry.Gather() assert.NoError(t, err) @@ -273,9 +319,34 @@ func TestAgents(t *testing.T) { return false } - for _, metric := range metrics[0].Metric { - panic(metric) + for _, metric := range metrics { + switch metric.GetName() { + case "coderd_agents_up": + assert.Equal(t, "testuser", metric.Metric[0].Label[0].GetValue()) // Username + assert.Equal(t, workspace.Name, metric.Metric[0].Label[1].GetValue()) // Workspace name + assert.Equal(t, 1, int(metric.Metric[0].Gauge.GetValue())) // Metric value + agentsUp = true + case "coderd_agents_connections": + assert.Equal(t, "testagent", metric.Metric[0].Label[0].GetValue()) // Agent name + assert.Equal(t, "created", metric.Metric[0].Label[1].GetValue()) // Lifecycle state + assert.Equal(t, "connecting", metric.Metric[0].Label[2].GetValue()) // Status + assert.Equal(t, "unknown", metric.Metric[0].Label[3].GetValue()) // Tailnet node + assert.Equal(t, "testuser", metric.Metric[0].Label[4].GetValue()) // Username + assert.Equal(t, workspace.Name, metric.Metric[0].Label[5].GetValue()) // Workspace name + assert.Equal(t, 1, int(metric.Metric[0].Gauge.GetValue())) // Metric value + agentsConnections = true + case "coderd_agents_apps": + assert.Equal(t, "testagent", metric.Metric[0].Label[0].GetValue()) // Agent name + assert.Equal(t, "Fake application", metric.Metric[0].Label[1].GetValue()) // App name + assert.Equal(t, "disabled", metric.Metric[0].Label[2].GetValue()) // Health + assert.Equal(t, "testuser", metric.Metric[0].Label[3].GetValue()) // Username + assert.Equal(t, workspace.Name, metric.Metric[0].Label[4].GetValue()) // Workspace name + assert.Equal(t, 1, int(metric.Metric[0].Gauge.GetValue())) // Metric value + agentsApps = true + default: + require.FailNowf(t, "unexpected metric collected", "metric: %s", metric.GetName()) + } } - return true + return agentsUp && agentsConnections && agentsApps }, testutil.WaitShort, testutil.IntervalFast) } From 7acbaf09ac86a9179d75949588099fecb708b3ad Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Wed, 5 Apr 2023 14:19:39 +0200 Subject: [PATCH 10/40] docs --- docs/admin/prometheus.md | 4 ++++ scripts/metricsdocgen/metrics | 20 ++++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/docs/admin/prometheus.md b/docs/admin/prometheus.md index f35ba5d1c5182..e6b23a12702f4 100644 --- a/docs/admin/prometheus.md +++ b/docs/admin/prometheus.md @@ -31,6 +31,10 @@ The environment variable `CODER_PROMETHEUS_ENABLE` will be enabled automatically | Name | Type | Description | Labels | | -------------------------------------------- | --------- | ------------------------------------------------------------------ | ----------------------------------------------------------------------------------- | +| `coderd_agents_apps` | gauge | Agent applications with statuses. | `agent_name` `app_name` `health` `username` `workspace_name` | +| `coderd_agents_connection_latencies_seconds` | gauge | Agent connection latencies in seconds. | `agent_id` `derp_region` `preferred` `username` `workspace_name` | +| `coderd_agents_connections` | gauge | Agent connections with statuses. | `agent_name` `lifecycle_state` `status` `tailnet_node` `username` `workspace_name` | +| `coderd_agents_up` | gauge | The number of active agents per workspace. | `username` `workspace_name` | | `coderd_api_active_users_duration_hour` | gauge | The number of users that have been active within the last hour. | | | `coderd_api_concurrent_requests` | gauge | The number of concurrent API requests. | | | `coderd_api_concurrent_websockets` | gauge | The total number of concurrent API websockets. | | diff --git a/scripts/metricsdocgen/metrics b/scripts/metricsdocgen/metrics index 50bbc87990dda..9a5fc20dff8e1 100644 --- a/scripts/metricsdocgen/metrics +++ b/scripts/metricsdocgen/metrics @@ -1,3 +1,23 @@ +# HELP coderd_agents_apps Agent applications with statuses. +# TYPE coderd_agents_apps gauge +coderd_agents_apps{agent_name="main",app_name="code-server",health="healthy",username="admin",workspace_name="workspace-1"} 1 +coderd_agents_apps{agent_name="main",app_name="code-server",health="healthy",username="admin",workspace_name="workspace-2"} 1 +coderd_agents_apps{agent_name="main",app_name="code-server",health="healthy",username="admin",workspace_name="workspace-3"} 1 +# HELP coderd_agents_connection_latencies_seconds Agent connection latencies in seconds. +# TYPE coderd_agents_connection_latencies_seconds gauge +coderd_agents_connection_latencies_seconds{agent_id="main",derp_region="Coder Embedded Relay",preferred="true",username="admin",workspace_name="workspace-1"} 0.03018125 +coderd_agents_connection_latencies_seconds{agent_id="main",derp_region="Coder Embedded Relay",preferred="true",username="admin",workspace_name="workspace-2"} 0.028658416 +coderd_agents_connection_latencies_seconds{agent_id="main",derp_region="Coder Embedded Relay",preferred="true",username="admin",workspace_name="workspace-3"} 0.028041416 +# HELP coderd_agents_connections Agent connections with statuses. +# TYPE coderd_agents_connections gauge +coderd_agents_connections{agent_name="main",lifecycle_state="ready",status="connected",tailnet_node="nodeid:16966f7df70d8cc5",username="admin",workspace_name="workspace-3"} 1 +coderd_agents_connections{agent_name="main",lifecycle_state="start_timeout",status="connected",tailnet_node="nodeid:3237d00938be23e3",username="admin",workspace_name="workspace-2"} 1 +coderd_agents_connections{agent_name="main",lifecycle_state="start_timeout",status="connected",tailnet_node="nodeid:3779bd45d00be0eb",username="admin",workspace_name="workspace-1"} 1 +# HELP coderd_agents_up The number of active agents per workspace. +# TYPE coderd_agents_up gauge +coderd_agents_up{username="admin",workspace_name="workspace-1"} 1 +coderd_agents_up{username="admin",workspace_name="workspace-2"} 1 +coderd_agents_up{username="admin",workspace_name="workspace-3"} 1 # HELP coderd_api_websocket_durations_seconds Websocket duration distribution of requests in seconds. # TYPE coderd_api_websocket_durations_seconds histogram coderd_api_websocket_durations_seconds_bucket{path="/api/v2/workspaceagents/me/coordinate",le="0.001"} 0 From 7418779f7b202e2df48e0fbca70906488d6db9ca Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Wed, 5 Apr 2023 14:26:14 +0200 Subject: [PATCH 11/40] fmt --- coderd/prometheusmetrics/prometheusmetrics_test.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/coderd/prometheusmetrics/prometheusmetrics_test.go b/coderd/prometheusmetrics/prometheusmetrics_test.go index e7829d8a34a18..f7c5fc689e937 100644 --- a/coderd/prometheusmetrics/prometheusmetrics_test.go +++ b/coderd/prometheusmetrics/prometheusmetrics_test.go @@ -3,11 +3,10 @@ package prometheusmetrics_test import ( "context" "database/sql" + "sync/atomic" "testing" "time" - "sync/atomic" - "github.com/google/uuid" "github.com/prometheus/client_golang/prometheus" "github.com/stretchr/testify/assert" From 3a8e4e6ddf9df8987858f8c30ac8c7f1d10c5495 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Thu, 6 Apr 2023 11:52:01 +0200 Subject: [PATCH 12/40] Add timer to measure the metrics collection --- coderd/prometheusmetrics/prometheusmetrics.go | 18 ++- .../prometheusmetrics_test.go | 5 +- docs/admin/prometheus.md | 105 +++++++++--------- scripts/metricsdocgen/metrics | 16 +++ 4 files changed, 90 insertions(+), 54 deletions(-) diff --git a/coderd/prometheusmetrics/prometheusmetrics.go b/coderd/prometheusmetrics/prometheusmetrics.go index 222809db00bf8..94c47230a4579 100644 --- a/coderd/prometheusmetrics/prometheusmetrics.go +++ b/coderd/prometheusmetrics/prometheusmetrics.go @@ -168,6 +168,18 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis return nil, err } + metricsCollectorAgents := prometheus.NewHistogram(prometheus.HistogramOpts{ + Namespace: "coderd", + Subsystem: "prometheusmetrics", + Name: "agents_execution_seconds", + Help: "Histogram for duration of agents metrics collection in seconds.", + Buckets: []float64{0.001, 0.005, 0.010, 0.025, 0.050, 0.100, 0.500, 1, 5, 10, 30}, + }) + err = registerer.Register(metricsCollectorAgents) + if err != nil { + return nil, err + } + // nolint:gocritic // Prometheus must collect metrics for all Coder users. ctx, cancelFunc := context.WithCancel(dbauthz.AsSystemRestricted(ctx)) ticker := time.NewTicker(duration) @@ -180,7 +192,8 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis case <-ticker.C: } - logger.Debug(ctx, "Collect agent metrics now") + logger.Debug(ctx, "Agent metrics collection is starting") + timer := prometheus.NewTimer(metricsCollectorAgents) workspaceRows, err := db.GetWorkspaces(ctx, database.GetWorkspacesParams{ AgentInactiveDisconnectTimeoutSeconds: int64(agentInactiveDisconnectTimeout.Seconds()), @@ -269,6 +282,9 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis } } } + + logger.Debug(ctx, "Agent metrics collection is done") + metricsCollectorAgents.Observe(timer.ObserveDuration().Seconds()) } }() return cancelFunc, nil diff --git a/coderd/prometheusmetrics/prometheusmetrics_test.go b/coderd/prometheusmetrics/prometheusmetrics_test.go index f7c5fc689e937..e765c5f2a1128 100644 --- a/coderd/prometheusmetrics/prometheusmetrics_test.go +++ b/coderd/prometheusmetrics/prometheusmetrics_test.go @@ -310,6 +310,7 @@ func TestAgents(t *testing.T) { var agentsUp bool var agentsConnections bool var agentsApps bool + var agentsExecutionInSeconds bool require.Eventually(t, func() bool { metrics, err := registry.Gather() assert.NoError(t, err) @@ -342,10 +343,12 @@ func TestAgents(t *testing.T) { assert.Equal(t, workspace.Name, metric.Metric[0].Label[4].GetValue()) // Workspace name assert.Equal(t, 1, int(metric.Metric[0].Gauge.GetValue())) // Metric value agentsApps = true + case "coderd_prometheusmetrics_agents_execution_seconds": + agentsExecutionInSeconds = true default: require.FailNowf(t, "unexpected metric collected", "metric: %s", metric.GetName()) } } - return agentsUp && agentsConnections && agentsApps + return agentsUp && agentsConnections && agentsApps && agentsExecutionInSeconds }, testutil.WaitShort, testutil.IntervalFast) } diff --git a/docs/admin/prometheus.md b/docs/admin/prometheus.md index e6b23a12702f4..2898f8f4a469c 100644 --- a/docs/admin/prometheus.md +++ b/docs/admin/prometheus.md @@ -29,57 +29,58 @@ The environment variable `CODER_PROMETHEUS_ENABLE` will be enabled automatically -| Name | Type | Description | Labels | -| -------------------------------------------- | --------- | ------------------------------------------------------------------ | ----------------------------------------------------------------------------------- | -| `coderd_agents_apps` | gauge | Agent applications with statuses. | `agent_name` `app_name` `health` `username` `workspace_name` | -| `coderd_agents_connection_latencies_seconds` | gauge | Agent connection latencies in seconds. | `agent_id` `derp_region` `preferred` `username` `workspace_name` | -| `coderd_agents_connections` | gauge | Agent connections with statuses. | `agent_name` `lifecycle_state` `status` `tailnet_node` `username` `workspace_name` | -| `coderd_agents_up` | gauge | The number of active agents per workspace. | `username` `workspace_name` | -| `coderd_api_active_users_duration_hour` | gauge | The number of users that have been active within the last hour. | | -| `coderd_api_concurrent_requests` | gauge | The number of concurrent API requests. | | -| `coderd_api_concurrent_websockets` | gauge | The total number of concurrent API websockets. | | -| `coderd_api_request_latencies_seconds` | histogram | Latency distribution of requests in seconds. | `method` `path` | -| `coderd_api_requests_processed_total` | counter | The total number of processed API requests | `code` `method` `path` | -| `coderd_api_websocket_durations_seconds` | histogram | Websocket duration distribution of requests in seconds. | `path` | -| `coderd_api_workspace_latest_build_total` | gauge | The latest workspace builds with a status. | `status` | -| `coderd_provisionerd_job_timings_seconds` | histogram | The provisioner job time duration in seconds. | `provisioner` `status` | -| `coderd_provisionerd_jobs_current` | gauge | The number of currently running provisioner jobs. | `provisioner` | -| `coderd_workspace_builds_total` | counter | The number of workspaces started, updated, or deleted. | `action` `owner_email` `status` `template_name` `template_version` `workspace_name` | -| `go_gc_duration_seconds` | summary | A summary of the pause duration of garbage collection cycles. | | -| `go_goroutines` | gauge | Number of goroutines that currently exist. | | -| `go_info` | gauge | Information about the Go environment. | `version` | -| `go_memstats_alloc_bytes` | gauge | Number of bytes allocated and still in use. | | -| `go_memstats_alloc_bytes_total` | counter | Total number of bytes allocated, even if freed. | | -| `go_memstats_buck_hash_sys_bytes` | gauge | Number of bytes used by the profiling bucket hash table. | | -| `go_memstats_frees_total` | counter | Total number of frees. | | -| `go_memstats_gc_sys_bytes` | gauge | Number of bytes used for garbage collection system metadata. | | -| `go_memstats_heap_alloc_bytes` | gauge | Number of heap bytes allocated and still in use. | | -| `go_memstats_heap_idle_bytes` | gauge | Number of heap bytes waiting to be used. | | -| `go_memstats_heap_inuse_bytes` | gauge | Number of heap bytes that are in use. | | -| `go_memstats_heap_objects` | gauge | Number of allocated objects. | | -| `go_memstats_heap_released_bytes` | gauge | Number of heap bytes released to OS. | | -| `go_memstats_heap_sys_bytes` | gauge | Number of heap bytes obtained from system. | | -| `go_memstats_last_gc_time_seconds` | gauge | Number of seconds since 1970 of last garbage collection. | | -| `go_memstats_lookups_total` | counter | Total number of pointer lookups. | | -| `go_memstats_mallocs_total` | counter | Total number of mallocs. | | -| `go_memstats_mcache_inuse_bytes` | gauge | Number of bytes in use by mcache structures. | | -| `go_memstats_mcache_sys_bytes` | gauge | Number of bytes used for mcache structures obtained from system. | | -| `go_memstats_mspan_inuse_bytes` | gauge | Number of bytes in use by mspan structures. | | -| `go_memstats_mspan_sys_bytes` | gauge | Number of bytes used for mspan structures obtained from system. | | -| `go_memstats_next_gc_bytes` | gauge | Number of heap bytes when next garbage collection will take place. | | -| `go_memstats_other_sys_bytes` | gauge | Number of bytes used for other system allocations. | | -| `go_memstats_stack_inuse_bytes` | gauge | Number of bytes in use by the stack allocator. | | -| `go_memstats_stack_sys_bytes` | gauge | Number of bytes obtained from system for stack allocator. | | -| `go_memstats_sys_bytes` | gauge | Number of bytes obtained from system. | | -| `go_threads` | gauge | Number of OS threads created. | | -| `process_cpu_seconds_total` | counter | Total user and system CPU time spent in seconds. | | -| `process_max_fds` | gauge | Maximum number of open file descriptors. | | -| `process_open_fds` | gauge | Number of open file descriptors. | | -| `process_resident_memory_bytes` | gauge | Resident memory size in bytes. | | -| `process_start_time_seconds` | gauge | Start time of the process since unix epoch in seconds. | | -| `process_virtual_memory_bytes` | gauge | Virtual memory size in bytes. | | -| `process_virtual_memory_max_bytes` | gauge | Maximum amount of virtual memory available in bytes. | | -| `promhttp_metric_handler_requests_in_flight` | gauge | Current number of scrapes being served. | | -| `promhttp_metric_handler_requests_total` | counter | Total number of scrapes by HTTP status code. | `code` | +| Name | Type | Description | Labels | +| --------------------------------------------------- | --------- | ------------------------------------------------------------------ | ----------------------------------------------------------------------------------- | +| `coderd_agents_apps` | gauge | Agent applications with statuses. | `agent_name` `app_name` `health` `username` `workspace_name` | +| `coderd_agents_connection_latencies_seconds` | gauge | Agent connection latencies in seconds. | `agent_id` `derp_region` `preferred` `username` `workspace_name` | +| `coderd_agents_connections` | gauge | Agent connections with statuses. | `agent_name` `lifecycle_state` `status` `tailnet_node` `username` `workspace_name` | +| `coderd_agents_up` | gauge | The number of active agents per workspace. | `username` `workspace_name` | +| `coderd_api_active_users_duration_hour` | gauge | The number of users that have been active within the last hour. | | +| `coderd_api_concurrent_requests` | gauge | The number of concurrent API requests. | | +| `coderd_api_concurrent_websockets` | gauge | The total number of concurrent API websockets. | | +| `coderd_api_request_latencies_seconds` | histogram | Latency distribution of requests in seconds. | `method` `path` | +| `coderd_api_requests_processed_total` | counter | The total number of processed API requests | `code` `method` `path` | +| `coderd_api_websocket_durations_seconds` | histogram | Websocket duration distribution of requests in seconds. | `path` | +| `coderd_api_workspace_latest_build_total` | gauge | The latest workspace builds with a status. | `status` | +| `coderd_metrics_collector_agents_execution_seconds` | histogram | Histogram for duration of agents metrics collection in seconds. | | +| `coderd_provisionerd_job_timings_seconds` | histogram | The provisioner job time duration in seconds. | `provisioner` `status` | +| `coderd_provisionerd_jobs_current` | gauge | The number of currently running provisioner jobs. | `provisioner` | +| `coderd_workspace_builds_total` | counter | The number of workspaces started, updated, or deleted. | `action` `owner_email` `status` `template_name` `template_version` `workspace_name` | +| `go_gc_duration_seconds` | summary | A summary of the pause duration of garbage collection cycles. | | +| `go_goroutines` | gauge | Number of goroutines that currently exist. | | +| `go_info` | gauge | Information about the Go environment. | `version` | +| `go_memstats_alloc_bytes` | gauge | Number of bytes allocated and still in use. | | +| `go_memstats_alloc_bytes_total` | counter | Total number of bytes allocated, even if freed. | | +| `go_memstats_buck_hash_sys_bytes` | gauge | Number of bytes used by the profiling bucket hash table. | | +| `go_memstats_frees_total` | counter | Total number of frees. | | +| `go_memstats_gc_sys_bytes` | gauge | Number of bytes used for garbage collection system metadata. | | +| `go_memstats_heap_alloc_bytes` | gauge | Number of heap bytes allocated and still in use. | | +| `go_memstats_heap_idle_bytes` | gauge | Number of heap bytes waiting to be used. | | +| `go_memstats_heap_inuse_bytes` | gauge | Number of heap bytes that are in use. | | +| `go_memstats_heap_objects` | gauge | Number of allocated objects. | | +| `go_memstats_heap_released_bytes` | gauge | Number of heap bytes released to OS. | | +| `go_memstats_heap_sys_bytes` | gauge | Number of heap bytes obtained from system. | | +| `go_memstats_last_gc_time_seconds` | gauge | Number of seconds since 1970 of last garbage collection. | | +| `go_memstats_lookups_total` | counter | Total number of pointer lookups. | | +| `go_memstats_mallocs_total` | counter | Total number of mallocs. | | +| `go_memstats_mcache_inuse_bytes` | gauge | Number of bytes in use by mcache structures. | | +| `go_memstats_mcache_sys_bytes` | gauge | Number of bytes used for mcache structures obtained from system. | | +| `go_memstats_mspan_inuse_bytes` | gauge | Number of bytes in use by mspan structures. | | +| `go_memstats_mspan_sys_bytes` | gauge | Number of bytes used for mspan structures obtained from system. | | +| `go_memstats_next_gc_bytes` | gauge | Number of heap bytes when next garbage collection will take place. | | +| `go_memstats_other_sys_bytes` | gauge | Number of bytes used for other system allocations. | | +| `go_memstats_stack_inuse_bytes` | gauge | Number of bytes in use by the stack allocator. | | +| `go_memstats_stack_sys_bytes` | gauge | Number of bytes obtained from system for stack allocator. | | +| `go_memstats_sys_bytes` | gauge | Number of bytes obtained from system. | | +| `go_threads` | gauge | Number of OS threads created. | | +| `process_cpu_seconds_total` | counter | Total user and system CPU time spent in seconds. | | +| `process_max_fds` | gauge | Maximum number of open file descriptors. | | +| `process_open_fds` | gauge | Number of open file descriptors. | | +| `process_resident_memory_bytes` | gauge | Resident memory size in bytes. | | +| `process_start_time_seconds` | gauge | Start time of the process since unix epoch in seconds. | | +| `process_virtual_memory_bytes` | gauge | Virtual memory size in bytes. | | +| `process_virtual_memory_max_bytes` | gauge | Maximum amount of virtual memory available in bytes. | | +| `promhttp_metric_handler_requests_in_flight` | gauge | Current number of scrapes being served. | | +| `promhttp_metric_handler_requests_total` | counter | Total number of scrapes by HTTP status code. | `code` | diff --git a/scripts/metricsdocgen/metrics b/scripts/metricsdocgen/metrics index 9a5fc20dff8e1..7e598b17abe56 100644 --- a/scripts/metricsdocgen/metrics +++ b/scripts/metricsdocgen/metrics @@ -588,6 +588,22 @@ coderd_api_requests_processed_total{code="401",method="POST",path="/api/v2/files # HELP coderd_api_workspace_latest_build_total The latest workspace builds with a status. # TYPE coderd_api_workspace_latest_build_total gauge coderd_api_workspace_latest_build_total{status="succeeded"} 1 +# HELP coderd_metrics_collector_agents_execution_seconds Histogram for duration of agents metrics collection in seconds. +# TYPE coderd_metrics_collector_agents_execution_seconds histogram +coderd_metrics_collector_agents_execution_seconds_bucket{le="0.001"} 0 +coderd_metrics_collector_agents_execution_seconds_bucket{le="0.005"} 0 +coderd_metrics_collector_agents_execution_seconds_bucket{le="0.01"} 0 +coderd_metrics_collector_agents_execution_seconds_bucket{le="0.025"} 0 +coderd_metrics_collector_agents_execution_seconds_bucket{le="0.05"} 2 +coderd_metrics_collector_agents_execution_seconds_bucket{le="0.1"} 2 +coderd_metrics_collector_agents_execution_seconds_bucket{le="0.5"} 2 +coderd_metrics_collector_agents_execution_seconds_bucket{le="1"} 2 +coderd_metrics_collector_agents_execution_seconds_bucket{le="5"} 2 +coderd_metrics_collector_agents_execution_seconds_bucket{le="10"} 2 +coderd_metrics_collector_agents_execution_seconds_bucket{le="30"} 2 +coderd_metrics_collector_agents_execution_seconds_bucket{le="+Inf"} 2 +coderd_metrics_collector_agents_execution_seconds_sum 0.0592915 +coderd_metrics_collector_agents_execution_seconds_count 2 # HELP coderd_provisionerd_job_timings_seconds The provisioner job time duration in seconds. # TYPE coderd_provisionerd_job_timings_seconds histogram coderd_provisionerd_job_timings_seconds_bucket{provisioner="terraform",status="success",le="1"} 0 From b5d0581caec093a09ae40e0477af35e095202e82 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Thu, 6 Apr 2023 13:26:11 +0200 Subject: [PATCH 13/40] Use CachedGaugeVec --- coderd/prometheusmetrics/collector.go | 80 +++++++++++++++++++ coderd/prometheusmetrics/prometheusmetrics.go | 40 +++++----- 2 files changed, 100 insertions(+), 20 deletions(-) create mode 100644 coderd/prometheusmetrics/collector.go diff --git a/coderd/prometheusmetrics/collector.go b/coderd/prometheusmetrics/collector.go new file mode 100644 index 0000000000000..b92f992a6a7ec --- /dev/null +++ b/coderd/prometheusmetrics/collector.go @@ -0,0 +1,80 @@ +package prometheusmetrics + +import ( + "sync" + + "github.com/prometheus/client_golang/prometheus" +) + +type CachedGaugeVec struct { + m sync.Mutex + + gaugeVec *prometheus.GaugeVec + records []vectorRecord +} + +var _ prometheus.Collector = new(CachedGaugeVec) + +type VectorOperation int + +const ( + VectorOperationAdd VectorOperation = iota + VectorOperationSet +) + +type vectorRecord struct { + operation VectorOperation + value float64 + labelValues []string +} + +func NewCachedGaugeVec(gaugeVec *prometheus.GaugeVec) *CachedGaugeVec { + return &CachedGaugeVec{ + gaugeVec: gaugeVec, + } +} + +func (v *CachedGaugeVec) Describe(desc chan<- *prometheus.Desc) { + v.m.Lock() + defer v.m.Unlock() + + v.gaugeVec.Describe(desc) +} + +func (v *CachedGaugeVec) Collect(ch chan<- prometheus.Metric) { + v.m.Lock() + defer v.m.Unlock() + + v.gaugeVec.Collect(ch) +} + +func (v *CachedGaugeVec) WithLabelValues(operation VectorOperation, value float64, labelValues ...string) { + v.m.Lock() + defer v.m.Unlock() + + v.records = append(v.records, vectorRecord{ + operation: operation, + value: value, + labelValues: labelValues, + }) +} + +func (v *CachedGaugeVec) Commit() { + v.m.Lock() + defer v.m.Unlock() + + v.gaugeVec.Reset() + for _, record := range v.records { + g := v.gaugeVec.WithLabelValues(record.labelValues...) + switch record.operation { + case VectorOperationAdd: + g.Add(record.value) + case VectorOperationSet: + g.Set(record.value) + default: + panic("unsupported vector operation") + } + } + + v.records = nil +} diff --git a/coderd/prometheusmetrics/prometheusmetrics.go b/coderd/prometheusmetrics/prometheusmetrics.go index 94c47230a4579..83e4af90d0765 100644 --- a/coderd/prometheusmetrics/prometheusmetrics.go +++ b/coderd/prometheusmetrics/prometheusmetrics.go @@ -124,45 +124,45 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis duration = 1 * time.Minute } - agentsGauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{ + agentsGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: "coderd", Subsystem: "agents", Name: "up", Help: "The number of active agents per workspace.", - }, []string{"username", "workspace_name"}) + }, []string{"username", "workspace_name"})) err := registerer.Register(agentsGauge) if err != nil { return nil, err } - agentsConnectionsGauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{ + agentsConnectionsGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: "coderd", Subsystem: "agents", Name: "connections", Help: "Agent connections with statuses.", - }, []string{"agent_name", "username", "workspace_name", "status", "lifecycle_state", "tailnet_node"}) + }, []string{"agent_name", "username", "workspace_name", "status", "lifecycle_state", "tailnet_node"})) err = registerer.Register(agentsConnectionsGauge) if err != nil { return nil, err } - agentsConnectionLatenciesGauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{ + agentsConnectionLatenciesGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: "coderd", Subsystem: "agents", Name: "connection_latencies_seconds", Help: "Agent connection latencies in seconds.", - }, []string{"agent_id", "username", "workspace_name", "derp_region", "preferred"}) + }, []string{"agent_id", "username", "workspace_name", "derp_region", "preferred"})) err = registerer.Register(agentsConnectionLatenciesGauge) if err != nil { return nil, err } - agentsAppsGauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{ + agentsAppsGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: "coderd", Subsystem: "agents", Name: "apps", Help: "Agent applications with statuses.", - }, []string{"agent_name", "username", "workspace_name", "app_name", "health"}) + }, []string{"agent_name", "username", "workspace_name", "app_name", "health"})) err = registerer.Register(agentsAppsGauge) if err != nil { return nil, err @@ -203,35 +203,30 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis continue } - agentsGauge.Reset() - agentsConnectionsGauge.Reset() - agentsConnectionLatenciesGauge.Reset() - agentsAppsGauge.Reset() - for _, workspace := range workspaceRows { user, err := db.GetUserByID(ctx, workspace.OwnerID) if err != nil { logger.Error(ctx, "can't get user", slog.F("user_id", workspace.OwnerID), slog.Error(err)) - agentsGauge.WithLabelValues(user.Username, workspace.Name).Add(0) + agentsGauge.WithLabelValues(VectorOperationAdd, 0, user.Username, workspace.Name) continue } agents, err := db.GetWorkspaceAgentsInLatestBuildByWorkspaceID(ctx, workspace.ID) if err != nil { logger.Error(ctx, "can't get workspace agents", slog.F("workspace_id", workspace.ID), slog.Error(err)) - agentsGauge.WithLabelValues(user.Username, workspace.Name).Add(0) + agentsGauge.WithLabelValues(VectorOperationAdd, 0, user.Username, workspace.Name) continue } if len(agents) == 0 { logger.Debug(ctx, "workspace agents are unavailable", slog.F("workspace_id", workspace.ID)) - agentsGauge.WithLabelValues(user.Username, workspace.Name).Add(0) + agentsGauge.WithLabelValues(VectorOperationAdd, 0, user.Username, workspace.Name) continue } for _, agent := range agents { // Collect information about agents - agentsGauge.WithLabelValues(user.Username, workspace.Name).Add(1) + agentsGauge.WithLabelValues(VectorOperationAdd, 1, user.Username, workspace.Name) connectionStatus := agent.Status(agentInactiveDisconnectTimeout) node := (*coordinator.Load()).Node(agent.ID) @@ -241,7 +236,7 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis tailnetNode = node.ID.String() } - agentsConnectionsGauge.WithLabelValues(agent.Name, user.Username, workspace.Name, string(connectionStatus.Status), string(agent.LifecycleState), tailnetNode).Set(1) + agentsConnectionsGauge.WithLabelValues(VectorOperationSet, 1, agent.Name, user.Username, workspace.Name, string(connectionStatus.Status), string(agent.LifecycleState), tailnetNode) if node == nil { logger.Debug(ctx, "can't read in-memory node for agent", slog.F("agent_id", agent.ID)) @@ -266,7 +261,7 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis } } - agentsConnectionLatenciesGauge.WithLabelValues(agent.Name, user.Username, workspace.Name, region.RegionName, fmt.Sprintf("%v", node.PreferredDERP == regionID)).Set(latency) + agentsConnectionLatenciesGauge.WithLabelValues(VectorOperationSet, latency, agent.Name, user.Username, workspace.Name, region.RegionName, fmt.Sprintf("%v", node.PreferredDERP == regionID)) } } @@ -278,11 +273,16 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis } for _, app := range apps { - agentsAppsGauge.WithLabelValues(agent.Name, user.Username, workspace.Name, app.DisplayName, string(app.Health)).Add(1) + agentsAppsGauge.WithLabelValues(VectorOperationAdd, 1, agent.Name, user.Username, workspace.Name, app.DisplayName, string(app.Health)) } } } + agentsGauge.Commit() + agentsConnectionsGauge.Commit() + agentsConnectionLatenciesGauge.Commit() + agentsAppsGauge.Commit() + logger.Debug(ctx, "Agent metrics collection is done") metricsCollectorAgents.Observe(timer.ObserveDuration().Seconds()) } From e4d708b8f36214712989605f1589ee0f1b9dc613 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Thu, 6 Apr 2023 14:40:36 +0200 Subject: [PATCH 14/40] Unit tests --- coderd/prometheusmetrics/collector_test.go | 140 +++++++++++++++++++++ 1 file changed, 140 insertions(+) create mode 100644 coderd/prometheusmetrics/collector_test.go diff --git a/coderd/prometheusmetrics/collector_test.go b/coderd/prometheusmetrics/collector_test.go new file mode 100644 index 0000000000000..9d63f6669113d --- /dev/null +++ b/coderd/prometheusmetrics/collector_test.go @@ -0,0 +1,140 @@ +package prometheusmetrics_test + +import ( + "sort" + "testing" + + "github.com/prometheus/client_golang/prometheus" + dto "github.com/prometheus/client_model/go" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/coder/coder/coderd/prometheusmetrics" +) + +func TestCollector_Add(t *testing.T) { + t.Parallel() + + // given + agentsGauge := prometheusmetrics.NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: "coderd", + Subsystem: "agents", + Name: "up", + Help: "The number of active agents per workspace.", + }, []string{"username", "workspace_name"})) + + // when + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 7, "first user", "my workspace") + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 23, "second user", "your workspace") + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 1, "first user", "my workspace") + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 25, "second user", "your workspace") + agentsGauge.Commit() + + // then + ch := make(chan prometheus.Metric, 2) + agentsGauge.Collect(ch) + + metrics := collectAndSortMetrics(t, agentsGauge, 2) + + assert.Equal(t, "first user", metrics[0].Label[0].GetValue()) // Username + assert.Equal(t, "my workspace", metrics[0].Label[1].GetValue()) // Workspace name + assert.Equal(t, 8, int(metrics[0].Gauge.GetValue())) // Metric value + + assert.Equal(t, "second user", metrics[1].Label[0].GetValue()) // Username + assert.Equal(t, "your workspace", metrics[1].Label[1].GetValue()) // Workspace name + assert.Equal(t, 48, int(metrics[1].Gauge.GetValue())) // Metric value +} + +func TestCollector_Set(t *testing.T) { + t.Parallel() + + // given + agentsGauge := prometheusmetrics.NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: "coderd", + Subsystem: "agents", + Name: "up", + Help: "The number of active agents per workspace.", + }, []string{"username", "workspace_name"})) + + // when + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationSet, 3, "first user", "my workspace") + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationSet, 4, "second user", "your workspace") + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationSet, 5, "first user", "my workspace") + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationSet, 6, "second user", "your workspace") + agentsGauge.Commit() + + // then + ch := make(chan prometheus.Metric, 2) + agentsGauge.Collect(ch) + + metrics := collectAndSortMetrics(t, agentsGauge, 2) + + assert.Equal(t, "first user", metrics[0].Label[0].GetValue()) // Username + assert.Equal(t, "my workspace", metrics[0].Label[1].GetValue()) // Workspace name + assert.Equal(t, 5, int(metrics[0].Gauge.GetValue())) // Metric value + + assert.Equal(t, "second user", metrics[1].Label[0].GetValue()) // Username + assert.Equal(t, "your workspace", metrics[1].Label[1].GetValue()) // Workspace name + assert.Equal(t, 6, int(metrics[1].Gauge.GetValue())) // Metric value +} + +func TestCollector_Set_Add(t *testing.T) { + t.Parallel() + + // given + agentsGauge := prometheusmetrics.NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: "coderd", + Subsystem: "agents", + Name: "up", + Help: "The number of active agents per workspace.", + }, []string{"username", "workspace_name"})) + + // when + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 9, "first user", "my workspace") + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 8, "second user", "your workspace") + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 7, "first user", "my workspace") + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 6, "second user", "your workspace") + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationSet, 5, "first user", "my workspace") + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationSet, 4, "second user", "your workspace") + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 3, "first user", "my workspace") + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 2, "second user", "your workspace") + agentsGauge.Commit() + + // then + ch := make(chan prometheus.Metric, 2) + agentsGauge.Collect(ch) + + metrics := collectAndSortMetrics(t, agentsGauge, 2) + + assert.Equal(t, "first user", metrics[0].Label[0].GetValue()) // Username + assert.Equal(t, "my workspace", metrics[0].Label[1].GetValue()) // Workspace name + assert.Equal(t, 8, int(metrics[0].Gauge.GetValue())) // Metric value + + assert.Equal(t, "second user", metrics[1].Label[0].GetValue()) // Username + assert.Equal(t, "your workspace", metrics[1].Label[1].GetValue()) // Workspace name + assert.Equal(t, 6, int(metrics[1].Gauge.GetValue())) // Metric value +} + +func collectAndSortMetrics(t *testing.T, collector prometheus.Collector, count int) []dto.Metric { + ch := make(chan prometheus.Metric, count) + defer close(ch) + + var metrics []dto.Metric + + collector.Collect(ch) + for i := 0; i < count; i++ { + m := <-ch + + var metric dto.Metric + err := m.Write(&metric) + require.NoError(t, err) + + metrics = append(metrics, metric) + } + + // Ensure always the same order of metrics + sort.Slice(metrics, func(i, j int) bool { + return sort.StringsAreSorted([]string{metrics[i].Label[0].GetValue(), metrics[j].Label[1].GetValue()}) + }) + return metrics +} From 199e549a18f4a7d73185437a2315f1fd23c03617 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Fri, 7 Apr 2023 14:46:56 +0200 Subject: [PATCH 15/40] WIP --- coderd/prometheusmetrics/agentstats.go | 51 ++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 coderd/prometheusmetrics/agentstats.go diff --git a/coderd/prometheusmetrics/agentstats.go b/coderd/prometheusmetrics/agentstats.go new file mode 100644 index 0000000000000..1d53035f5375c --- /dev/null +++ b/coderd/prometheusmetrics/agentstats.go @@ -0,0 +1,51 @@ +package prometheusmetrics + +import ( + "context" + "time" + + "github.com/prometheus/client_golang/prometheus" + + "cdr.dev/slog" + "github.com/coder/coder/coderd/database" + "github.com/coder/coder/coderd/database/dbauthz" +) + +func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, duration time.Duration) (context.CancelFunc, error) { + if duration == 0 { + duration = 1 * time.Minute + } + + metricsCollectorAgentStats := prometheus.NewHistogram(prometheus.HistogramOpts{ + Namespace: "coderd", + Subsystem: "prometheusmetrics", + Name: "agentstats_execution_seconds", + Help: "Histogram for duration of agent stats metrics collection in seconds.", + Buckets: []float64{0.001, 0.005, 0.010, 0.025, 0.050, 0.100, 0.500, 1, 5, 10, 30}, + }) + err := registerer.Register(metricsCollectorAgentStats) + if err != nil { + return nil, err + } + + ctx, cancelFunc := context.WithCancel(dbauthz.AsSystemRestricted(ctx)) + ticker := time.NewTicker(duration) + go func() { + defer ticker.Stop() + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + } + + logger.Debug(ctx, "Agent metrics collection is starting") + timer := prometheus.NewTimer(metricsCollectorAgentStats) + + logger.Debug(ctx, "Agent metrics collection is done") + metricsCollectorAgentStats.Observe(timer.ObserveDuration().Seconds()) + } + }() + return cancelFunc, nil + +} From d0b839881a52c14737d3f539a179625d62ad28a0 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Thu, 13 Apr 2023 08:39:38 +0200 Subject: [PATCH 16/40] WIP --- cli/server.go | 6 +++ coderd/prometheusmetrics/prometheusmetrics.go | 52 ++++++++++++++++++- 2 files changed, 57 insertions(+), 1 deletion(-) diff --git a/cli/server.go b/cli/server.go index 3726a17a1399a..f4ef841109eae 100644 --- a/cli/server.go +++ b/cli/server.go @@ -844,6 +844,12 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd. } defer closeWorkspacesFunc() + closeAgentStatsFunc, err := prometheusmetrics.AgentStats(ctx, logger, options.PrometheusRegistry, options.Database, 0) + if err != nil { + return xerrors.Errorf("register agent stats prometheus metric: %w", err) + } + defer closeAgentStatsFunc() + //nolint:revive defer serveHandler(ctx, logger, promhttp.InstrumentMetricHandler( options.PrometheusRegistry, promhttp.HandlerFor(options.PrometheusRegistry, promhttp.HandlerOpts{}), diff --git a/coderd/prometheusmetrics/prometheusmetrics.go b/coderd/prometheusmetrics/prometheusmetrics.go index 83e4af90d0765..dac62515c6917 100644 --- a/coderd/prometheusmetrics/prometheusmetrics.go +++ b/coderd/prometheusmetrics/prometheusmetrics.go @@ -200,7 +200,7 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis }) if err != nil { logger.Error(ctx, "can't get workspace rows", slog.Error(err)) - continue + goto done } for _, workspace := range workspaceRows { @@ -283,9 +283,59 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis agentsConnectionLatenciesGauge.Commit() agentsAppsGauge.Commit() + done: logger.Debug(ctx, "Agent metrics collection is done") metricsCollectorAgents.Observe(timer.ObserveDuration().Seconds()) } }() return cancelFunc, nil } + +func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, duration time.Duration) (context.CancelFunc, error) { + if duration == 0 { + duration = 1 * time.Minute + } + + metricsCollectorAgentStats := prometheus.NewHistogram(prometheus.HistogramOpts{ + Namespace: "coderd", + Subsystem: "prometheusmetrics", + Name: "agentstats_execution_seconds", + Help: "Histogram for duration of agent stats metrics collection in seconds.", + Buckets: []float64{0.001, 0.005, 0.010, 0.025, 0.050, 0.100, 0.500, 1, 5, 10, 30}, + }) + err := registerer.Register(metricsCollectorAgentStats) + if err != nil { + return nil, err + } + + createdAfter := database.Now().Add(-duration) + ctx, cancelFunc := context.WithCancel(ctx) + ticker := time.NewTicker(duration) + go func() { + defer ticker.Stop() + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + } + + logger.Debug(ctx, "Agent metrics collection is starting") + timer := prometheus.NewTimer(metricsCollectorAgentStats) + + _, err := db.GetWorkspaceAgentStats(ctx, createdAfter) + if err != nil { + logger.Error(ctx, "can't get agent stats", slog.Error(err)) + goto done + } + + db.GetWorkspAgents + + done: + logger.Debug(ctx, "Agent metrics collection is done") + metricsCollectorAgentStats.Observe(timer.ObserveDuration().Seconds()) + } + }() + return cancelFunc, nil + +} From f0c041844e5168ee79b99ec7009dce5b734b1322 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Thu, 13 Apr 2023 12:25:56 +0200 Subject: [PATCH 17/40] db: GetWorkspaceAgentStatsAndLabels --- coderd/database/dbauthz/system.go | 4 + coderd/database/dbfake/databasefake.go | 4 + coderd/database/querier.go | 1 + coderd/database/queries.sql.go | 85 +++++++++++++++++++ .../database/queries/workspaceagentstats.sql | 35 ++++++++ coderd/prometheusmetrics/agentstats.go | 51 ----------- coderd/prometheusmetrics/prometheusmetrics.go | 2 - 7 files changed, 129 insertions(+), 53 deletions(-) delete mode 100644 coderd/prometheusmetrics/agentstats.go diff --git a/coderd/database/dbauthz/system.go b/coderd/database/dbauthz/system.go index dd47cb635b080..90e3afc500969 100644 --- a/coderd/database/dbauthz/system.go +++ b/coderd/database/dbauthz/system.go @@ -302,6 +302,10 @@ func (q *querier) GetWorkspaceAgentStats(ctx context.Context, createdAfter time. return q.db.GetWorkspaceAgentStats(ctx, createdAfter) } +func (q *querier) GetWorkspaceAgentStatsAndLabels(ctx context.Context, createdAfter time.Time) ([]database.GetWorkspaceAgentStatsAndLabelsRow, error) { + return q.db.GetWorkspaceAgentStatsAndLabels(ctx, createdAfter) +} + func (q *querier) GetDeploymentWorkspaceStats(ctx context.Context) (database.GetDeploymentWorkspaceStatsRow, error) { return q.db.GetDeploymentWorkspaceStats(ctx) } diff --git a/coderd/database/dbfake/databasefake.go b/coderd/database/dbfake/databasefake.go index cb4ada860a8a2..9cb4893c3e440 100644 --- a/coderd/database/dbfake/databasefake.go +++ b/coderd/database/dbfake/databasefake.go @@ -3998,6 +3998,10 @@ func (q *fakeQuerier) GetWorkspaceAgentStats(_ context.Context, createdAfter tim return stats, nil } +func (q *fakeQuerier) GetWorkspaceAgentStatsAndLabels(_ context.Context, createdAfter time.Time) ([]database.GetWorkspaceAgentStatsAndLabelsRow, error) { + panic("not implemented yet") +} + func (q *fakeQuerier) GetWorkspacesEligibleForAutoStartStop(ctx context.Context, now time.Time) ([]database.Workspace, error) { q.mutex.RLock() defer q.mutex.RUnlock() diff --git a/coderd/database/querier.go b/coderd/database/querier.go index 5151aead8064c..ba7ad1a98e5a8 100644 --- a/coderd/database/querier.go +++ b/coderd/database/querier.go @@ -130,6 +130,7 @@ type sqlcQuerier interface { GetWorkspaceAgentMetadata(ctx context.Context, workspaceAgentID uuid.UUID) ([]WorkspaceAgentMetadatum, error) GetWorkspaceAgentStartupLogsAfter(ctx context.Context, arg GetWorkspaceAgentStartupLogsAfterParams) ([]WorkspaceAgentStartupLog, error) GetWorkspaceAgentStats(ctx context.Context, createdAt time.Time) ([]GetWorkspaceAgentStatsRow, error) + GetWorkspaceAgentStatsAndLabels(ctx context.Context, createdAt time.Time) ([]GetWorkspaceAgentStatsAndLabelsRow, error) GetWorkspaceAgentsByResourceIDs(ctx context.Context, ids []uuid.UUID) ([]WorkspaceAgent, error) GetWorkspaceAgentsCreatedAfter(ctx context.Context, createdAt time.Time) ([]WorkspaceAgent, error) GetWorkspaceAgentsInLatestBuildByWorkspaceID(ctx context.Context, workspaceID uuid.UUID) ([]WorkspaceAgent, error) diff --git a/coderd/database/queries.sql.go b/coderd/database/queries.sql.go index 947371b8f69db..2cfa2f47fdade 100644 --- a/coderd/database/queries.sql.go +++ b/coderd/database/queries.sql.go @@ -6374,6 +6374,91 @@ func (q *sqlQuerier) GetWorkspaceAgentStats(ctx context.Context, createdAt time. return items, nil } +const getWorkspaceAgentStatsAndLabels = `-- name: GetWorkspaceAgentStatsAndLabels :many +WITH agent_stats AS ( + SELECT + user_id, + agent_id, + workspace_id, + coalesce(SUM(rx_bytes), 0)::bigint AS workspace_rx_bytes, + coalesce(SUM(tx_bytes), 0)::bigint AS workspace_tx_bytes + FROM workspace_agent_stats + WHERE workspace_agent_stats.created_at > $1 + GROUP BY user_id, agent_id, workspace_id +), latest_agent_stats AS ( + SELECT + a.agent_id, + coalesce(SUM(session_count_vscode), 0)::bigint AS session_count_vscode, + coalesce(SUM(session_count_ssh), 0)::bigint AS session_count_ssh, + coalesce(SUM(session_count_jetbrains), 0)::bigint AS session_count_jetbrains, + coalesce(SUM(session_count_reconnecting_pty), 0)::bigint AS session_count_reconnecting_pty, + coalesce(SUM(connection_count), 0)::bigint AS connection_count, + coalesce(SUM(connection_median_latency_ms), 0)::float AS connection_median_latency_ms + FROM ( + SELECT id, created_at, user_id, agent_id, workspace_id, template_id, connections_by_proto, connection_count, rx_packets, rx_bytes, tx_packets, tx_bytes, connection_median_latency_ms, session_count_vscode, session_count_jetbrains, session_count_reconnecting_pty, session_count_ssh, ROW_NUMBER() OVER(PARTITION BY agent_id ORDER BY created_at DESC) AS rn + FROM workspace_agent_stats + -- The greater than 0 is to support legacy agents that don't report connection_median_latency_ms. + WHERE created_at > $1 AND connection_median_latency_ms > 0 + ) AS a + WHERE a.rn = 1 + GROUP BY a.user_id, a.agent_id, a.workspace_id +) +SELECT users.username, workspace_agents.name AS agent_name, workspaces.name AS workspace_name, workspace_rx_bytes, workspace_tx_bytes, session_count_vscode, session_count_ssh, session_count_jetbrains, session_count_reconnecting_pty, connection_count, connection_median_latency_ms +FROM agent_stats JOIN latest_agent_stats ON agent_stats.agent_id = latest_agent_stats.agent_id +JOIN users ON users.id = agent_stats.user_id +JOIN workspace_agents ON workspace_agents.id = agent_stats.agent_id +JOIN workspaces ON workspaces.id = agent_stats.workspace_id +` + +type GetWorkspaceAgentStatsAndLabelsRow struct { + Username string `db:"username" json:"username"` + AgentName string `db:"agent_name" json:"agent_name"` + WorkspaceName string `db:"workspace_name" json:"workspace_name"` + WorkspaceRxBytes int64 `db:"workspace_rx_bytes" json:"workspace_rx_bytes"` + WorkspaceTxBytes int64 `db:"workspace_tx_bytes" json:"workspace_tx_bytes"` + SessionCountVSCode int64 `db:"session_count_vscode" json:"session_count_vscode"` + SessionCountSSH int64 `db:"session_count_ssh" json:"session_count_ssh"` + SessionCountJetBrains int64 `db:"session_count_jetbrains" json:"session_count_jetbrains"` + SessionCountReconnectingPTY int64 `db:"session_count_reconnecting_pty" json:"session_count_reconnecting_pty"` + ConnectionCount int64 `db:"connection_count" json:"connection_count"` + ConnectionMedianLatencyMS float64 `db:"connection_median_latency_ms" json:"connection_median_latency_ms"` +} + +func (q *sqlQuerier) GetWorkspaceAgentStatsAndLabels(ctx context.Context, createdAt time.Time) ([]GetWorkspaceAgentStatsAndLabelsRow, error) { + rows, err := q.db.QueryContext(ctx, getWorkspaceAgentStatsAndLabels, createdAt) + if err != nil { + return nil, err + } + defer rows.Close() + var items []GetWorkspaceAgentStatsAndLabelsRow + for rows.Next() { + var i GetWorkspaceAgentStatsAndLabelsRow + if err := rows.Scan( + &i.Username, + &i.AgentName, + &i.WorkspaceName, + &i.WorkspaceRxBytes, + &i.WorkspaceTxBytes, + &i.SessionCountVSCode, + &i.SessionCountSSH, + &i.SessionCountJetBrains, + &i.SessionCountReconnectingPTY, + &i.ConnectionCount, + &i.ConnectionMedianLatencyMS, + ); err != nil { + return nil, err + } + items = append(items, i) + } + if err := rows.Close(); err != nil { + return nil, err + } + if err := rows.Err(); err != nil { + return nil, err + } + return items, nil +} + const insertWorkspaceAgentStat = `-- name: InsertWorkspaceAgentStat :one INSERT INTO workspace_agent_stats ( diff --git a/coderd/database/queries/workspaceagentstats.sql b/coderd/database/queries/workspaceagentstats.sql index 2cfaa8fef9737..c21ac1f3dd8ce 100644 --- a/coderd/database/queries/workspaceagentstats.sql +++ b/coderd/database/queries/workspaceagentstats.sql @@ -103,3 +103,38 @@ WITH agent_stats AS ( ) AS a WHERE a.rn = 1 GROUP BY a.user_id, a.agent_id, a.workspace_id, a.template_id ) SELECT * FROM agent_stats JOIN latest_agent_stats ON agent_stats.agent_id = latest_agent_stats.agent_id; + +-- name: GetWorkspaceAgentStatsAndLabels :many +WITH agent_stats AS ( + SELECT + user_id, + agent_id, + workspace_id, + coalesce(SUM(rx_bytes), 0)::bigint AS workspace_rx_bytes, + coalesce(SUM(tx_bytes), 0)::bigint AS workspace_tx_bytes + FROM workspace_agent_stats + WHERE workspace_agent_stats.created_at > $1 + GROUP BY user_id, agent_id, workspace_id +), latest_agent_stats AS ( + SELECT + a.agent_id, + coalesce(SUM(session_count_vscode), 0)::bigint AS session_count_vscode, + coalesce(SUM(session_count_ssh), 0)::bigint AS session_count_ssh, + coalesce(SUM(session_count_jetbrains), 0)::bigint AS session_count_jetbrains, + coalesce(SUM(session_count_reconnecting_pty), 0)::bigint AS session_count_reconnecting_pty, + coalesce(SUM(connection_count), 0)::bigint AS connection_count, + coalesce(SUM(connection_median_latency_ms), 0)::float AS connection_median_latency_ms + FROM ( + SELECT *, ROW_NUMBER() OVER(PARTITION BY agent_id ORDER BY created_at DESC) AS rn + FROM workspace_agent_stats + -- The greater than 0 is to support legacy agents that don't report connection_median_latency_ms. + WHERE created_at > $1 AND connection_median_latency_ms > 0 + ) AS a + WHERE a.rn = 1 + GROUP BY a.user_id, a.agent_id, a.workspace_id +) +SELECT users.username, workspace_agents.name AS agent_name, workspaces.name AS workspace_name, workspace_rx_bytes, workspace_tx_bytes, session_count_vscode, session_count_ssh, session_count_jetbrains, session_count_reconnecting_pty, connection_count, connection_median_latency_ms +FROM agent_stats JOIN latest_agent_stats ON agent_stats.agent_id = latest_agent_stats.agent_id +JOIN users ON users.id = agent_stats.user_id +JOIN workspace_agents ON workspace_agents.id = agent_stats.agent_id +JOIN workspaces ON workspaces.id = agent_stats.workspace_id; diff --git a/coderd/prometheusmetrics/agentstats.go b/coderd/prometheusmetrics/agentstats.go deleted file mode 100644 index 1d53035f5375c..0000000000000 --- a/coderd/prometheusmetrics/agentstats.go +++ /dev/null @@ -1,51 +0,0 @@ -package prometheusmetrics - -import ( - "context" - "time" - - "github.com/prometheus/client_golang/prometheus" - - "cdr.dev/slog" - "github.com/coder/coder/coderd/database" - "github.com/coder/coder/coderd/database/dbauthz" -) - -func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, duration time.Duration) (context.CancelFunc, error) { - if duration == 0 { - duration = 1 * time.Minute - } - - metricsCollectorAgentStats := prometheus.NewHistogram(prometheus.HistogramOpts{ - Namespace: "coderd", - Subsystem: "prometheusmetrics", - Name: "agentstats_execution_seconds", - Help: "Histogram for duration of agent stats metrics collection in seconds.", - Buckets: []float64{0.001, 0.005, 0.010, 0.025, 0.050, 0.100, 0.500, 1, 5, 10, 30}, - }) - err := registerer.Register(metricsCollectorAgentStats) - if err != nil { - return nil, err - } - - ctx, cancelFunc := context.WithCancel(dbauthz.AsSystemRestricted(ctx)) - ticker := time.NewTicker(duration) - go func() { - defer ticker.Stop() - for { - select { - case <-ctx.Done(): - return - case <-ticker.C: - } - - logger.Debug(ctx, "Agent metrics collection is starting") - timer := prometheus.NewTimer(metricsCollectorAgentStats) - - logger.Debug(ctx, "Agent metrics collection is done") - metricsCollectorAgentStats.Observe(timer.ObserveDuration().Seconds()) - } - }() - return cancelFunc, nil - -} diff --git a/coderd/prometheusmetrics/prometheusmetrics.go b/coderd/prometheusmetrics/prometheusmetrics.go index dac62515c6917..f97c5feda3b99 100644 --- a/coderd/prometheusmetrics/prometheusmetrics.go +++ b/coderd/prometheusmetrics/prometheusmetrics.go @@ -329,8 +329,6 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R goto done } - db.GetWorkspAgents - done: logger.Debug(ctx, "Agent metrics collection is done") metricsCollectorAgentStats.Observe(timer.ObserveDuration().Seconds()) From 970d35a8f4d611723d0bb8dce86a874436864419 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Thu, 13 Apr 2023 12:34:05 +0200 Subject: [PATCH 18/40] fmt --- coderd/prometheusmetrics/prometheusmetrics.go | 1 - 1 file changed, 1 deletion(-) diff --git a/coderd/prometheusmetrics/prometheusmetrics.go b/coderd/prometheusmetrics/prometheusmetrics.go index f97c5feda3b99..9da151f8ae964 100644 --- a/coderd/prometheusmetrics/prometheusmetrics.go +++ b/coderd/prometheusmetrics/prometheusmetrics.go @@ -335,5 +335,4 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R } }() return cancelFunc, nil - } From 229f546862c13669d9257f10996a518b5715e1fa Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Thu, 13 Apr 2023 13:06:32 +0200 Subject: [PATCH 19/40] WIP --- coderd/prometheusmetrics/prometheusmetrics.go | 34 +++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/coderd/prometheusmetrics/prometheusmetrics.go b/coderd/prometheusmetrics/prometheusmetrics.go index 9da151f8ae964..1f6ff0ca23e99 100644 --- a/coderd/prometheusmetrics/prometheusmetrics.go +++ b/coderd/prometheusmetrics/prometheusmetrics.go @@ -151,7 +151,7 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis Subsystem: "agents", Name: "connection_latencies_seconds", Help: "Agent connection latencies in seconds.", - }, []string{"agent_id", "username", "workspace_name", "derp_region", "preferred"})) + }, []string{"agent_name", "username", "workspace_name", "derp_region", "preferred"})) err = registerer.Register(agentsConnectionLatenciesGauge) if err != nil { return nil, err @@ -308,6 +308,28 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R return nil, err } + agentStatsTxBytesGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: "coderd", + Subsystem: "agentstats", + Name: "tx_bytes", + Help: "Agent Tx bytes", + }, []string{"agent_name", "username", "workspace_name"})) + err = registerer.Register(agentStatsTxBytesGauge) + if err != nil { + return nil, err + } + + agentStatsRxBytesGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: "coderd", + Subsystem: "agentstats", + Name: "rx_bytes", + Help: "Agent Rx bytes", + }, []string{"agent_name", "username", "workspace_name"})) + err = registerer.Register(agentStatsRxBytesGauge) + if err != nil { + return nil, err + } + createdAfter := database.Now().Add(-duration) ctx, cancelFunc := context.WithCancel(ctx) ticker := time.NewTicker(duration) @@ -323,12 +345,20 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R logger.Debug(ctx, "Agent metrics collection is starting") timer := prometheus.NewTimer(metricsCollectorAgentStats) - _, err := db.GetWorkspaceAgentStats(ctx, createdAfter) + stats, err := db.GetWorkspaceAgentStatsAndLabels(ctx, createdAfter) if err != nil { logger.Error(ctx, "can't get agent stats", slog.Error(err)) goto done } + for _, agentStat := range stats { + agentStatsRxBytesGauge.WithLabelValues(VectorOperationAdd, float64(agentStat.WorkspaceTxBytes), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName) + agentStatsTxBytesGauge.WithLabelValues(VectorOperationAdd, float64(agentStat.WorkspaceRxBytes), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName) + } + + agentStatsRxBytesGauge.Commit() + agentStatsTxBytesGauge.Commit() + done: logger.Debug(ctx, "Agent metrics collection is done") metricsCollectorAgentStats.Observe(timer.ObserveDuration().Seconds()) From 8c6f96bbe3b64b223fb89d86441bb49b8ec4dd45 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Thu, 13 Apr 2023 13:57:31 +0200 Subject: [PATCH 20/40] gauges --- coderd/prometheusmetrics/prometheusmetrics.go | 86 ++++++++++++++++++- docs/admin/prometheus.md | 10 ++- scripts/metricsdocgen/metrics | 30 ++++++- 3 files changed, 121 insertions(+), 5 deletions(-) diff --git a/coderd/prometheusmetrics/prometheusmetrics.go b/coderd/prometheusmetrics/prometheusmetrics.go index 1f6ff0ca23e99..f6770bfbc7e57 100644 --- a/coderd/prometheusmetrics/prometheusmetrics.go +++ b/coderd/prometheusmetrics/prometheusmetrics.go @@ -330,7 +330,73 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R return nil, err } - createdAfter := database.Now().Add(-duration) + agentStatsConnectionCountGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: "coderd", + Subsystem: "agentstats", + Name: "connection_count", + Help: "The number of established connections by agent", + }, []string{"agent_name", "username", "workspace_name"})) + err = registerer.Register(agentStatsConnectionCountGauge) + if err != nil { + return nil, err + } + + agentStatsConnectionMedianLatencyGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: "coderd", + Subsystem: "agentstats", + Name: "connection_median_latency", + Help: "The median agent connection latency", + }, []string{"agent_name", "username", "workspace_name"})) + err = registerer.Register(agentStatsConnectionMedianLatencyGauge) + if err != nil { + return nil, err + } + + agentStatsSessionCountJetBrainsGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: "coderd", + Subsystem: "agentstats", + Name: "session_count_jetbrains", + Help: "The number of session established by JetBrains", + }, []string{"agent_name", "username", "workspace_name"})) + err = registerer.Register(agentStatsSessionCountJetBrainsGauge) + if err != nil { + return nil, err + } + + agentStatsSessionCountReconnectingPTYGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: "coderd", + Subsystem: "agentstats", + Name: "session_count_reconnecting_pty", + Help: "The number of session established by reconnecting PTY", + }, []string{"agent_name", "username", "workspace_name"})) + err = registerer.Register(agentStatsSessionCountReconnectingPTYGauge) + if err != nil { + return nil, err + } + + agentStatsSessionCountSSHGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: "coderd", + Subsystem: "agentstats", + Name: "session_count_ssh", + Help: "The number of session established by SSH", + }, []string{"agent_name", "username", "workspace_name"})) + err = registerer.Register(agentStatsSessionCountSSHGauge) + if err != nil { + return nil, err + } + + agentStatsSessionCountVSCodeGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: "coderd", + Subsystem: "agentstats", + Name: "session_count_vscode", + Help: "The number of session established by VSCode", + }, []string{"agent_name", "username", "workspace_name"})) + err = registerer.Register(agentStatsSessionCountVSCodeGauge) + if err != nil { + return nil, err + } + + createdAfter := time.Now() ctx, cancelFunc := context.WithCancel(ctx) ticker := time.NewTicker(duration) go func() { @@ -354,14 +420,32 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R for _, agentStat := range stats { agentStatsRxBytesGauge.WithLabelValues(VectorOperationAdd, float64(agentStat.WorkspaceTxBytes), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName) agentStatsTxBytesGauge.WithLabelValues(VectorOperationAdd, float64(agentStat.WorkspaceRxBytes), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName) + + agentStatsConnectionCountGauge.WithLabelValues(VectorOperationSet, float64(agentStat.ConnectionCount), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName) + agentStatsConnectionMedianLatencyGauge.WithLabelValues(VectorOperationSet, agentStat.ConnectionMedianLatencyMS/1000.0 /* (to seconds) */, agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName) + + agentStatsSessionCountJetBrainsGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountJetBrains), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName) + agentStatsSessionCountReconnectingPTYGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountReconnectingPTY), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName) + agentStatsSessionCountSSHGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountSSH), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName) + agentStatsSessionCountVSCodeGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountVSCode), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName) } agentStatsRxBytesGauge.Commit() agentStatsTxBytesGauge.Commit() + agentStatsConnectionCountGauge.Commit() + agentStatsConnectionMedianLatencyGauge.Commit() + + agentStatsSessionCountJetBrainsGauge.Commit() + agentStatsSessionCountReconnectingPTYGauge.Commit() + agentStatsSessionCountSSHGauge.Commit() + agentStatsSessionCountVSCodeGauge.Commit() + done: logger.Debug(ctx, "Agent metrics collection is done") metricsCollectorAgentStats.Observe(timer.ObserveDuration().Seconds()) + + createdAfter = time.Now() } }() return cancelFunc, nil diff --git a/docs/admin/prometheus.md b/docs/admin/prometheus.md index 2898f8f4a469c..37067be459599 100644 --- a/docs/admin/prometheus.md +++ b/docs/admin/prometheus.md @@ -32,9 +32,17 @@ The environment variable `CODER_PROMETHEUS_ENABLE` will be enabled automatically | Name | Type | Description | Labels | | --------------------------------------------------- | --------- | ------------------------------------------------------------------ | ----------------------------------------------------------------------------------- | | `coderd_agents_apps` | gauge | Agent applications with statuses. | `agent_name` `app_name` `health` `username` `workspace_name` | -| `coderd_agents_connection_latencies_seconds` | gauge | Agent connection latencies in seconds. | `agent_id` `derp_region` `preferred` `username` `workspace_name` | +| `coderd_agents_connection_latencies_seconds` | gauge | Agent connection latencies in seconds. | `agent_name` `derp_region` `preferred` `username` `workspace_name` | | `coderd_agents_connections` | gauge | Agent connections with statuses. | `agent_name` `lifecycle_state` `status` `tailnet_node` `username` `workspace_name` | | `coderd_agents_up` | gauge | The number of active agents per workspace. | `username` `workspace_name` | +| `coderd_agentstats_connection_count` | gauge | The number of established connections by agent | `agent_name` `username` `workspace_name` | +| `coderd_agentstats_connection_median_latency` | gauge | The median agent connection latency | `agent_name` `username` `workspace_name` | +| `coderd_agentstats_rx_bytes` | gauge | Agent Rx bytes | `agent_name` `username` `workspace_name` | +| `coderd_agentstats_session_count_jetbrains` | gauge | The number of session established by JetBrains | `agent_name` `username` `workspace_name` | +| `coderd_agentstats_session_count_reconnecting_pty` | gauge | The number of session established by reconnecting PTY | `agent_name` `username` `workspace_name` | +| `coderd_agentstats_session_count_ssh` | gauge | The number of session established by SSH | `agent_name` `username` `workspace_name` | +| `coderd_agentstats_session_count_vscode` | gauge | The number of session established by VSCode | `agent_name` `username` `workspace_name` | +| `coderd_agentstats_tx_bytes` | gauge | Agent Tx bytes | `agent_name` `username` `workspace_name` | | `coderd_api_active_users_duration_hour` | gauge | The number of users that have been active within the last hour. | | | `coderd_api_concurrent_requests` | gauge | The number of concurrent API requests. | | | `coderd_api_concurrent_websockets` | gauge | The total number of concurrent API websockets. | | diff --git a/scripts/metricsdocgen/metrics b/scripts/metricsdocgen/metrics index 7e598b17abe56..7942886d0ce15 100644 --- a/scripts/metricsdocgen/metrics +++ b/scripts/metricsdocgen/metrics @@ -5,9 +5,9 @@ coderd_agents_apps{agent_name="main",app_name="code-server",health="healthy",use coderd_agents_apps{agent_name="main",app_name="code-server",health="healthy",username="admin",workspace_name="workspace-3"} 1 # HELP coderd_agents_connection_latencies_seconds Agent connection latencies in seconds. # TYPE coderd_agents_connection_latencies_seconds gauge -coderd_agents_connection_latencies_seconds{agent_id="main",derp_region="Coder Embedded Relay",preferred="true",username="admin",workspace_name="workspace-1"} 0.03018125 -coderd_agents_connection_latencies_seconds{agent_id="main",derp_region="Coder Embedded Relay",preferred="true",username="admin",workspace_name="workspace-2"} 0.028658416 -coderd_agents_connection_latencies_seconds{agent_id="main",derp_region="Coder Embedded Relay",preferred="true",username="admin",workspace_name="workspace-3"} 0.028041416 +coderd_agents_connection_latencies_seconds{agent_name="main",derp_region="Coder Embedded Relay",preferred="true",username="admin",workspace_name="workspace-1"} 0.03018125 +coderd_agents_connection_latencies_seconds{agent_name="main",derp_region="Coder Embedded Relay",preferred="true",username="admin",workspace_name="workspace-2"} 0.028658416 +coderd_agents_connection_latencies_seconds{agent_name="main",derp_region="Coder Embedded Relay",preferred="true",username="admin",workspace_name="workspace-3"} 0.028041416 # HELP coderd_agents_connections Agent connections with statuses. # TYPE coderd_agents_connections gauge coderd_agents_connections{agent_name="main",lifecycle_state="ready",status="connected",tailnet_node="nodeid:16966f7df70d8cc5",username="admin",workspace_name="workspace-3"} 1 @@ -18,6 +18,30 @@ coderd_agents_connections{agent_name="main",lifecycle_state="start_timeout",stat coderd_agents_up{username="admin",workspace_name="workspace-1"} 1 coderd_agents_up{username="admin",workspace_name="workspace-2"} 1 coderd_agents_up{username="admin",workspace_name="workspace-3"} 1 +# HELP coderd_agentstats_connection_count The number of established connections by agent +# TYPE coderd_agentstats_connection_count gauge +coderd_agentstats_connection_count{agent_name="main",username="admin",workspace_name="workspace1"} 2 +# HELP coderd_agentstats_connection_median_latency The median agent connection latency +# TYPE coderd_agentstats_connection_median_latency gauge +coderd_agentstats_connection_median_latency{agent_name="main",username="admin",workspace_name="workspace1"} 0.001784 +# HELP coderd_agentstats_rx_bytes Agent Rx bytes +# TYPE coderd_agentstats_rx_bytes gauge +coderd_agentstats_rx_bytes{agent_name="main",username="admin",workspace_name="workspace1"} 7731 +# HELP coderd_agentstats_session_count_jetbrains The number of session established by JetBrains +# TYPE coderd_agentstats_session_count_jetbrains gauge +coderd_agentstats_session_count_jetbrains{agent_name="main",username="admin",workspace_name="workspace1"} 0 +# HELP coderd_agentstats_session_count_reconnecting_pty The number of session established by reconnecting PTY +# TYPE coderd_agentstats_session_count_reconnecting_pty gauge +coderd_agentstats_session_count_reconnecting_pty{agent_name="main",username="admin",workspace_name="workspace1"} 1 +# HELP coderd_agentstats_session_count_ssh The number of session established by SSH +# TYPE coderd_agentstats_session_count_ssh gauge +coderd_agentstats_session_count_ssh{agent_name="main",username="admin",workspace_name="workspace1"} 0 +# HELP coderd_agentstats_session_count_vscode The number of session established by VSCode +# TYPE coderd_agentstats_session_count_vscode gauge +coderd_agentstats_session_count_vscode{agent_name="main",username="admin",workspace_name="workspace1"} 0 +# HELP coderd_agentstats_tx_bytes Agent Tx bytes +# TYPE coderd_agentstats_tx_bytes gauge +coderd_agentstats_tx_bytes{agent_name="main",username="admin",workspace_name="workspace1"} 6643 # HELP coderd_api_websocket_durations_seconds Websocket duration distribution of requests in seconds. # TYPE coderd_api_websocket_durations_seconds histogram coderd_api_websocket_durations_seconds_bucket{path="/api/v2/workspaceagents/me/coordinate",le="0.001"} 0 From 1ed37b45ba89e43d5c81683c5243f432aec19031 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Thu, 13 Apr 2023 16:35:32 +0200 Subject: [PATCH 21/40] feat: collect --- cli/server.go | 2 +- coderd/database/dbfake/databasefake.go | 71 +++++++++- coderd/database/queries.sql.go | 2 +- .../database/queries/workspaceagentstats.sql | 2 +- coderd/prometheusmetrics/prometheusmetrics.go | 16 ++- .../prometheusmetrics_test.go | 84 ++++++++++++ docs/admin/prometheus.md | 122 +++++++++--------- scripts/metricsdocgen/metrics | 6 +- 8 files changed, 231 insertions(+), 74 deletions(-) diff --git a/cli/server.go b/cli/server.go index f4ef841109eae..a37fd14428c77 100644 --- a/cli/server.go +++ b/cli/server.go @@ -844,7 +844,7 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd. } defer closeWorkspacesFunc() - closeAgentStatsFunc, err := prometheusmetrics.AgentStats(ctx, logger, options.PrometheusRegistry, options.Database, 0) + closeAgentStatsFunc, err := prometheusmetrics.AgentStats(ctx, logger, options.PrometheusRegistry, options.Database, time.Now(), 0) if err != nil { return xerrors.Errorf("register agent stats prometheus metric: %w", err) } diff --git a/coderd/database/dbfake/databasefake.go b/coderd/database/dbfake/databasefake.go index 9cb4893c3e440..adebfefbdb51f 100644 --- a/coderd/database/dbfake/databasefake.go +++ b/coderd/database/dbfake/databasefake.go @@ -3998,8 +3998,75 @@ func (q *fakeQuerier) GetWorkspaceAgentStats(_ context.Context, createdAfter tim return stats, nil } -func (q *fakeQuerier) GetWorkspaceAgentStatsAndLabels(_ context.Context, createdAfter time.Time) ([]database.GetWorkspaceAgentStatsAndLabelsRow, error) { - panic("not implemented yet") +func (q *fakeQuerier) GetWorkspaceAgentStatsAndLabels(ctx context.Context, createdAfter time.Time) ([]database.GetWorkspaceAgentStatsAndLabelsRow, error) { + q.mutex.RLock() + defer q.mutex.RUnlock() + + agentStatsCreatedAfter := make([]database.WorkspaceAgentStat, 0) + latestAgentStats := map[uuid.UUID]database.WorkspaceAgentStat{} + + for _, agentStat := range q.workspaceAgentStats { + if agentStat.CreatedAt.After(createdAfter) { + agentStatsCreatedAfter = append(agentStatsCreatedAfter, agentStat) + latestAgentStats[agentStat.AgentID] = agentStat + } + } + + statByAgent := map[uuid.UUID]database.GetWorkspaceAgentStatsAndLabelsRow{} + + // Session and connection metrics + for _, agentStat := range latestAgentStats { + stat := statByAgent[agentStat.AgentID] + stat.SessionCountVSCode += agentStat.SessionCountVSCode + stat.SessionCountJetBrains += agentStat.SessionCountJetBrains + stat.SessionCountReconnectingPTY += agentStat.SessionCountReconnectingPTY + stat.SessionCountSSH += agentStat.SessionCountSSH + stat.ConnectionCount += agentStat.ConnectionCount + if agentStat.ConnectionMedianLatencyMS >= 0 && stat.ConnectionMedianLatencyMS < agentStat.ConnectionMedianLatencyMS { + stat.ConnectionMedianLatencyMS = agentStat.ConnectionMedianLatencyMS + } + statByAgent[agentStat.AgentID] = stat + } + + // Tx, Rx metrics + for _, agentStat := range agentStatsCreatedAfter { + stat := statByAgent[agentStat.AgentID] + stat.WorkspaceRxBytes += agentStat.RxBytes + stat.WorkspaceTxBytes += agentStat.TxBytes + statByAgent[agentStat.AgentID] = stat + } + + // Labels + for _, agentStat := range agentStatsCreatedAfter { + stat := statByAgent[agentStat.AgentID] + + user, err := q.getUserByIDNoLock(agentStat.UserID) + if err != nil { + return nil, err + } + + stat.Username = user.Username + + workspace, err := q.GetWorkspaceByID(ctx, agentStat.WorkspaceID) + if err != nil { + return nil, err + } + stat.WorkspaceName = workspace.Name + + agent, err := q.GetWorkspaceAgentByID(ctx, agentStat.AgentID) + if err != nil { + return nil, err + } + stat.AgentName = agent.Name + + statByAgent[agentStat.AgentID] = stat + } + + stats := make([]database.GetWorkspaceAgentStatsAndLabelsRow, 0, len(statByAgent)) + for _, agent := range statByAgent { + stats = append(stats, agent) + } + return stats, nil } func (q *fakeQuerier) GetWorkspacesEligibleForAutoStartStop(ctx context.Context, now time.Time) ([]database.Workspace, error) { diff --git a/coderd/database/queries.sql.go b/coderd/database/queries.sql.go index 2cfa2f47fdade..080feebfb349d 100644 --- a/coderd/database/queries.sql.go +++ b/coderd/database/queries.sql.go @@ -6393,7 +6393,7 @@ WITH agent_stats AS ( coalesce(SUM(session_count_jetbrains), 0)::bigint AS session_count_jetbrains, coalesce(SUM(session_count_reconnecting_pty), 0)::bigint AS session_count_reconnecting_pty, coalesce(SUM(connection_count), 0)::bigint AS connection_count, - coalesce(SUM(connection_median_latency_ms), 0)::float AS connection_median_latency_ms + coalesce(MAX(connection_median_latency_ms), 0)::float AS connection_median_latency_ms FROM ( SELECT id, created_at, user_id, agent_id, workspace_id, template_id, connections_by_proto, connection_count, rx_packets, rx_bytes, tx_packets, tx_bytes, connection_median_latency_ms, session_count_vscode, session_count_jetbrains, session_count_reconnecting_pty, session_count_ssh, ROW_NUMBER() OVER(PARTITION BY agent_id ORDER BY created_at DESC) AS rn FROM workspace_agent_stats diff --git a/coderd/database/queries/workspaceagentstats.sql b/coderd/database/queries/workspaceagentstats.sql index c21ac1f3dd8ce..85385802f987e 100644 --- a/coderd/database/queries/workspaceagentstats.sql +++ b/coderd/database/queries/workspaceagentstats.sql @@ -123,7 +123,7 @@ WITH agent_stats AS ( coalesce(SUM(session_count_jetbrains), 0)::bigint AS session_count_jetbrains, coalesce(SUM(session_count_reconnecting_pty), 0)::bigint AS session_count_reconnecting_pty, coalesce(SUM(connection_count), 0)::bigint AS connection_count, - coalesce(SUM(connection_median_latency_ms), 0)::float AS connection_median_latency_ms + coalesce(MAX(connection_median_latency_ms), 0)::float AS connection_median_latency_ms FROM ( SELECT *, ROW_NUMBER() OVER(PARTITION BY agent_id ORDER BY created_at DESC) AS rn FROM workspace_agent_stats diff --git a/coderd/prometheusmetrics/prometheusmetrics.go b/coderd/prometheusmetrics/prometheusmetrics.go index f6770bfbc7e57..428ea2a555f9c 100644 --- a/coderd/prometheusmetrics/prometheusmetrics.go +++ b/coderd/prometheusmetrics/prometheusmetrics.go @@ -291,7 +291,7 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis return cancelFunc, nil } -func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, duration time.Duration) (context.CancelFunc, error) { +func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, initialCreateAfter time.Time, duration time.Duration) (context.CancelFunc, error) { if duration == 0 { duration = 1 * time.Minute } @@ -344,8 +344,8 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R agentStatsConnectionMedianLatencyGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: "coderd", Subsystem: "agentstats", - Name: "connection_median_latency", - Help: "The median agent connection latency", + Name: "connection_median_latency_seconds", + Help: "The median agent connection latency in seconds", }, []string{"agent_name", "username", "workspace_name"})) err = registerer.Register(agentStatsConnectionMedianLatencyGauge) if err != nil { @@ -396,7 +396,7 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R return nil, err } - createdAfter := time.Now() + createdAfter := initialCreateAfter ctx, cancelFunc := context.WithCancel(ctx) ticker := time.NewTicker(duration) go func() { @@ -411,12 +411,18 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R logger.Debug(ctx, "Agent metrics collection is starting") timer := prometheus.NewTimer(metricsCollectorAgentStats) + checkpoint := time.Now() stats, err := db.GetWorkspaceAgentStatsAndLabels(ctx, createdAfter) + if err != nil { logger.Error(ctx, "can't get agent stats", slog.Error(err)) goto done } + if len(stats) == 0 { + goto done + } + for _, agentStat := range stats { agentStatsRxBytesGauge.WithLabelValues(VectorOperationAdd, float64(agentStat.WorkspaceTxBytes), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName) agentStatsTxBytesGauge.WithLabelValues(VectorOperationAdd, float64(agentStat.WorkspaceRxBytes), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName) @@ -445,7 +451,7 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R logger.Debug(ctx, "Agent metrics collection is done") metricsCollectorAgentStats.Observe(timer.ObserveDuration().Seconds()) - createdAfter = time.Now() + createdAfter = checkpoint } }() return cancelFunc, nil diff --git a/coderd/prometheusmetrics/prometheusmetrics_test.go b/coderd/prometheusmetrics/prometheusmetrics_test.go index e765c5f2a1128..014969a6e9428 100644 --- a/coderd/prometheusmetrics/prometheusmetrics_test.go +++ b/coderd/prometheusmetrics/prometheusmetrics_test.go @@ -20,6 +20,7 @@ import ( "github.com/coder/coder/coderd/database/dbgen" "github.com/coder/coder/coderd/prometheusmetrics" "github.com/coder/coder/codersdk" + "github.com/coder/coder/codersdk/agentsdk" "github.com/coder/coder/provisioner/echo" "github.com/coder/coder/provisionersdk/proto" "github.com/coder/coder/tailnet" @@ -352,3 +353,86 @@ func TestAgents(t *testing.T) { return agentsUp && agentsConnections && agentsApps && agentsExecutionInSeconds }, testutil.WaitShort, testutil.IntervalFast) } + +func TestAgentStats(t *testing.T) { + t.Parallel() + + // Build a sample workspace with test agent and fake agent client + client, _, api := coderdtest.NewWithAPI(t, &coderdtest.Options{IncludeProvisionerDaemon: true}) + db := api.Database + + user := coderdtest.CreateFirstUser(t, client) + authToken := uuid.NewString() + version := coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, &echo.Responses{ + Parse: echo.ParseComplete, + ProvisionPlan: echo.ProvisionComplete, + ProvisionApply: echo.ProvisionApplyWithAgent(authToken), + }) + template := coderdtest.CreateTemplate(t, client, user.OrganizationID, version.ID) + coderdtest.AwaitTemplateVersionJob(t, client, version.ID) + workspace := coderdtest.CreateWorkspace(t, client, user.OrganizationID, template.ID) + coderdtest.AwaitWorkspaceBuildJob(t, client, workspace.LatestBuild.ID) + + agentClient := agentsdk.New(client.URL) + agentClient.SetSessionToken(authToken) + + registry := prometheus.NewRegistry() + + // given + cancel, err := prometheusmetrics.AgentStats(context.Background(), slogtest.Make(t, nil), registry, db, time.Now(), time.Second) + require.NoError(t, err) + t.Cleanup(cancel) + + // when + _, err = agentClient.PostStats(context.Background(), &agentsdk.Stats{ + ConnectionsByProto: map[string]int64{"TCP": 1}, + ConnectionCount: 2, + RxPackets: 3, + RxBytes: 4, + TxPackets: 5, + TxBytes: 6, + SessionCountVSCode: 7, + SessionCountJetBrains: 8, + SessionCountReconnectingPTY: 9, + SessionCountSSH: 10, + ConnectionMedianLatencyMS: 10000, + }) + require.NoError(t, err) + + // then + require.NoError(t, err) + + collectedMetrics := map[string]struct{}{} + require.Eventually(t, func() bool { + metrics, err := registry.Gather() + assert.NoError(t, err) + + if len(metrics) < 1 { + return false + } + + for _, metric := range metrics { + switch metric.GetName() { + case "coderd_prometheusmetrics_agentstats_execution_seconds": + collectedMetrics[metric.GetName()] = struct{}{} + case "coderd_agentstats_connection_count", + "coderd_agentstats_connection_median_latency_seconds", + "coderd_agentstats_rx_bytes", + "coderd_agentstats_tx_bytes", + "coderd_agentstats_session_count_jetbrains", + "coderd_agentstats_session_count_reconnecting_pty", + "coderd_agentstats_session_count_ssh", + "coderd_agentstats_session_count_vscode": + collectedMetrics[metric.GetName()] = struct{}{} + assert.Equal(t, "example", metric.Metric[0].Label[0].GetValue()) // Agent name + assert.Equal(t, "testuser", metric.Metric[0].Label[1].GetValue()) // Username + assert.Equal(t, workspace.Name, metric.Metric[0].Label[2].GetValue()) // Workspace name + assert.NotZero(t, int(metric.Metric[0].Gauge.GetValue()), metric.GetName()) // Metric value + default: + require.FailNowf(t, "unexpected metric collected", "metric: %s", metric.GetName()) + } + } + + return len(collectedMetrics) == 9 + }, testutil.WaitShort, testutil.IntervalFast, "collected metrics: %v", collectedMetrics) +} diff --git a/docs/admin/prometheus.md b/docs/admin/prometheus.md index 37067be459599..5c9ed54efd247 100644 --- a/docs/admin/prometheus.md +++ b/docs/admin/prometheus.md @@ -29,66 +29,66 @@ The environment variable `CODER_PROMETHEUS_ENABLE` will be enabled automatically -| Name | Type | Description | Labels | -| --------------------------------------------------- | --------- | ------------------------------------------------------------------ | ----------------------------------------------------------------------------------- | -| `coderd_agents_apps` | gauge | Agent applications with statuses. | `agent_name` `app_name` `health` `username` `workspace_name` | -| `coderd_agents_connection_latencies_seconds` | gauge | Agent connection latencies in seconds. | `agent_name` `derp_region` `preferred` `username` `workspace_name` | -| `coderd_agents_connections` | gauge | Agent connections with statuses. | `agent_name` `lifecycle_state` `status` `tailnet_node` `username` `workspace_name` | -| `coderd_agents_up` | gauge | The number of active agents per workspace. | `username` `workspace_name` | -| `coderd_agentstats_connection_count` | gauge | The number of established connections by agent | `agent_name` `username` `workspace_name` | -| `coderd_agentstats_connection_median_latency` | gauge | The median agent connection latency | `agent_name` `username` `workspace_name` | -| `coderd_agentstats_rx_bytes` | gauge | Agent Rx bytes | `agent_name` `username` `workspace_name` | -| `coderd_agentstats_session_count_jetbrains` | gauge | The number of session established by JetBrains | `agent_name` `username` `workspace_name` | -| `coderd_agentstats_session_count_reconnecting_pty` | gauge | The number of session established by reconnecting PTY | `agent_name` `username` `workspace_name` | -| `coderd_agentstats_session_count_ssh` | gauge | The number of session established by SSH | `agent_name` `username` `workspace_name` | -| `coderd_agentstats_session_count_vscode` | gauge | The number of session established by VSCode | `agent_name` `username` `workspace_name` | -| `coderd_agentstats_tx_bytes` | gauge | Agent Tx bytes | `agent_name` `username` `workspace_name` | -| `coderd_api_active_users_duration_hour` | gauge | The number of users that have been active within the last hour. | | -| `coderd_api_concurrent_requests` | gauge | The number of concurrent API requests. | | -| `coderd_api_concurrent_websockets` | gauge | The total number of concurrent API websockets. | | -| `coderd_api_request_latencies_seconds` | histogram | Latency distribution of requests in seconds. | `method` `path` | -| `coderd_api_requests_processed_total` | counter | The total number of processed API requests | `code` `method` `path` | -| `coderd_api_websocket_durations_seconds` | histogram | Websocket duration distribution of requests in seconds. | `path` | -| `coderd_api_workspace_latest_build_total` | gauge | The latest workspace builds with a status. | `status` | -| `coderd_metrics_collector_agents_execution_seconds` | histogram | Histogram for duration of agents metrics collection in seconds. | | -| `coderd_provisionerd_job_timings_seconds` | histogram | The provisioner job time duration in seconds. | `provisioner` `status` | -| `coderd_provisionerd_jobs_current` | gauge | The number of currently running provisioner jobs. | `provisioner` | -| `coderd_workspace_builds_total` | counter | The number of workspaces started, updated, or deleted. | `action` `owner_email` `status` `template_name` `template_version` `workspace_name` | -| `go_gc_duration_seconds` | summary | A summary of the pause duration of garbage collection cycles. | | -| `go_goroutines` | gauge | Number of goroutines that currently exist. | | -| `go_info` | gauge | Information about the Go environment. | `version` | -| `go_memstats_alloc_bytes` | gauge | Number of bytes allocated and still in use. | | -| `go_memstats_alloc_bytes_total` | counter | Total number of bytes allocated, even if freed. | | -| `go_memstats_buck_hash_sys_bytes` | gauge | Number of bytes used by the profiling bucket hash table. | | -| `go_memstats_frees_total` | counter | Total number of frees. | | -| `go_memstats_gc_sys_bytes` | gauge | Number of bytes used for garbage collection system metadata. | | -| `go_memstats_heap_alloc_bytes` | gauge | Number of heap bytes allocated and still in use. | | -| `go_memstats_heap_idle_bytes` | gauge | Number of heap bytes waiting to be used. | | -| `go_memstats_heap_inuse_bytes` | gauge | Number of heap bytes that are in use. | | -| `go_memstats_heap_objects` | gauge | Number of allocated objects. | | -| `go_memstats_heap_released_bytes` | gauge | Number of heap bytes released to OS. | | -| `go_memstats_heap_sys_bytes` | gauge | Number of heap bytes obtained from system. | | -| `go_memstats_last_gc_time_seconds` | gauge | Number of seconds since 1970 of last garbage collection. | | -| `go_memstats_lookups_total` | counter | Total number of pointer lookups. | | -| `go_memstats_mallocs_total` | counter | Total number of mallocs. | | -| `go_memstats_mcache_inuse_bytes` | gauge | Number of bytes in use by mcache structures. | | -| `go_memstats_mcache_sys_bytes` | gauge | Number of bytes used for mcache structures obtained from system. | | -| `go_memstats_mspan_inuse_bytes` | gauge | Number of bytes in use by mspan structures. | | -| `go_memstats_mspan_sys_bytes` | gauge | Number of bytes used for mspan structures obtained from system. | | -| `go_memstats_next_gc_bytes` | gauge | Number of heap bytes when next garbage collection will take place. | | -| `go_memstats_other_sys_bytes` | gauge | Number of bytes used for other system allocations. | | -| `go_memstats_stack_inuse_bytes` | gauge | Number of bytes in use by the stack allocator. | | -| `go_memstats_stack_sys_bytes` | gauge | Number of bytes obtained from system for stack allocator. | | -| `go_memstats_sys_bytes` | gauge | Number of bytes obtained from system. | | -| `go_threads` | gauge | Number of OS threads created. | | -| `process_cpu_seconds_total` | counter | Total user and system CPU time spent in seconds. | | -| `process_max_fds` | gauge | Maximum number of open file descriptors. | | -| `process_open_fds` | gauge | Number of open file descriptors. | | -| `process_resident_memory_bytes` | gauge | Resident memory size in bytes. | | -| `process_start_time_seconds` | gauge | Start time of the process since unix epoch in seconds. | | -| `process_virtual_memory_bytes` | gauge | Virtual memory size in bytes. | | -| `process_virtual_memory_max_bytes` | gauge | Maximum amount of virtual memory available in bytes. | | -| `promhttp_metric_handler_requests_in_flight` | gauge | Current number of scrapes being served. | | -| `promhttp_metric_handler_requests_total` | counter | Total number of scrapes by HTTP status code. | `code` | +| Name | Type | Description | Labels | +| ----------------------------------------------------- | --------- | ------------------------------------------------------------------ | ----------------------------------------------------------------------------------- | +| `coderd_agents_apps` | gauge | Agent applications with statuses. | `agent_name` `app_name` `health` `username` `workspace_name` | +| `coderd_agents_connection_latencies_seconds` | gauge | Agent connection latencies in seconds. | `agent_name` `derp_region` `preferred` `username` `workspace_name` | +| `coderd_agents_connections` | gauge | Agent connections with statuses. | `agent_name` `lifecycle_state` `status` `tailnet_node` `username` `workspace_name` | +| `coderd_agents_up` | gauge | The number of active agents per workspace. | `username` `workspace_name` | +| `coderd_agentstats_connection_count` | gauge | The number of established connections by agent | `agent_name` `username` `workspace_name` | +| `coderd_agentstats_connection_median_latency_seconds` | gauge | The median agent connection latency | `agent_name` `username` `workspace_name` | +| `coderd_agentstats_rx_bytes` | gauge | Agent Rx bytes | `agent_name` `username` `workspace_name` | +| `coderd_agentstats_session_count_jetbrains` | gauge | The number of session established by JetBrains | `agent_name` `username` `workspace_name` | +| `coderd_agentstats_session_count_reconnecting_pty` | gauge | The number of session established by reconnecting PTY | `agent_name` `username` `workspace_name` | +| `coderd_agentstats_session_count_ssh` | gauge | The number of session established by SSH | `agent_name` `username` `workspace_name` | +| `coderd_agentstats_session_count_vscode` | gauge | The number of session established by VSCode | `agent_name` `username` `workspace_name` | +| `coderd_agentstats_tx_bytes` | gauge | Agent Tx bytes | `agent_name` `username` `workspace_name` | +| `coderd_api_active_users_duration_hour` | gauge | The number of users that have been active within the last hour. | | +| `coderd_api_concurrent_requests` | gauge | The number of concurrent API requests. | | +| `coderd_api_concurrent_websockets` | gauge | The total number of concurrent API websockets. | | +| `coderd_api_request_latencies_seconds` | histogram | Latency distribution of requests in seconds. | `method` `path` | +| `coderd_api_requests_processed_total` | counter | The total number of processed API requests | `code` `method` `path` | +| `coderd_api_websocket_durations_seconds` | histogram | Websocket duration distribution of requests in seconds. | `path` | +| `coderd_api_workspace_latest_build_total` | gauge | The latest workspace builds with a status. | `status` | +| `coderd_metrics_collector_agents_execution_seconds` | histogram | Histogram for duration of agents metrics collection in seconds. | | +| `coderd_provisionerd_job_timings_seconds` | histogram | The provisioner job time duration in seconds. | `provisioner` `status` | +| `coderd_provisionerd_jobs_current` | gauge | The number of currently running provisioner jobs. | `provisioner` | +| `coderd_workspace_builds_total` | counter | The number of workspaces started, updated, or deleted. | `action` `owner_email` `status` `template_name` `template_version` `workspace_name` | +| `go_gc_duration_seconds` | summary | A summary of the pause duration of garbage collection cycles. | | +| `go_goroutines` | gauge | Number of goroutines that currently exist. | | +| `go_info` | gauge | Information about the Go environment. | `version` | +| `go_memstats_alloc_bytes` | gauge | Number of bytes allocated and still in use. | | +| `go_memstats_alloc_bytes_total` | counter | Total number of bytes allocated, even if freed. | | +| `go_memstats_buck_hash_sys_bytes` | gauge | Number of bytes used by the profiling bucket hash table. | | +| `go_memstats_frees_total` | counter | Total number of frees. | | +| `go_memstats_gc_sys_bytes` | gauge | Number of bytes used for garbage collection system metadata. | | +| `go_memstats_heap_alloc_bytes` | gauge | Number of heap bytes allocated and still in use. | | +| `go_memstats_heap_idle_bytes` | gauge | Number of heap bytes waiting to be used. | | +| `go_memstats_heap_inuse_bytes` | gauge | Number of heap bytes that are in use. | | +| `go_memstats_heap_objects` | gauge | Number of allocated objects. | | +| `go_memstats_heap_released_bytes` | gauge | Number of heap bytes released to OS. | | +| `go_memstats_heap_sys_bytes` | gauge | Number of heap bytes obtained from system. | | +| `go_memstats_last_gc_time_seconds` | gauge | Number of seconds since 1970 of last garbage collection. | | +| `go_memstats_lookups_total` | counter | Total number of pointer lookups. | | +| `go_memstats_mallocs_total` | counter | Total number of mallocs. | | +| `go_memstats_mcache_inuse_bytes` | gauge | Number of bytes in use by mcache structures. | | +| `go_memstats_mcache_sys_bytes` | gauge | Number of bytes used for mcache structures obtained from system. | | +| `go_memstats_mspan_inuse_bytes` | gauge | Number of bytes in use by mspan structures. | | +| `go_memstats_mspan_sys_bytes` | gauge | Number of bytes used for mspan structures obtained from system. | | +| `go_memstats_next_gc_bytes` | gauge | Number of heap bytes when next garbage collection will take place. | | +| `go_memstats_other_sys_bytes` | gauge | Number of bytes used for other system allocations. | | +| `go_memstats_stack_inuse_bytes` | gauge | Number of bytes in use by the stack allocator. | | +| `go_memstats_stack_sys_bytes` | gauge | Number of bytes obtained from system for stack allocator. | | +| `go_memstats_sys_bytes` | gauge | Number of bytes obtained from system. | | +| `go_threads` | gauge | Number of OS threads created. | | +| `process_cpu_seconds_total` | counter | Total user and system CPU time spent in seconds. | | +| `process_max_fds` | gauge | Maximum number of open file descriptors. | | +| `process_open_fds` | gauge | Number of open file descriptors. | | +| `process_resident_memory_bytes` | gauge | Resident memory size in bytes. | | +| `process_start_time_seconds` | gauge | Start time of the process since unix epoch in seconds. | | +| `process_virtual_memory_bytes` | gauge | Virtual memory size in bytes. | | +| `process_virtual_memory_max_bytes` | gauge | Maximum amount of virtual memory available in bytes. | | +| `promhttp_metric_handler_requests_in_flight` | gauge | Current number of scrapes being served. | | +| `promhttp_metric_handler_requests_total` | counter | Total number of scrapes by HTTP status code. | `code` | diff --git a/scripts/metricsdocgen/metrics b/scripts/metricsdocgen/metrics index 7942886d0ce15..117f55c5fc307 100644 --- a/scripts/metricsdocgen/metrics +++ b/scripts/metricsdocgen/metrics @@ -21,9 +21,9 @@ coderd_agents_up{username="admin",workspace_name="workspace-3"} 1 # HELP coderd_agentstats_connection_count The number of established connections by agent # TYPE coderd_agentstats_connection_count gauge coderd_agentstats_connection_count{agent_name="main",username="admin",workspace_name="workspace1"} 2 -# HELP coderd_agentstats_connection_median_latency The median agent connection latency -# TYPE coderd_agentstats_connection_median_latency gauge -coderd_agentstats_connection_median_latency{agent_name="main",username="admin",workspace_name="workspace1"} 0.001784 +# HELP coderd_agentstats_connection_median_latency_seconds The median agent connection latency +# TYPE coderd_agentstats_connection_median_latency_seconds gauge +coderd_agentstats_connection_median_latency_seconds{agent_name="main",username="admin",workspace_name="workspace1"} 0.001784 # HELP coderd_agentstats_rx_bytes Agent Rx bytes # TYPE coderd_agentstats_rx_bytes gauge coderd_agentstats_rx_bytes{agent_name="main",username="admin",workspace_name="workspace1"} 7731 From 7ee1bfc62960470a33380da324d70d94efa33643 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Thu, 13 Apr 2023 16:37:41 +0200 Subject: [PATCH 22/40] fix --- coderd/prometheusmetrics/prometheusmetrics_test.go | 1 - 1 file changed, 1 deletion(-) diff --git a/coderd/prometheusmetrics/prometheusmetrics_test.go b/coderd/prometheusmetrics/prometheusmetrics_test.go index 014969a6e9428..5f39572d5040d 100644 --- a/coderd/prometheusmetrics/prometheusmetrics_test.go +++ b/coderd/prometheusmetrics/prometheusmetrics_test.go @@ -397,7 +397,6 @@ func TestAgentStats(t *testing.T) { SessionCountSSH: 10, ConnectionMedianLatencyMS: 10000, }) - require.NoError(t, err) // then require.NoError(t, err) From 2b8a9e45b12b540ae87a99cca9b2b0fe46e17400 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Thu, 13 Apr 2023 16:40:24 +0200 Subject: [PATCH 23/40] fmt --- coderd/prometheusmetrics/prometheusmetrics.go | 1 - 1 file changed, 1 deletion(-) diff --git a/coderd/prometheusmetrics/prometheusmetrics.go b/coderd/prometheusmetrics/prometheusmetrics.go index 428ea2a555f9c..3c49b102095ce 100644 --- a/coderd/prometheusmetrics/prometheusmetrics.go +++ b/coderd/prometheusmetrics/prometheusmetrics.go @@ -413,7 +413,6 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R checkpoint := time.Now() stats, err := db.GetWorkspaceAgentStatsAndLabels(ctx, createdAfter) - if err != nil { logger.Error(ctx, "can't get agent stats", slog.Error(err)) goto done From 322f7e8a5bf8af8f3cb1ddf47409c20a5626737b Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Fri, 14 Apr 2023 09:57:34 +0200 Subject: [PATCH 24/40] minor fixes --- coderd/database/queries.sql.go | 27 +++++++++++++++---- .../database/queries/workspaceagentstats.sql | 27 +++++++++++++++---- coderd/prometheusmetrics/prometheusmetrics.go | 3 +++ 3 files changed, 47 insertions(+), 10 deletions(-) diff --git a/coderd/database/queries.sql.go b/coderd/database/queries.sql.go index 080feebfb349d..ffdc5bcfb156b 100644 --- a/coderd/database/queries.sql.go +++ b/coderd/database/queries.sql.go @@ -6403,11 +6403,28 @@ WITH agent_stats AS ( WHERE a.rn = 1 GROUP BY a.user_id, a.agent_id, a.workspace_id ) -SELECT users.username, workspace_agents.name AS agent_name, workspaces.name AS workspace_name, workspace_rx_bytes, workspace_tx_bytes, session_count_vscode, session_count_ssh, session_count_jetbrains, session_count_reconnecting_pty, connection_count, connection_median_latency_ms -FROM agent_stats JOIN latest_agent_stats ON agent_stats.agent_id = latest_agent_stats.agent_id -JOIN users ON users.id = agent_stats.user_id -JOIN workspace_agents ON workspace_agents.id = agent_stats.agent_id -JOIN workspaces ON workspaces.id = agent_stats.workspace_id +SELECT + users.username, workspace_agents.name AS agent_name, workspaces.name AS workspace_name, workspace_rx_bytes, workspace_tx_bytes, + session_count_vscode, session_count_ssh, session_count_jetbrains, session_count_reconnecting_pty, + connection_count, connection_median_latency_ms +FROM + agent_stats +JOIN + latest_agent_stats +ON + agent_stats.agent_id = latest_agent_stats.agent_id +JOIN + users +ON + users.id = agent_stats.user_id +JOIN + workspace_agents +ON + workspace_agents.id = agent_stats.agent_id +JOIN + workspaces +ON + workspaces.id = agent_stats.workspace_id ` type GetWorkspaceAgentStatsAndLabelsRow struct { diff --git a/coderd/database/queries/workspaceagentstats.sql b/coderd/database/queries/workspaceagentstats.sql index 85385802f987e..520e003b10fea 100644 --- a/coderd/database/queries/workspaceagentstats.sql +++ b/coderd/database/queries/workspaceagentstats.sql @@ -133,8 +133,25 @@ WITH agent_stats AS ( WHERE a.rn = 1 GROUP BY a.user_id, a.agent_id, a.workspace_id ) -SELECT users.username, workspace_agents.name AS agent_name, workspaces.name AS workspace_name, workspace_rx_bytes, workspace_tx_bytes, session_count_vscode, session_count_ssh, session_count_jetbrains, session_count_reconnecting_pty, connection_count, connection_median_latency_ms -FROM agent_stats JOIN latest_agent_stats ON agent_stats.agent_id = latest_agent_stats.agent_id -JOIN users ON users.id = agent_stats.user_id -JOIN workspace_agents ON workspace_agents.id = agent_stats.agent_id -JOIN workspaces ON workspaces.id = agent_stats.workspace_id; +SELECT + users.username, workspace_agents.name AS agent_name, workspaces.name AS workspace_name, workspace_rx_bytes, workspace_tx_bytes, + session_count_vscode, session_count_ssh, session_count_jetbrains, session_count_reconnecting_pty, + connection_count, connection_median_latency_ms +FROM + agent_stats +JOIN + latest_agent_stats +ON + agent_stats.agent_id = latest_agent_stats.agent_id +JOIN + users +ON + users.id = agent_stats.user_id +JOIN + workspace_agents +ON + workspace_agents.id = agent_stats.agent_id +JOIN + workspaces +ON + workspaces.id = agent_stats.workspace_id; diff --git a/coderd/prometheusmetrics/prometheusmetrics.go b/coderd/prometheusmetrics/prometheusmetrics.go index 3c49b102095ce..b7a425294bac4 100644 --- a/coderd/prometheusmetrics/prometheusmetrics.go +++ b/coderd/prometheusmetrics/prometheusmetrics.go @@ -286,6 +286,8 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis done: logger.Debug(ctx, "Agent metrics collection is done") metricsCollectorAgents.Observe(timer.ObserveDuration().Seconds()) + + ticker.Reset(duration) } }() return cancelFunc, nil @@ -451,6 +453,7 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R metricsCollectorAgentStats.Observe(timer.ObserveDuration().Seconds()) createdAfter = checkpoint + ticker.Reset(duration) } }() return cancelFunc, nil From c7af75a318f0b152d3f24a03900e560ac0fee1d6 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Fri, 14 Apr 2023 10:23:05 +0200 Subject: [PATCH 25/40] Prometheus flag --- cli/server.go | 10 ++++++---- cli/testdata/coder_server_--help.golden | 3 +++ cli/testdata/server-config.yaml.golden | 3 +++ coderd/apidoc/docs.go | 3 +++ coderd/apidoc/swagger.json | 3 +++ codersdk/deployment.go | 14 ++++++++++++-- docs/api/general.md | 1 + docs/api/schemas.md | 12 ++++++++---- docs/cli/server.md | 10 ++++++++++ site/src/api/typesGenerated.ts | 1 + 10 files changed, 50 insertions(+), 10 deletions(-) diff --git a/cli/server.go b/cli/server.go index a37fd14428c77..9ccbcf2878a9e 100644 --- a/cli/server.go +++ b/cli/server.go @@ -844,11 +844,13 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd. } defer closeWorkspacesFunc() - closeAgentStatsFunc, err := prometheusmetrics.AgentStats(ctx, logger, options.PrometheusRegistry, options.Database, time.Now(), 0) - if err != nil { - return xerrors.Errorf("register agent stats prometheus metric: %w", err) + if cfg.Prometheus.CollectAgentStats { + closeAgentStatsFunc, err := prometheusmetrics.AgentStats(ctx, logger, options.PrometheusRegistry, options.Database, time.Now(), 0) + if err != nil { + return xerrors.Errorf("register agent stats prometheus metric: %w", err) + } + defer closeAgentStatsFunc() } - defer closeAgentStatsFunc() //nolint:revive defer serveHandler(ctx, logger, promhttp.InstrumentMetricHandler( diff --git a/cli/testdata/coder_server_--help.golden b/cli/testdata/coder_server_--help.golden index be3274f8bf1a2..a32a1adc72771 100644 --- a/cli/testdata/coder_server_--help.golden +++ b/cli/testdata/coder_server_--help.golden @@ -90,6 +90,9 @@ Use a YAML configuration file when your server launch become unwieldy. --prometheus-address host:port, $CODER_PROMETHEUS_ADDRESS (default: 127.0.0.1:2112) The bind address to serve prometheus metrics. + --prometheus-collect-agent-stats bool, $CODER_PROMETHEUS_COLLECT_AGENT_STATS + Collect agent stats (may increase charges for metrics storage) . + --prometheus-enable bool, $CODER_PROMETHEUS_ENABLE Serve prometheus metrics on the address defined by prometheus address. diff --git a/cli/testdata/server-config.yaml.golden b/cli/testdata/server-config.yaml.golden index 5876107294df8..5797b0fa86f5f 100644 --- a/cli/testdata/server-config.yaml.golden +++ b/cli/testdata/server-config.yaml.golden @@ -146,6 +146,9 @@ introspection: # The bind address to serve prometheus metrics. # (default: 127.0.0.1:2112, type: host:port) address: 127.0.0.1:2112 + # Collect agent stats (may increase charges for metrics storage) . + # (default: , type: bool) + collect_agent_stats: false pprof: # Serve pprof metrics on the address defined by pprof address. # (default: , type: bool) diff --git a/coderd/apidoc/docs.go b/coderd/apidoc/docs.go index adbaf616695a2..17e71c421bad9 100644 --- a/coderd/apidoc/docs.go +++ b/coderd/apidoc/docs.go @@ -7822,6 +7822,9 @@ const docTemplate = `{ "address": { "$ref": "#/definitions/clibase.HostPort" }, + "collect_agent_stats": { + "type": "boolean" + }, "enable": { "type": "boolean" } diff --git a/coderd/apidoc/swagger.json b/coderd/apidoc/swagger.json index bc4998b3b39ef..4ff29ab4dba1b 100644 --- a/coderd/apidoc/swagger.json +++ b/coderd/apidoc/swagger.json @@ -7008,6 +7008,9 @@ "address": { "$ref": "#/definitions/clibase.HostPort" }, + "collect_agent_stats": { + "type": "boolean" + }, "enable": { "type": "boolean" } diff --git a/codersdk/deployment.go b/codersdk/deployment.go index 71b643e32290e..d56d0cc2e66b3 100644 --- a/codersdk/deployment.go +++ b/codersdk/deployment.go @@ -225,8 +225,9 @@ type DERPConfig struct { } type PrometheusConfig struct { - Enable clibase.Bool `json:"enable" typescript:",notnull"` - Address clibase.HostPort `json:"address" typescript:",notnull"` + Enable clibase.Bool `json:"enable" typescript:",notnull"` + Address clibase.HostPort `json:"address" typescript:",notnull"` + CollectAgentStats clibase.Bool `json:"collect_agent_stats" typescript:",notnull"` } type PprofConfig struct { @@ -722,6 +723,15 @@ when required by your organization's security policy.`, Group: &deploymentGroupIntrospectionPrometheus, YAML: "address", }, + { + Name: "Prometheus Collect Agent Stats", + Description: "Collect agent stats (may increase charges for metrics storage) .", + Flag: "prometheus-collect-agent-stats", + Env: "CODER_PROMETHEUS_COLLECT_AGENT_STATS", + Value: &c.Prometheus.CollectAgentStats, + Group: &deploymentGroupIntrospectionPrometheus, + YAML: "collect_agent_stats", + }, // Pprof settings { Name: "pprof Enable", diff --git a/docs/api/general.md b/docs/api/general.md index 74eb0238e2001..ce1a796a81513 100644 --- a/docs/api/general.md +++ b/docs/api/general.md @@ -271,6 +271,7 @@ curl -X GET http://coder-server:8080/api/v2/deployment/config \ "host": "string", "port": "string" }, + "collect_agent_stats": true, "enable": true }, "provisioner": { diff --git a/docs/api/schemas.md b/docs/api/schemas.md index abf704fe7ffae..4bae18bff8657 100644 --- a/docs/api/schemas.md +++ b/docs/api/schemas.md @@ -1901,6 +1901,7 @@ CreateParameterRequest is a structure used to create a new parameter value for a "host": "string", "port": "string" }, + "collect_agent_stats": true, "enable": true }, "provisioner": { @@ -2244,6 +2245,7 @@ CreateParameterRequest is a structure used to create a new parameter value for a "host": "string", "port": "string" }, + "collect_agent_stats": true, "enable": true }, "provisioner": { @@ -3155,16 +3157,18 @@ Parameter represents a set value for the scope. "host": "string", "port": "string" }, + "collect_agent_stats": true, "enable": true } ``` ### Properties -| Name | Type | Required | Restrictions | Description | -| --------- | ------------------------------------ | -------- | ------------ | ----------- | -| `address` | [clibase.HostPort](#clibasehostport) | false | | | -| `enable` | boolean | false | | | +| Name | Type | Required | Restrictions | Description | +| --------------------- | ------------------------------------ | -------- | ------------ | ----------- | +| `address` | [clibase.HostPort](#clibasehostport) | false | | | +| `collect_agent_stats` | boolean | false | | | +| `enable` | boolean | false | | | ## codersdk.ProvisionerConfig diff --git a/docs/cli/server.md b/docs/cli/server.md index 603b2788f7df9..5de02886e6bb0 100644 --- a/docs/cli/server.md +++ b/docs/cli/server.md @@ -555,6 +555,16 @@ URL of a PostgreSQL database. If empty, PostgreSQL binaries will be downloaded f The bind address to serve prometheus metrics. +### --prometheus-collect-agent-stats + +| | | +| ----------- | --------------------------------------------------------- | +| Type | bool | +| Environment | $CODER_PROMETHEUS_COLLECT_AGENT_STATS | +| YAML | introspection.prometheus.collect_agent_stats | + +Collect agent stats (may increase charges for metrics storage) . + ### --prometheus-enable | | | diff --git a/site/src/api/typesGenerated.ts b/site/src/api/typesGenerated.ts index 36d7b16bcdef7..e4a851b1249f6 100644 --- a/site/src/api/typesGenerated.ts +++ b/site/src/api/typesGenerated.ts @@ -627,6 +627,7 @@ export interface PrometheusConfig { // Named type "github.com/coder/coder/cli/clibase.HostPort" unknown, using "any" // eslint-disable-next-line @typescript-eslint/no-explicit-any -- External type readonly address: any + readonly collect_agent_stats: boolean } // From codersdk/deployment.go From 9693fa8cce6fd8298379f45a0cfc39dadec9f621 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Fri, 14 Apr 2023 10:54:56 +0200 Subject: [PATCH 26/40] fix --- cli/testdata/coder_server_--help.golden | 2 +- cli/testdata/server-config.yaml.golden | 2 +- codersdk/deployment.go | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cli/testdata/coder_server_--help.golden b/cli/testdata/coder_server_--help.golden index a32a1adc72771..446539df00d4c 100644 --- a/cli/testdata/coder_server_--help.golden +++ b/cli/testdata/coder_server_--help.golden @@ -91,7 +91,7 @@ Use a YAML configuration file when your server launch become unwieldy. The bind address to serve prometheus metrics. --prometheus-collect-agent-stats bool, $CODER_PROMETHEUS_COLLECT_AGENT_STATS - Collect agent stats (may increase charges for metrics storage) . + Collect agent stats (may increase charges for metrics storage). --prometheus-enable bool, $CODER_PROMETHEUS_ENABLE Serve prometheus metrics on the address defined by prometheus address. diff --git a/cli/testdata/server-config.yaml.golden b/cli/testdata/server-config.yaml.golden index 5797b0fa86f5f..99e22f3dcbd72 100644 --- a/cli/testdata/server-config.yaml.golden +++ b/cli/testdata/server-config.yaml.golden @@ -146,7 +146,7 @@ introspection: # The bind address to serve prometheus metrics. # (default: 127.0.0.1:2112, type: host:port) address: 127.0.0.1:2112 - # Collect agent stats (may increase charges for metrics storage) . + # Collect agent stats (may increase charges for metrics storage). # (default: , type: bool) collect_agent_stats: false pprof: diff --git a/codersdk/deployment.go b/codersdk/deployment.go index d56d0cc2e66b3..dee95504e97eb 100644 --- a/codersdk/deployment.go +++ b/codersdk/deployment.go @@ -725,7 +725,7 @@ when required by your organization's security policy.`, }, { Name: "Prometheus Collect Agent Stats", - Description: "Collect agent stats (may increase charges for metrics storage) .", + Description: "Collect agent stats (may increase charges for metrics storage).", Flag: "prometheus-collect-agent-stats", Env: "CODER_PROMETHEUS_COLLECT_AGENT_STATS", Value: &c.Prometheus.CollectAgentStats, From 28f7a13216d75afec3a5fc723ddb6fd62b009d40 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Fri, 14 Apr 2023 12:51:03 +0200 Subject: [PATCH 27/40] WIP --- coderd/prometheusmetrics/prometheusmetrics.go | 50 ++++----- .../prometheusmetrics_test.go | 104 ++++++++++++------ .../testdata/agent-stats.json | 26 +++++ 3 files changed, 118 insertions(+), 62 deletions(-) create mode 100644 coderd/prometheusmetrics/testdata/agent-stats.json diff --git a/coderd/prometheusmetrics/prometheusmetrics.go b/coderd/prometheusmetrics/prometheusmetrics.go index b7a425294bac4..7abd129619a53 100644 --- a/coderd/prometheusmetrics/prometheusmetrics.go +++ b/coderd/prometheusmetrics/prometheusmetrics.go @@ -417,38 +417,34 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R stats, err := db.GetWorkspaceAgentStatsAndLabels(ctx, createdAfter) if err != nil { logger.Error(ctx, "can't get agent stats", slog.Error(err)) - goto done - } - - if len(stats) == 0 { - goto done - } + } else { + for _, agentStat := range stats { + agentStatsRxBytesGauge.WithLabelValues(VectorOperationAdd, float64(agentStat.WorkspaceRxBytes), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName) + agentStatsTxBytesGauge.WithLabelValues(VectorOperationAdd, float64(agentStat.WorkspaceTxBytes), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName) + + agentStatsConnectionCountGauge.WithLabelValues(VectorOperationSet, float64(agentStat.ConnectionCount), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName) + agentStatsConnectionMedianLatencyGauge.WithLabelValues(VectorOperationSet, agentStat.ConnectionMedianLatencyMS/1000.0 /* (to seconds) */, agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName) + + agentStatsSessionCountJetBrainsGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountJetBrains), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName) + agentStatsSessionCountReconnectingPTYGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountReconnectingPTY), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName) + agentStatsSessionCountSSHGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountSSH), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName) + agentStatsSessionCountVSCodeGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountVSCode), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName) + } - for _, agentStat := range stats { - agentStatsRxBytesGauge.WithLabelValues(VectorOperationAdd, float64(agentStat.WorkspaceTxBytes), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName) - agentStatsTxBytesGauge.WithLabelValues(VectorOperationAdd, float64(agentStat.WorkspaceRxBytes), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName) + if len(stats) > 0 { + agentStatsRxBytesGauge.Commit() + agentStatsTxBytesGauge.Commit() - agentStatsConnectionCountGauge.WithLabelValues(VectorOperationSet, float64(agentStat.ConnectionCount), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName) - agentStatsConnectionMedianLatencyGauge.WithLabelValues(VectorOperationSet, agentStat.ConnectionMedianLatencyMS/1000.0 /* (to seconds) */, agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName) + agentStatsConnectionCountGauge.Commit() + agentStatsConnectionMedianLatencyGauge.Commit() - agentStatsSessionCountJetBrainsGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountJetBrains), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName) - agentStatsSessionCountReconnectingPTYGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountReconnectingPTY), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName) - agentStatsSessionCountSSHGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountSSH), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName) - agentStatsSessionCountVSCodeGauge.WithLabelValues(VectorOperationSet, float64(agentStat.SessionCountVSCode), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName) + agentStatsSessionCountJetBrainsGauge.Commit() + agentStatsSessionCountReconnectingPTYGauge.Commit() + agentStatsSessionCountSSHGauge.Commit() + agentStatsSessionCountVSCodeGauge.Commit() + } } - agentStatsRxBytesGauge.Commit() - agentStatsTxBytesGauge.Commit() - - agentStatsConnectionCountGauge.Commit() - agentStatsConnectionMedianLatencyGauge.Commit() - - agentStatsSessionCountJetBrainsGauge.Commit() - agentStatsSessionCountReconnectingPTYGauge.Commit() - agentStatsSessionCountSSHGauge.Commit() - agentStatsSessionCountVSCodeGauge.Commit() - - done: logger.Debug(ctx, "Agent metrics collection is done") metricsCollectorAgentStats.Observe(timer.ObserveDuration().Seconds()) diff --git a/coderd/prometheusmetrics/prometheusmetrics_test.go b/coderd/prometheusmetrics/prometheusmetrics_test.go index 5f39572d5040d..fd341089733dc 100644 --- a/coderd/prometheusmetrics/prometheusmetrics_test.go +++ b/coderd/prometheusmetrics/prometheusmetrics_test.go @@ -3,6 +3,9 @@ package prometheusmetrics_test import ( "context" "database/sql" + "encoding/json" + "fmt" + "os" "sync/atomic" "testing" "time" @@ -357,24 +360,15 @@ func TestAgents(t *testing.T) { func TestAgentStats(t *testing.T) { t.Parallel() - // Build a sample workspace with test agent and fake agent client + // Build sample workspaces with test agents and fake agent client client, _, api := coderdtest.NewWithAPI(t, &coderdtest.Options{IncludeProvisionerDaemon: true}) db := api.Database user := coderdtest.CreateFirstUser(t, client) - authToken := uuid.NewString() - version := coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, &echo.Responses{ - Parse: echo.ParseComplete, - ProvisionPlan: echo.ProvisionComplete, - ProvisionApply: echo.ProvisionApplyWithAgent(authToken), - }) - template := coderdtest.CreateTemplate(t, client, user.OrganizationID, version.ID) - coderdtest.AwaitTemplateVersionJob(t, client, version.ID) - workspace := coderdtest.CreateWorkspace(t, client, user.OrganizationID, template.ID) - coderdtest.AwaitWorkspaceBuildJob(t, client, workspace.LatestBuild.ID) - agentClient := agentsdk.New(client.URL) - agentClient.SetSessionToken(authToken) + agent1, _ := prepareWorkspaceAndAgent(t, client, user, 1) + agent2, _ := prepareWorkspaceAndAgent(t, client, user, 2) + agent3, _ := prepareWorkspaceAndAgent(t, client, user, 3) registry := prometheus.NewRegistry() @@ -384,24 +378,45 @@ func TestAgentStats(t *testing.T) { t.Cleanup(cancel) // when - _, err = agentClient.PostStats(context.Background(), &agentsdk.Stats{ - ConnectionsByProto: map[string]int64{"TCP": 1}, - ConnectionCount: 2, - RxPackets: 3, - RxBytes: 4, - TxPackets: 5, - TxBytes: 6, - SessionCountVSCode: 7, - SessionCountJetBrains: 8, - SessionCountReconnectingPTY: 9, - SessionCountSSH: 10, - ConnectionMedianLatencyMS: 10000, - }) + var i int64 + for i = 0; i < 3; i++ { + _, err = agent1.PostStats(context.Background(), &agentsdk.Stats{ + TxBytes: 1 + i, RxBytes: 2 + i, + SessionCountVSCode: 3 + i, SessionCountJetBrains: 4 + i, SessionCountReconnectingPTY: 5 + i, SessionCountSSH: 6 + i, + ConnectionCount: 7 + i, ConnectionMedianLatencyMS: 8000, + ConnectionsByProto: map[string]int64{"TCP": 1}, + }) + require.NoError(t, err) + + _, err = agent2.PostStats(context.Background(), &agentsdk.Stats{ + TxBytes: 2 + i, RxBytes: 4 + i, + SessionCountVSCode: 6 + i, SessionCountJetBrains: 8 + i, SessionCountReconnectingPTY: 10 + i, SessionCountSSH: 12 + i, + ConnectionCount: 8 + i, ConnectionMedianLatencyMS: 10000, + ConnectionsByProto: map[string]int64{"TCP": 1}, + }) + require.NoError(t, err) + + _, err = agent3.PostStats(context.Background(), &agentsdk.Stats{ + TxBytes: 3 + i, RxBytes: 6 + i, + SessionCountVSCode: 12 + i, SessionCountJetBrains: 14 + i, SessionCountReconnectingPTY: 16 + i, SessionCountSSH: 18 + i, + ConnectionCount: 9 + i, ConnectionMedianLatencyMS: 12000, + ConnectionsByProto: map[string]int64{"TCP": 1}, + }) + require.NoError(t, err) + } // then + goldenFile, err := os.ReadFile("testdata/agent-stats.json") require.NoError(t, err) + areMetricsValid := func(collected map[string]int) bool { + out, err := json.MarshalIndent(collected, " ", " ") + require.NoError(t, err) + os.WriteFile("testdata/agent-stats.json", out, 0644) + return string(goldenFile) == string(out) + } - collectedMetrics := map[string]struct{}{} + collected := map[string]int{} + var executionSeconds bool require.Eventually(t, func() bool { metrics, err := registry.Gather() assert.NoError(t, err) @@ -413,7 +428,7 @@ func TestAgentStats(t *testing.T) { for _, metric := range metrics { switch metric.GetName() { case "coderd_prometheusmetrics_agentstats_execution_seconds": - collectedMetrics[metric.GetName()] = struct{}{} + executionSeconds = true case "coderd_agentstats_connection_count", "coderd_agentstats_connection_median_latency_seconds", "coderd_agentstats_rx_bytes", @@ -422,16 +437,35 @@ func TestAgentStats(t *testing.T) { "coderd_agentstats_session_count_reconnecting_pty", "coderd_agentstats_session_count_ssh", "coderd_agentstats_session_count_vscode": - collectedMetrics[metric.GetName()] = struct{}{} - assert.Equal(t, "example", metric.Metric[0].Label[0].GetValue()) // Agent name - assert.Equal(t, "testuser", metric.Metric[0].Label[1].GetValue()) // Username - assert.Equal(t, workspace.Name, metric.Metric[0].Label[2].GetValue()) // Workspace name - assert.NotZero(t, int(metric.Metric[0].Gauge.GetValue()), metric.GetName()) // Metric value + for _, m := range metric.Metric { + // username:workspace:agent:metric = value + collected[m.Label[1].GetValue()+":"+m.Label[2].GetValue()+":"+m.Label[0].GetValue()+":"+metric.GetName()] = int(m.Gauge.GetValue()) + } default: require.FailNowf(t, "unexpected metric collected", "metric: %s", metric.GetName()) } } + return executionSeconds && areMetricsValid(collected) + }, testutil.WaitLong, testutil.IntervalMedium) +} + +func prepareWorkspaceAndAgent(t *testing.T, client *codersdk.Client, user codersdk.CreateFirstUserResponse, workspaceNum int) (*agentsdk.Client, codersdk.Workspace) { + authToken := uuid.NewString() + + version := coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, &echo.Responses{ + Parse: echo.ParseComplete, + ProvisionPlan: echo.ProvisionComplete, + ProvisionApply: echo.ProvisionApplyWithAgent(authToken), + }) + template := coderdtest.CreateTemplate(t, client, user.OrganizationID, version.ID) + coderdtest.AwaitTemplateVersionJob(t, client, version.ID) + workspace := coderdtest.CreateWorkspace(t, client, user.OrganizationID, template.ID, func(cwr *codersdk.CreateWorkspaceRequest) { + cwr.Name = fmt.Sprintf("workspace-%d", workspaceNum) + }) + coderdtest.AwaitWorkspaceBuildJob(t, client, workspace.LatestBuild.ID) + + agentClient := agentsdk.New(client.URL) + agentClient.SetSessionToken(authToken) - return len(collectedMetrics) == 9 - }, testutil.WaitShort, testutil.IntervalFast, "collected metrics: %v", collectedMetrics) + return agentClient, workspace } diff --git a/coderd/prometheusmetrics/testdata/agent-stats.json b/coderd/prometheusmetrics/testdata/agent-stats.json new file mode 100644 index 0000000000000..fc4467b5e3e7c --- /dev/null +++ b/coderd/prometheusmetrics/testdata/agent-stats.json @@ -0,0 +1,26 @@ +{ + "testuser:workspace-1:example:coderd_agentstats_connection_count": 9, + "testuser:workspace-1:example:coderd_agentstats_connection_median_latency_seconds": 8, + "testuser:workspace-1:example:coderd_agentstats_rx_bytes": 9, + "testuser:workspace-1:example:coderd_agentstats_session_count_jetbrains": 6, + "testuser:workspace-1:example:coderd_agentstats_session_count_reconnecting_pty": 7, + "testuser:workspace-1:example:coderd_agentstats_session_count_ssh": 8, + "testuser:workspace-1:example:coderd_agentstats_session_count_vscode": 5, + "testuser:workspace-1:example:coderd_agentstats_tx_bytes": 6, + "testuser:workspace-2:example:coderd_agentstats_connection_count": 10, + "testuser:workspace-2:example:coderd_agentstats_connection_median_latency_seconds": 10, + "testuser:workspace-2:example:coderd_agentstats_rx_bytes": 15, + "testuser:workspace-2:example:coderd_agentstats_session_count_jetbrains": 10, + "testuser:workspace-2:example:coderd_agentstats_session_count_reconnecting_pty": 12, + "testuser:workspace-2:example:coderd_agentstats_session_count_ssh": 14, + "testuser:workspace-2:example:coderd_agentstats_session_count_vscode": 8, + "testuser:workspace-2:example:coderd_agentstats_tx_bytes": 9, + "testuser:workspace-3:example:coderd_agentstats_connection_count": 11, + "testuser:workspace-3:example:coderd_agentstats_connection_median_latency_seconds": 12, + "testuser:workspace-3:example:coderd_agentstats_rx_bytes": 21, + "testuser:workspace-3:example:coderd_agentstats_session_count_jetbrains": 16, + "testuser:workspace-3:example:coderd_agentstats_session_count_reconnecting_pty": 18, + "testuser:workspace-3:example:coderd_agentstats_session_count_ssh": 20, + "testuser:workspace-3:example:coderd_agentstats_session_count_vscode": 14, + "testuser:workspace-3:example:coderd_agentstats_tx_bytes": 12 + } \ No newline at end of file From 787816780b4c1827540d6dbf1ac9c5431a2b12e9 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Fri, 14 Apr 2023 13:09:15 +0200 Subject: [PATCH 28/40] fix tests --- .../prometheusmetrics_test.go | 34 +++++++++++-------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/coderd/prometheusmetrics/prometheusmetrics_test.go b/coderd/prometheusmetrics/prometheusmetrics_test.go index fd341089733dc..e63a1fd9a1ee8 100644 --- a/coderd/prometheusmetrics/prometheusmetrics_test.go +++ b/coderd/prometheusmetrics/prometheusmetrics_test.go @@ -373,11 +373,7 @@ func TestAgentStats(t *testing.T) { registry := prometheus.NewRegistry() // given - cancel, err := prometheusmetrics.AgentStats(context.Background(), slogtest.Make(t, nil), registry, db, time.Now(), time.Second) - require.NoError(t, err) - t.Cleanup(cancel) - - // when + var err error var i int64 for i = 0; i < 3; i++ { _, err = agent1.PostStats(context.Background(), &agentsdk.Stats{ @@ -405,19 +401,22 @@ func TestAgentStats(t *testing.T) { require.NoError(t, err) } + // when + // + // Set initialCreateAfter to some time in the past, so that AgentStats would include all above PostStats, + // and it doesn't depend on the real time. + cancel, err := prometheusmetrics.AgentStats(context.Background(), slogtest.Make(t, nil), registry, db, time.Now().Add(-time.Minute), time.Millisecond) + require.NoError(t, err) + t.Cleanup(cancel) + // then goldenFile, err := os.ReadFile("testdata/agent-stats.json") require.NoError(t, err) - areMetricsValid := func(collected map[string]int) bool { - out, err := json.MarshalIndent(collected, " ", " ") - require.NoError(t, err) - os.WriteFile("testdata/agent-stats.json", out, 0644) - return string(goldenFile) == string(out) - } collected := map[string]int{} + var out []byte var executionSeconds bool - require.Eventually(t, func() bool { + assert.Eventually(t, func() bool { metrics, err := registry.Gather() assert.NoError(t, err) @@ -445,8 +444,15 @@ func TestAgentStats(t *testing.T) { require.FailNowf(t, "unexpected metric collected", "metric: %s", metric.GetName()) } } - return executionSeconds && areMetricsValid(collected) - }, testutil.WaitLong, testutil.IntervalMedium) + + out, err = json.MarshalIndent(collected, " ", " ") + require.NoError(t, err) + + return executionSeconds && string(goldenFile) == string(out) + }, testutil.WaitShort, testutil.IntervalFast) + + // Keep this assertion, so that "go test" can print differences instead of "Condition never satisfied" + assert.Equal(t, string(goldenFile), string(out)) } func prepareWorkspaceAndAgent(t *testing.T, client *codersdk.Client, user codersdk.CreateFirstUserResponse, workspaceNum int) (*agentsdk.Client, codersdk.Workspace) { From d9e4903d5745c3ee0eb1c90e7744810862936d76 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Fri, 14 Apr 2023 13:16:52 +0200 Subject: [PATCH 29/40] WIP --- coderd/prometheusmetrics/prometheusmetrics_test.go | 11 +++++++---- coderd/prometheusmetrics/testdata/agent-stats.json | 2 +- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/coderd/prometheusmetrics/prometheusmetrics_test.go b/coderd/prometheusmetrics/prometheusmetrics_test.go index e63a1fd9a1ee8..e6453c6bb990e 100644 --- a/coderd/prometheusmetrics/prometheusmetrics_test.go +++ b/coderd/prometheusmetrics/prometheusmetrics_test.go @@ -372,11 +372,14 @@ func TestAgentStats(t *testing.T) { registry := prometheus.NewRegistry() + ctx, cancelFunc := context.WithCancel(context.Background()) + defer cancelFunc() + // given var err error var i int64 for i = 0; i < 3; i++ { - _, err = agent1.PostStats(context.Background(), &agentsdk.Stats{ + _, err = agent1.PostStats(ctx, &agentsdk.Stats{ TxBytes: 1 + i, RxBytes: 2 + i, SessionCountVSCode: 3 + i, SessionCountJetBrains: 4 + i, SessionCountReconnectingPTY: 5 + i, SessionCountSSH: 6 + i, ConnectionCount: 7 + i, ConnectionMedianLatencyMS: 8000, @@ -384,7 +387,7 @@ func TestAgentStats(t *testing.T) { }) require.NoError(t, err) - _, err = agent2.PostStats(context.Background(), &agentsdk.Stats{ + _, err = agent2.PostStats(ctx, &agentsdk.Stats{ TxBytes: 2 + i, RxBytes: 4 + i, SessionCountVSCode: 6 + i, SessionCountJetBrains: 8 + i, SessionCountReconnectingPTY: 10 + i, SessionCountSSH: 12 + i, ConnectionCount: 8 + i, ConnectionMedianLatencyMS: 10000, @@ -392,7 +395,7 @@ func TestAgentStats(t *testing.T) { }) require.NoError(t, err) - _, err = agent3.PostStats(context.Background(), &agentsdk.Stats{ + _, err = agent3.PostStats(ctx, &agentsdk.Stats{ TxBytes: 3 + i, RxBytes: 6 + i, SessionCountVSCode: 12 + i, SessionCountJetBrains: 14 + i, SessionCountReconnectingPTY: 16 + i, SessionCountSSH: 18 + i, ConnectionCount: 9 + i, ConnectionMedianLatencyMS: 12000, @@ -405,7 +408,7 @@ func TestAgentStats(t *testing.T) { // // Set initialCreateAfter to some time in the past, so that AgentStats would include all above PostStats, // and it doesn't depend on the real time. - cancel, err := prometheusmetrics.AgentStats(context.Background(), slogtest.Make(t, nil), registry, db, time.Now().Add(-time.Minute), time.Millisecond) + cancel, err := prometheusmetrics.AgentStats(ctx, slogtest.Make(t, nil), registry, db, time.Now().Add(-time.Minute), time.Millisecond) require.NoError(t, err) t.Cleanup(cancel) diff --git a/coderd/prometheusmetrics/testdata/agent-stats.json b/coderd/prometheusmetrics/testdata/agent-stats.json index fc4467b5e3e7c..92c9f80c5200b 100644 --- a/coderd/prometheusmetrics/testdata/agent-stats.json +++ b/coderd/prometheusmetrics/testdata/agent-stats.json @@ -23,4 +23,4 @@ "testuser:workspace-3:example:coderd_agentstats_session_count_ssh": 20, "testuser:workspace-3:example:coderd_agentstats_session_count_vscode": 14, "testuser:workspace-3:example:coderd_agentstats_tx_bytes": 12 - } \ No newline at end of file +} From 0d37c85e4b6a4c0f1e1fe4827a46c833dbaba0de Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Fri, 14 Apr 2023 13:21:45 +0200 Subject: [PATCH 30/40] fix json --- coderd/prometheusmetrics/prometheusmetrics_test.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/coderd/prometheusmetrics/prometheusmetrics_test.go b/coderd/prometheusmetrics/prometheusmetrics_test.go index e6453c6bb990e..ad7e1ea6c12da 100644 --- a/coderd/prometheusmetrics/prometheusmetrics_test.go +++ b/coderd/prometheusmetrics/prometheusmetrics_test.go @@ -1,6 +1,7 @@ package prometheusmetrics_test import ( + "bytes" "context" "database/sql" "encoding/json" @@ -415,6 +416,7 @@ func TestAgentStats(t *testing.T) { // then goldenFile, err := os.ReadFile("testdata/agent-stats.json") require.NoError(t, err) + goldenFile = bytes.TrimSpace(goldenFile) collected := map[string]int{} var out []byte @@ -448,7 +450,7 @@ func TestAgentStats(t *testing.T) { } } - out, err = json.MarshalIndent(collected, " ", " ") + out, err = json.MarshalIndent(collected, "", " ") require.NoError(t, err) return executionSeconds && string(goldenFile) == string(out) From f752c6ffdd96b1dfb88696ed2379c99b907e14d6 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Fri, 14 Apr 2023 13:24:06 +0200 Subject: [PATCH 31/40] Rx Tx bytes --- coderd/database/dbfake/databasefake.go | 4 ++-- coderd/database/queries.sql.go | 14 +++++++------- coderd/database/queries/workspaceagentstats.sql | 6 +++--- coderd/prometheusmetrics/prometheusmetrics.go | 4 ++-- docs/cli/server.md | 2 +- 5 files changed, 15 insertions(+), 15 deletions(-) diff --git a/coderd/database/dbfake/databasefake.go b/coderd/database/dbfake/databasefake.go index adebfefbdb51f..7a26f3d39cad1 100644 --- a/coderd/database/dbfake/databasefake.go +++ b/coderd/database/dbfake/databasefake.go @@ -4031,8 +4031,8 @@ func (q *fakeQuerier) GetWorkspaceAgentStatsAndLabels(ctx context.Context, creat // Tx, Rx metrics for _, agentStat := range agentStatsCreatedAfter { stat := statByAgent[agentStat.AgentID] - stat.WorkspaceRxBytes += agentStat.RxBytes - stat.WorkspaceTxBytes += agentStat.TxBytes + stat.RxBytes += agentStat.RxBytes + stat.TxBytes += agentStat.TxBytes statByAgent[agentStat.AgentID] = stat } diff --git a/coderd/database/queries.sql.go b/coderd/database/queries.sql.go index ffdc5bcfb156b..29cc385db5ddf 100644 --- a/coderd/database/queries.sql.go +++ b/coderd/database/queries.sql.go @@ -6380,8 +6380,8 @@ WITH agent_stats AS ( user_id, agent_id, workspace_id, - coalesce(SUM(rx_bytes), 0)::bigint AS workspace_rx_bytes, - coalesce(SUM(tx_bytes), 0)::bigint AS workspace_tx_bytes + coalesce(SUM(rx_bytes), 0)::bigint AS rx_bytes, + coalesce(SUM(tx_bytes), 0)::bigint AS tx_bytes FROM workspace_agent_stats WHERE workspace_agent_stats.created_at > $1 GROUP BY user_id, agent_id, workspace_id @@ -6404,7 +6404,7 @@ WITH agent_stats AS ( GROUP BY a.user_id, a.agent_id, a.workspace_id ) SELECT - users.username, workspace_agents.name AS agent_name, workspaces.name AS workspace_name, workspace_rx_bytes, workspace_tx_bytes, + users.username, workspace_agents.name AS agent_name, workspaces.name AS workspace_name, rx_bytes, tx_bytes, session_count_vscode, session_count_ssh, session_count_jetbrains, session_count_reconnecting_pty, connection_count, connection_median_latency_ms FROM @@ -6431,8 +6431,8 @@ type GetWorkspaceAgentStatsAndLabelsRow struct { Username string `db:"username" json:"username"` AgentName string `db:"agent_name" json:"agent_name"` WorkspaceName string `db:"workspace_name" json:"workspace_name"` - WorkspaceRxBytes int64 `db:"workspace_rx_bytes" json:"workspace_rx_bytes"` - WorkspaceTxBytes int64 `db:"workspace_tx_bytes" json:"workspace_tx_bytes"` + RxBytes int64 `db:"rx_bytes" json:"rx_bytes"` + TxBytes int64 `db:"tx_bytes" json:"tx_bytes"` SessionCountVSCode int64 `db:"session_count_vscode" json:"session_count_vscode"` SessionCountSSH int64 `db:"session_count_ssh" json:"session_count_ssh"` SessionCountJetBrains int64 `db:"session_count_jetbrains" json:"session_count_jetbrains"` @@ -6454,8 +6454,8 @@ func (q *sqlQuerier) GetWorkspaceAgentStatsAndLabels(ctx context.Context, create &i.Username, &i.AgentName, &i.WorkspaceName, - &i.WorkspaceRxBytes, - &i.WorkspaceTxBytes, + &i.RxBytes, + &i.TxBytes, &i.SessionCountVSCode, &i.SessionCountSSH, &i.SessionCountJetBrains, diff --git a/coderd/database/queries/workspaceagentstats.sql b/coderd/database/queries/workspaceagentstats.sql index 520e003b10fea..4432fbcdaf663 100644 --- a/coderd/database/queries/workspaceagentstats.sql +++ b/coderd/database/queries/workspaceagentstats.sql @@ -110,8 +110,8 @@ WITH agent_stats AS ( user_id, agent_id, workspace_id, - coalesce(SUM(rx_bytes), 0)::bigint AS workspace_rx_bytes, - coalesce(SUM(tx_bytes), 0)::bigint AS workspace_tx_bytes + coalesce(SUM(rx_bytes), 0)::bigint AS rx_bytes, + coalesce(SUM(tx_bytes), 0)::bigint AS tx_bytes FROM workspace_agent_stats WHERE workspace_agent_stats.created_at > $1 GROUP BY user_id, agent_id, workspace_id @@ -134,7 +134,7 @@ WITH agent_stats AS ( GROUP BY a.user_id, a.agent_id, a.workspace_id ) SELECT - users.username, workspace_agents.name AS agent_name, workspaces.name AS workspace_name, workspace_rx_bytes, workspace_tx_bytes, + users.username, workspace_agents.name AS agent_name, workspaces.name AS workspace_name, rx_bytes, tx_bytes, session_count_vscode, session_count_ssh, session_count_jetbrains, session_count_reconnecting_pty, connection_count, connection_median_latency_ms FROM diff --git a/coderd/prometheusmetrics/prometheusmetrics.go b/coderd/prometheusmetrics/prometheusmetrics.go index 7abd129619a53..b199b73cfe4be 100644 --- a/coderd/prometheusmetrics/prometheusmetrics.go +++ b/coderd/prometheusmetrics/prometheusmetrics.go @@ -419,8 +419,8 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R logger.Error(ctx, "can't get agent stats", slog.Error(err)) } else { for _, agentStat := range stats { - agentStatsRxBytesGauge.WithLabelValues(VectorOperationAdd, float64(agentStat.WorkspaceRxBytes), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName) - agentStatsTxBytesGauge.WithLabelValues(VectorOperationAdd, float64(agentStat.WorkspaceTxBytes), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName) + agentStatsRxBytesGauge.WithLabelValues(VectorOperationAdd, float64(agentStat.RxBytes), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName) + agentStatsTxBytesGauge.WithLabelValues(VectorOperationAdd, float64(agentStat.TxBytes), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName) agentStatsConnectionCountGauge.WithLabelValues(VectorOperationSet, float64(agentStat.ConnectionCount), agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName) agentStatsConnectionMedianLatencyGauge.WithLabelValues(VectorOperationSet, agentStat.ConnectionMedianLatencyMS/1000.0 /* (to seconds) */, agentStat.AgentName, agentStat.Username, agentStat.WorkspaceName) diff --git a/docs/cli/server.md b/docs/cli/server.md index 5de02886e6bb0..e9a382dc598dd 100644 --- a/docs/cli/server.md +++ b/docs/cli/server.md @@ -563,7 +563,7 @@ The bind address to serve prometheus metrics. | Environment | $CODER_PROMETHEUS_COLLECT_AGENT_STATS | | YAML | introspection.prometheus.collect_agent_stats | -Collect agent stats (may increase charges for metrics storage) . +Collect agent stats (may increase charges for metrics storage). ### --prometheus-enable From 9c7aef8064dbbcb7df2a79f76a67cad9dd1626d3 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Fri, 14 Apr 2023 13:51:07 +0200 Subject: [PATCH 32/40] CloseFunc --- coderd/prometheusmetrics/prometheusmetrics.go | 12 +++++++++--- .../prometheusmetrics/prometheusmetrics_test.go | 15 +++++++-------- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/coderd/prometheusmetrics/prometheusmetrics.go b/coderd/prometheusmetrics/prometheusmetrics.go index b199b73cfe4be..dcbd2d9e4f79c 100644 --- a/coderd/prometheusmetrics/prometheusmetrics.go +++ b/coderd/prometheusmetrics/prometheusmetrics.go @@ -293,7 +293,7 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis return cancelFunc, nil } -func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, initialCreateAfter time.Time, duration time.Duration) (context.CancelFunc, error) { +func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, initialCreateAfter time.Time, duration time.Duration) (func(), error) { if duration == 0 { duration = 1 * time.Minute } @@ -398,10 +398,13 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R return nil, err } - createdAfter := initialCreateAfter ctx, cancelFunc := context.WithCancel(ctx) + done := make(chan struct{}) + + createdAfter := initialCreateAfter ticker := time.NewTicker(duration) go func() { + defer close(done) defer ticker.Stop() for { select { @@ -452,5 +455,8 @@ func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.R ticker.Reset(duration) } }() - return cancelFunc, nil + return func() { + cancelFunc() + <-done + }, nil } diff --git a/coderd/prometheusmetrics/prometheusmetrics_test.go b/coderd/prometheusmetrics/prometheusmetrics_test.go index ad7e1ea6c12da..3b78b04802c4d 100644 --- a/coderd/prometheusmetrics/prometheusmetrics_test.go +++ b/coderd/prometheusmetrics/prometheusmetrics_test.go @@ -367,9 +367,9 @@ func TestAgentStats(t *testing.T) { user := coderdtest.CreateFirstUser(t, client) - agent1, _ := prepareWorkspaceAndAgent(t, client, user, 1) - agent2, _ := prepareWorkspaceAndAgent(t, client, user, 2) - agent3, _ := prepareWorkspaceAndAgent(t, client, user, 3) + agent1 := prepareWorkspaceAndAgent(t, client, user, 1) + agent2 := prepareWorkspaceAndAgent(t, client, user, 2) + agent3 := prepareWorkspaceAndAgent(t, client, user, 3) registry := prometheus.NewRegistry() @@ -409,9 +409,9 @@ func TestAgentStats(t *testing.T) { // // Set initialCreateAfter to some time in the past, so that AgentStats would include all above PostStats, // and it doesn't depend on the real time. - cancel, err := prometheusmetrics.AgentStats(ctx, slogtest.Make(t, nil), registry, db, time.Now().Add(-time.Minute), time.Millisecond) + closeFunc, err := prometheusmetrics.AgentStats(ctx, slogtest.Make(t, nil), registry, db, time.Now().Add(-time.Minute), time.Millisecond) require.NoError(t, err) - t.Cleanup(cancel) + t.Cleanup(closeFunc) // then goldenFile, err := os.ReadFile("testdata/agent-stats.json") @@ -460,7 +460,7 @@ func TestAgentStats(t *testing.T) { assert.Equal(t, string(goldenFile), string(out)) } -func prepareWorkspaceAndAgent(t *testing.T, client *codersdk.Client, user codersdk.CreateFirstUserResponse, workspaceNum int) (*agentsdk.Client, codersdk.Workspace) { +func prepareWorkspaceAndAgent(t *testing.T, client *codersdk.Client, user codersdk.CreateFirstUserResponse, workspaceNum int) *agentsdk.Client { authToken := uuid.NewString() version := coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, &echo.Responses{ @@ -477,6 +477,5 @@ func prepareWorkspaceAndAgent(t *testing.T, client *codersdk.Client, user coders agentClient := agentsdk.New(client.URL) agentClient.SetSessionToken(authToken) - - return agentClient, workspace + return agentClient } From 52905717a2658a7eb976df6dd1153c223ba8cbf6 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Fri, 14 Apr 2023 13:57:06 +0200 Subject: [PATCH 33/40] fix --- coderd/prometheusmetrics/prometheusmetrics.go | 29 +++++++++++++++---- .../prometheusmetrics_test.go | 12 ++++---- 2 files changed, 29 insertions(+), 12 deletions(-) diff --git a/coderd/prometheusmetrics/prometheusmetrics.go b/coderd/prometheusmetrics/prometheusmetrics.go index dcbd2d9e4f79c..512b737c94a43 100644 --- a/coderd/prometheusmetrics/prometheusmetrics.go +++ b/coderd/prometheusmetrics/prometheusmetrics.go @@ -23,7 +23,7 @@ import ( ) // ActiveUsers tracks the number of users that have authenticated within the past hour. -func ActiveUsers(ctx context.Context, registerer prometheus.Registerer, db database.Store, duration time.Duration) (context.CancelFunc, error) { +func ActiveUsers(ctx context.Context, registerer prometheus.Registerer, db database.Store, duration time.Duration) (func(), error) { if duration == 0 { duration = 5 * time.Minute } @@ -40,8 +40,10 @@ func ActiveUsers(ctx context.Context, registerer prometheus.Registerer, db datab } ctx, cancelFunc := context.WithCancel(ctx) + done := make(chan struct{}) ticker := time.NewTicker(duration) go func() { + defer close(done) defer ticker.Stop() for { select { @@ -61,11 +63,14 @@ func ActiveUsers(ctx context.Context, registerer prometheus.Registerer, db datab gauge.Set(float64(len(distinctUsers))) } }() - return cancelFunc, nil + return func() { + cancelFunc() + <-done + }, nil } // Workspaces tracks the total number of workspaces with labels on status. -func Workspaces(ctx context.Context, registerer prometheus.Registerer, db database.Store, duration time.Duration) (context.CancelFunc, error) { +func Workspaces(ctx context.Context, registerer prometheus.Registerer, db database.Store, duration time.Duration) (func(), error) { if duration == 0 { duration = 5 * time.Minute } @@ -85,8 +90,11 @@ func Workspaces(ctx context.Context, registerer prometheus.Registerer, db databa gauge.WithLabelValues("pending").Set(0) ctx, cancelFunc := context.WithCancel(ctx) + done := make(chan struct{}) + ticker := time.NewTicker(duration) go func() { + defer close(done) defer ticker.Stop() for { select { @@ -115,11 +123,14 @@ func Workspaces(ctx context.Context, registerer prometheus.Registerer, db databa } } }() - return cancelFunc, nil + return func() { + cancelFunc() + <-done + }, nil } // Agents tracks the total number of workspaces with labels on status. -func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, coordinator *atomic.Pointer[tailnet.Coordinator], derpMap *tailcfg.DERPMap, agentInactiveDisconnectTimeout, duration time.Duration) (context.CancelFunc, error) { +func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, coordinator *atomic.Pointer[tailnet.Coordinator], derpMap *tailcfg.DERPMap, agentInactiveDisconnectTimeout, duration time.Duration) (func(), error) { if duration == 0 { duration = 1 * time.Minute } @@ -182,8 +193,11 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis // nolint:gocritic // Prometheus must collect metrics for all Coder users. ctx, cancelFunc := context.WithCancel(dbauthz.AsSystemRestricted(ctx)) + done := make(chan struct{}) + ticker := time.NewTicker(duration) go func() { + defer close(done) defer ticker.Stop() for { select { @@ -290,7 +304,10 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis ticker.Reset(duration) } }() - return cancelFunc, nil + return func() { + cancelFunc() + <-done + }, nil } func AgentStats(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, initialCreateAfter time.Time, duration time.Duration) (func(), error) { diff --git a/coderd/prometheusmetrics/prometheusmetrics_test.go b/coderd/prometheusmetrics/prometheusmetrics_test.go index 3b78b04802c4d..9021bd3da111e 100644 --- a/coderd/prometheusmetrics/prometheusmetrics_test.go +++ b/coderd/prometheusmetrics/prometheusmetrics_test.go @@ -90,9 +90,9 @@ func TestActiveUsers(t *testing.T) { t.Run(tc.Name, func(t *testing.T) { t.Parallel() registry := prometheus.NewRegistry() - cancel, err := prometheusmetrics.ActiveUsers(context.Background(), registry, tc.Database(t), time.Millisecond) + closeFunc, err := prometheusmetrics.ActiveUsers(context.Background(), registry, tc.Database(t), time.Millisecond) require.NoError(t, err) - t.Cleanup(cancel) + t.Cleanup(closeFunc) require.Eventually(t, func() bool { metrics, err := registry.Gather() @@ -222,9 +222,9 @@ func TestWorkspaces(t *testing.T) { t.Run(tc.Name, func(t *testing.T) { t.Parallel() registry := prometheus.NewRegistry() - cancel, err := prometheusmetrics.Workspaces(context.Background(), registry, tc.Database(), time.Millisecond) + closeFunc, err := prometheusmetrics.Workspaces(context.Background(), registry, tc.Database(), time.Millisecond) require.NoError(t, err) - t.Cleanup(cancel) + t.Cleanup(closeFunc) require.Eventually(t, func() bool { metrics, err := registry.Gather() @@ -306,8 +306,8 @@ func TestAgents(t *testing.T) { registry := prometheus.NewRegistry() // when - cancel, err := prometheusmetrics.Agents(context.Background(), slogtest.Make(t, nil), registry, db, &coordinatorPtr, derpMap, agentInactiveDisconnectTimeout, time.Millisecond) - t.Cleanup(cancel) + closeFunc, err := prometheusmetrics.Agents(context.Background(), slogtest.Make(t, nil), registry, db, &coordinatorPtr, derpMap, agentInactiveDisconnectTimeout, time.Millisecond) + t.Cleanup(closeFunc) // then require.NoError(t, err) From 1cbe59bfa8870f4683fb7ab808dab4a71cf71d3f Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Fri, 14 Apr 2023 14:19:02 +0200 Subject: [PATCH 34/40] fix --- coderd/prometheusmetrics/prometheusmetrics_test.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/coderd/prometheusmetrics/prometheusmetrics_test.go b/coderd/prometheusmetrics/prometheusmetrics_test.go index 9021bd3da111e..fd6aa014906d1 100644 --- a/coderd/prometheusmetrics/prometheusmetrics_test.go +++ b/coderd/prometheusmetrics/prometheusmetrics_test.go @@ -307,11 +307,10 @@ func TestAgents(t *testing.T) { // when closeFunc, err := prometheusmetrics.Agents(context.Background(), slogtest.Make(t, nil), registry, db, &coordinatorPtr, derpMap, agentInactiveDisconnectTimeout, time.Millisecond) + require.NoError(t, err) t.Cleanup(closeFunc) // then - require.NoError(t, err) - var agentsUp bool var agentsConnections bool var agentsApps bool From f8f11ebef255e61740405a41e7fdc46f7a3cdbae Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Fri, 14 Apr 2023 14:43:20 +0200 Subject: [PATCH 35/40] Fixes --- coderd/prometheusmetrics/prometheusmetrics.go | 3 ++- coderd/prometheusmetrics/prometheusmetrics_test.go | 5 ++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/coderd/prometheusmetrics/prometheusmetrics.go b/coderd/prometheusmetrics/prometheusmetrics.go index 512b737c94a43..cfc64122cd3d5 100644 --- a/coderd/prometheusmetrics/prometheusmetrics.go +++ b/coderd/prometheusmetrics/prometheusmetrics.go @@ -191,8 +191,9 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis return nil, err } + ctx, cancelFunc := context.WithCancel(ctx) // nolint:gocritic // Prometheus must collect metrics for all Coder users. - ctx, cancelFunc := context.WithCancel(dbauthz.AsSystemRestricted(ctx)) + ctx = dbauthz.AsSystemRestricted(ctx) done := make(chan struct{}) ticker := time.NewTicker(duration) diff --git a/coderd/prometheusmetrics/prometheusmetrics_test.go b/coderd/prometheusmetrics/prometheusmetrics_test.go index fd6aa014906d1..ef06e696539a3 100644 --- a/coderd/prometheusmetrics/prometheusmetrics_test.go +++ b/coderd/prometheusmetrics/prometheusmetrics_test.go @@ -305,8 +305,11 @@ func TestAgents(t *testing.T) { agentInactiveDisconnectTimeout := 1 * time.Hour // don't need to focus on this value in tests registry := prometheus.NewRegistry() + ctx, cancelFunc := context.WithCancel(context.Background()) + defer cancelFunc() + // when - closeFunc, err := prometheusmetrics.Agents(context.Background(), slogtest.Make(t, nil), registry, db, &coordinatorPtr, derpMap, agentInactiveDisconnectTimeout, time.Millisecond) + closeFunc, err := prometheusmetrics.Agents(ctx, slogtest.Make(t, nil), registry, db, &coordinatorPtr, derpMap, agentInactiveDisconnectTimeout, 50*time.Millisecond) require.NoError(t, err) t.Cleanup(closeFunc) From 4ffae113c8e93df41214ec3f4fd45d02e977bd8b Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Fri, 14 Apr 2023 14:56:42 +0200 Subject: [PATCH 36/40] fix --- coderd/prometheusmetrics/prometheusmetrics_test.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/coderd/prometheusmetrics/prometheusmetrics_test.go b/coderd/prometheusmetrics/prometheusmetrics_test.go index ef06e696539a3..6424e87daac82 100644 --- a/coderd/prometheusmetrics/prometheusmetrics_test.go +++ b/coderd/prometheusmetrics/prometheusmetrics_test.go @@ -309,7 +309,7 @@ func TestAgents(t *testing.T) { defer cancelFunc() // when - closeFunc, err := prometheusmetrics.Agents(ctx, slogtest.Make(t, nil), registry, db, &coordinatorPtr, derpMap, agentInactiveDisconnectTimeout, 50*time.Millisecond) + closeFunc, err := prometheusmetrics.Agents(ctx, slogtest.Make(t, nil), registry, db, &coordinatorPtr, derpMap, agentInactiveDisconnectTimeout, time.Millisecond) require.NoError(t, err) t.Cleanup(closeFunc) @@ -413,7 +413,6 @@ func TestAgentStats(t *testing.T) { // and it doesn't depend on the real time. closeFunc, err := prometheusmetrics.AgentStats(ctx, slogtest.Make(t, nil), registry, db, time.Now().Add(-time.Minute), time.Millisecond) require.NoError(t, err) - t.Cleanup(closeFunc) // then goldenFile, err := os.ReadFile("testdata/agent-stats.json") @@ -460,6 +459,8 @@ func TestAgentStats(t *testing.T) { // Keep this assertion, so that "go test" can print differences instead of "Condition never satisfied" assert.Equal(t, string(goldenFile), string(out)) + + closeFunc() } func prepareWorkspaceAndAgent(t *testing.T, client *codersdk.Client, user codersdk.CreateFirstUserResponse, workspaceNum int) *agentsdk.Client { From 7ba16b58525e12474af7ef3541f8d65026961d52 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Fri, 14 Apr 2023 15:02:21 +0200 Subject: [PATCH 37/40] fix: IgnoreErrors --- coderd/prometheusmetrics/prometheusmetrics_test.go | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/coderd/prometheusmetrics/prometheusmetrics_test.go b/coderd/prometheusmetrics/prometheusmetrics_test.go index 6424e87daac82..40e03d6765831 100644 --- a/coderd/prometheusmetrics/prometheusmetrics_test.go +++ b/coderd/prometheusmetrics/prometheusmetrics_test.go @@ -309,7 +309,9 @@ func TestAgents(t *testing.T) { defer cancelFunc() // when - closeFunc, err := prometheusmetrics.Agents(ctx, slogtest.Make(t, nil), registry, db, &coordinatorPtr, derpMap, agentInactiveDisconnectTimeout, time.Millisecond) + closeFunc, err := prometheusmetrics.Agents(ctx, slogtest.Make(t, &slogtest.Options{ + IgnoreErrors: true, + }), registry, db, &coordinatorPtr, derpMap, agentInactiveDisconnectTimeout, time.Millisecond) require.NoError(t, err) t.Cleanup(closeFunc) @@ -411,7 +413,9 @@ func TestAgentStats(t *testing.T) { // // Set initialCreateAfter to some time in the past, so that AgentStats would include all above PostStats, // and it doesn't depend on the real time. - closeFunc, err := prometheusmetrics.AgentStats(ctx, slogtest.Make(t, nil), registry, db, time.Now().Add(-time.Minute), time.Millisecond) + closeFunc, err := prometheusmetrics.AgentStats(ctx, slogtest.Make(t, &slogtest.Options{ + IgnoreErrors: true, + }), registry, db, time.Now().Add(-time.Minute), time.Millisecond) require.NoError(t, err) // then From 2a4c67457869fb92f923fb997423027fed9f9f2a Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Fri, 14 Apr 2023 15:18:01 +0200 Subject: [PATCH 38/40] Fix: Windows --- coderd/prometheusmetrics/prometheusmetrics_test.go | 1 + 1 file changed, 1 insertion(+) diff --git a/coderd/prometheusmetrics/prometheusmetrics_test.go b/coderd/prometheusmetrics/prometheusmetrics_test.go index 40e03d6765831..105c9f9fc533f 100644 --- a/coderd/prometheusmetrics/prometheusmetrics_test.go +++ b/coderd/prometheusmetrics/prometheusmetrics_test.go @@ -457,6 +457,7 @@ func TestAgentStats(t *testing.T) { out, err = json.MarshalIndent(collected, "", " ") require.NoError(t, err) + out = bytes.ReplaceAll(out, []byte("\r\n"), []byte("\n")) // comparison fix for Windows return executionSeconds && string(goldenFile) == string(out) }, testutil.WaitShort, testutil.IntervalFast) From 201da835ad76244dfe4010bf6d2eafcb1b5f06d8 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Fri, 14 Apr 2023 15:42:18 +0200 Subject: [PATCH 39/40] fix --- coderd/prometheusmetrics/prometheusmetrics_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/coderd/prometheusmetrics/prometheusmetrics_test.go b/coderd/prometheusmetrics/prometheusmetrics_test.go index 105c9f9fc533f..43ced298b208e 100644 --- a/coderd/prometheusmetrics/prometheusmetrics_test.go +++ b/coderd/prometheusmetrics/prometheusmetrics_test.go @@ -457,7 +457,7 @@ func TestAgentStats(t *testing.T) { out, err = json.MarshalIndent(collected, "", " ") require.NoError(t, err) - out = bytes.ReplaceAll(out, []byte("\r\n"), []byte("\n")) // comparison fix for Windows + out = bytes.ReplaceAll(out, []byte{'\r', '\n'}, []byte{'\n'}) // comparison fix for Windows return executionSeconds && string(goldenFile) == string(out) }, testutil.WaitShort, testutil.IntervalFast) From ba52c453aba0b148411d308e1e3de92995250243 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Fri, 14 Apr 2023 15:50:09 +0200 Subject: [PATCH 40/40] reflect.DeepEquals --- .../prometheusmetrics_test.go | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/coderd/prometheusmetrics/prometheusmetrics_test.go b/coderd/prometheusmetrics/prometheusmetrics_test.go index 43ced298b208e..56d32cc6dd6de 100644 --- a/coderd/prometheusmetrics/prometheusmetrics_test.go +++ b/coderd/prometheusmetrics/prometheusmetrics_test.go @@ -1,12 +1,12 @@ package prometheusmetrics_test import ( - "bytes" "context" "database/sql" "encoding/json" "fmt" "os" + "reflect" "sync/atomic" "testing" "time" @@ -417,14 +417,16 @@ func TestAgentStats(t *testing.T) { IgnoreErrors: true, }), registry, db, time.Now().Add(-time.Minute), time.Millisecond) require.NoError(t, err) + t.Cleanup(closeFunc) // then goldenFile, err := os.ReadFile("testdata/agent-stats.json") require.NoError(t, err) - goldenFile = bytes.TrimSpace(goldenFile) + golden := map[string]int{} + err = json.Unmarshal(goldenFile, &golden) + require.NoError(t, err) collected := map[string]int{} - var out []byte var executionSeconds bool assert.Eventually(t, func() bool { metrics, err := registry.Gather() @@ -454,18 +456,11 @@ func TestAgentStats(t *testing.T) { require.FailNowf(t, "unexpected metric collected", "metric: %s", metric.GetName()) } } - - out, err = json.MarshalIndent(collected, "", " ") - require.NoError(t, err) - out = bytes.ReplaceAll(out, []byte{'\r', '\n'}, []byte{'\n'}) // comparison fix for Windows - - return executionSeconds && string(goldenFile) == string(out) + return executionSeconds && reflect.DeepEqual(golden, collected) }, testutil.WaitShort, testutil.IntervalFast) // Keep this assertion, so that "go test" can print differences instead of "Condition never satisfied" - assert.Equal(t, string(goldenFile), string(out)) - - closeFunc() + assert.EqualValues(t, golden, collected) } func prepareWorkspaceAndAgent(t *testing.T, client *codersdk.Client, user codersdk.CreateFirstUserResponse, workspaceNum int) *agentsdk.Client {