From 8d4e67d3a07ff3e40d62139fcd71f8488aa31884 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Mon, 3 Apr 2023 21:43:18 +0200 Subject: [PATCH 01/15] WIP --- cli/server.go | 6 ++ coderd/prometheusmetrics/prometheusmetrics.go | 75 +++++++++++++++++++ 2 files changed, 81 insertions(+) diff --git a/cli/server.go b/cli/server.go index b6fa7c31b647c..5ce53af185e1d 100644 --- a/cli/server.go +++ b/cli/server.go @@ -868,6 +868,12 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd. } defer closeWorkspacesFunc() + closeAgentsFunc, err := prometheusmetrics.Agents(ctx, options.PrometheusRegistry, options.Database, 0) + if err != nil { + return xerrors.Errorf("register agents prometheus metric: %w", err) + } + defer closeAgentsFunc() + //nolint:revive defer serveHandler(ctx, logger, promhttp.InstrumentMetricHandler( options.PrometheusRegistry, promhttp.HandlerFor(options.PrometheusRegistry, promhttp.HandlerOpts{}), diff --git a/coderd/prometheusmetrics/prometheusmetrics.go b/coderd/prometheusmetrics/prometheusmetrics.go index 536522bf73e04..2c1c36dddbc34 100644 --- a/coderd/prometheusmetrics/prometheusmetrics.go +++ b/coderd/prometheusmetrics/prometheusmetrics.go @@ -106,3 +106,78 @@ func Workspaces(ctx context.Context, registerer prometheus.Registerer, db databa }() return cancelFunc, nil } + +// Agents tracks the total number of workspaces with labels on status. +func Agents(ctx context.Context, registerer prometheus.Registerer, db database.Store, duration time.Duration) (context.CancelFunc, error) { + if duration == 0 { + duration = 15 * time.Second // TODO 5 * time.Minute + } + + agentsConnectionGauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: "coderd", + Subsystem: "agents", + Name: "connection", + Help: "The agent connection with a status.", + }, []string{"agent_name", "workspace_name", "status"}) + err := registerer.Register(agentsConnectionGauge) + if err != nil { + return nil, err + } + + agentsUserLatenciesHistogram := prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Namespace: "coderd", + Subsystem: "agents", + Name: "user_latencies_seconds", + Help: "The user's agent latency in seconds.", + Buckets: []float64{0.001, 0.005, 0.010, 0.025, 0.050, 0.100, 0.500, 1, 5, 10, 30}, + }, []string{"agent_id", "workspace", "connection_type", "ide"}) + err = registerer.Register(agentsUserLatenciesHistogram) + if err != nil { + return nil, err + } + + ctx, cancelFunc := context.WithCancel(ctx) + ticker := time.NewTicker(duration) + go func() { + defer ticker.Stop() + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + } + + // FIXME Optimize this routine: SQL db calls + + builds, err := db.GetLatestWorkspaceBuilds(ctx) + if err != nil { + continue + } + + agentsConnectionGauge.Reset() + for _, build := range builds { + workspace, err := db.GetWorkspaceByID(ctx, build.WorkspaceID) + if err != nil { + continue + } + + agents, err := db.GetWorkspaceAgentsInLatestBuildByWorkspaceID(ctx, build.WorkspaceID) + if err != nil { + continue + } + + if len(agents) == 0 { + continue + } + + for _, agent := range agents { + connectionStatus := agent.Status(6 * time.Second) + + // FIXME AgentInactiveDisconnectTimeout + agentsConnectionGauge.WithLabelValues(agent.Name, workspace.Name, string(connectionStatus.Status)).Set(1) + } + } + } + }() + return cancelFunc, nil +} From 9ad09b20f8e830c346eb3e41c4eaf4c9e7d29475 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Tue, 4 Apr 2023 19:47:27 +0200 Subject: [PATCH 02/15] WIP --- cli/server.go | 20 
++++--- coderd/prometheusmetrics/prometheusmetrics.go | 60 +++++++++++++++++-- 2 files changed, 66 insertions(+), 14 deletions(-) diff --git a/cli/server.go b/cli/server.go index 30ceb06aefc46..b3961b52e4e53 100644 --- a/cli/server.go +++ b/cli/server.go @@ -849,6 +849,16 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd. defer options.Telemetry.Close() } + databaseStoreWithoutAuth := options.Database + + // We use a separate coderAPICloser so the Enterprise API + // can have it's own close functions. This is cleaner + // than abstracting the Coder API itself. + coderAPI, coderAPICloser, err := newAPI(ctx, options) + if err != nil { + return xerrors.Errorf("create coder API: %w", err) + } + // This prevents the pprof import from being accidentally deleted. _ = pprof.Handler if cfg.Pprof.Enable { @@ -871,7 +881,7 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd. } defer closeWorkspacesFunc() - closeAgentsFunc, err := prometheusmetrics.Agents(ctx, options.PrometheusRegistry, options.Database, 0) + closeAgentsFunc, err := prometheusmetrics.Agents(ctx, options.PrometheusRegistry, databaseStoreWithoutAuth, &coderAPI.TailnetCoordinator, options.DERPMap, 0) if err != nil { return xerrors.Errorf("register agents prometheus metric: %w", err) } @@ -887,14 +897,6 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd. options.SwaggerEndpoint = cfg.Swagger.Enable.Value() } - // We use a separate coderAPICloser so the Enterprise API - // can have it's own close functions. This is cleaner - // than abstracting the Coder API itself. - coderAPI, coderAPICloser, err := newAPI(ctx, options) - if err != nil { - return xerrors.Errorf("create coder API: %w", err) - } - client := codersdk.New(localURL) if localURL.Scheme == "https" && isLocalhost(localURL.Hostname()) { // The certificate will likely be self-signed or for a different diff --git a/coderd/prometheusmetrics/prometheusmetrics.go b/coderd/prometheusmetrics/prometheusmetrics.go index 2c1c36dddbc34..5ce6fbd51f6c5 100644 --- a/coderd/prometheusmetrics/prometheusmetrics.go +++ b/coderd/prometheusmetrics/prometheusmetrics.go @@ -2,13 +2,20 @@ package prometheusmetrics import ( "context" + "fmt" + "log" + "strconv" + "strings" + "sync/atomic" "time" "github.com/google/uuid" "github.com/prometheus/client_golang/prometheus" + "tailscale.com/tailcfg" "github.com/coder/coder/coderd" "github.com/coder/coder/coderd/database" + "github.com/coder/coder/tailnet" ) // ActiveUsers tracks the number of users that have authenticated within the past hour. @@ -108,7 +115,7 @@ func Workspaces(ctx context.Context, registerer prometheus.Registerer, db databa } // Agents tracks the total number of workspaces with labels on status. 
-func Agents(ctx context.Context, registerer prometheus.Registerer, db database.Store, duration time.Duration) (context.CancelFunc, error) { +func Agents(ctx context.Context, registerer prometheus.Registerer, db database.Store, coordinator *atomic.Pointer[tailnet.Coordinator], derpMap *tailcfg.DERPMap, duration time.Duration) (context.CancelFunc, error) { if duration == 0 { duration = 15 * time.Second // TODO 5 * time.Minute } @@ -124,23 +131,26 @@ func Agents(ctx context.Context, registerer prometheus.Registerer, db database.S return nil, err } - agentsUserLatenciesHistogram := prometheus.NewHistogramVec(prometheus.HistogramOpts{ + agentsUserLatenciesGauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: "coderd", Subsystem: "agents", Name: "user_latencies_seconds", Help: "The user's agent latency in seconds.", - Buckets: []float64{0.001, 0.005, 0.010, 0.025, 0.050, 0.100, 0.500, 1, 5, 10, 30}, - }, []string{"agent_id", "workspace", "connection_type", "ide"}) - err = registerer.Register(agentsUserLatenciesHistogram) + }, []string{"agent_id", "workspace_name", "derp_region", "preferred"}) + err = registerer.Register(agentsUserLatenciesGauge) if err != nil { return nil, err } + // FIXME connection_type ide + ctx, cancelFunc := context.WithCancel(ctx) ticker := time.NewTicker(duration) go func() { defer ticker.Stop() for { + log.Println("Agents!!!") + select { case <-ctx.Done(): return @@ -151,18 +161,22 @@ func Agents(ctx context.Context, registerer prometheus.Registerer, db database.S builds, err := db.GetLatestWorkspaceBuilds(ctx) if err != nil { + log.Println("1", err) continue } agentsConnectionGauge.Reset() + agentsUserLatenciesGauge.Reset() for _, build := range builds { workspace, err := db.GetWorkspaceByID(ctx, build.WorkspaceID) if err != nil { + log.Println("2", err) continue } agents, err := db.GetWorkspaceAgentsInLatestBuildByWorkspaceID(ctx, build.WorkspaceID) if err != nil { + log.Println("3", err) continue } @@ -170,11 +184,47 @@ func Agents(ctx context.Context, registerer prometheus.Registerer, db database.S continue } + // FIXME publish workspace even if no agents + for _, agent := range agents { connectionStatus := agent.Status(6 * time.Second) // FIXME AgentInactiveDisconnectTimeout + log.Println("with value " + agent.Name) agentsConnectionGauge.WithLabelValues(agent.Name, workspace.Name, string(connectionStatus.Status)).Set(1) + + node := (*coordinator.Load()).Node(agent.ID) + if node != nil { + log.Println("coordinator") + + for rawRegion, latency := range node.DERPLatency { + log.Println(rawRegion, latency) + + regionParts := strings.SplitN(rawRegion, "-", 2) + regionID, err := strconv.Atoi(regionParts[0]) + if err != nil { + continue // xerrors.Errorf("convert derp region id %q: %w", rawRegion, err) + } + region, found := derpMap.Regions[regionID] + if !found { + // It's possible that a workspace agent is using an old DERPMap + // and reports regions that do not exist. If that's the case, + // report the region as unknown! + region = &tailcfg.DERPRegion{ + RegionID: regionID, + RegionName: fmt.Sprintf("Unnamed %d", regionID), + } + } + + log.Println(region, latency) + agentsUserLatenciesGauge.WithLabelValues(agent.Name, workspace.Name, region.RegionName, fmt.Sprintf("%v", node.PreferredDERP == regionID)).Set(latency) + } + } else { + log.Println("node is null") + } + + // FIXME publish agent even if DERP is missing + // FIXME IDE? 
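For readers following the DERP latency handling added in this hunk: each key of node.DERPLatency carries the numeric DERP region ID before its first "-", and regions absent from the server's DERP map are reported under a synthetic "Unnamed <id>" name rather than being dropped. A minimal standalone sketch of that lookup, assuming the key and region values below purely for illustration (they are not taken from the patch):

package main

import (
	"fmt"
	"strconv"
	"strings"

	"tailscale.com/tailcfg"
)

// regionNameForLatencyKey mirrors the lookup in the hunk above: the part of the
// latency key before the first "-" is parsed as the numeric DERP region ID, and
// regions missing from the DERP map fall back to an "Unnamed <id>" name.
func regionNameForLatencyKey(derpMap *tailcfg.DERPMap, rawRegion string) (string, error) {
	regionParts := strings.SplitN(rawRegion, "-", 2)
	regionID, err := strconv.Atoi(regionParts[0])
	if err != nil {
		return "", fmt.Errorf("convert derp region id %q: %w", rawRegion, err)
	}
	region, found := derpMap.Regions[regionID]
	if !found {
		// Old agents may report regions that no longer exist in the DERP map.
		return fmt.Sprintf("Unnamed %d", regionID), nil
	}
	return region.RegionName, nil
}

func main() {
	// Illustrative DERP map with a single region.
	derpMap := &tailcfg.DERPMap{Regions: map[int]*tailcfg.DERPRegion{
		999: {RegionID: 999, RegionName: "Coder Embedded Relay"},
	}}
	for _, key := range []string{"999-coder", "10-retired"} {
		name, err := regionNameForLatencyKey(derpMap, key)
		fmt.Println(key, name, err)
	}
}

This is why an agent that still reports a retired region shows up with an "Unnamed" derp_region label instead of being skipped, matching the "report the region as unknown" comment above.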
} } } From 440657c16195948c17a72751193043ce44a15531 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Wed, 5 Apr 2023 10:43:30 +0200 Subject: [PATCH 03/15] WIP --- coderd/prometheusmetrics/prometheusmetrics.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/coderd/prometheusmetrics/prometheusmetrics.go b/coderd/prometheusmetrics/prometheusmetrics.go index 5ce6fbd51f6c5..d1af83a4a4439 100644 --- a/coderd/prometheusmetrics/prometheusmetrics.go +++ b/coderd/prometheusmetrics/prometheusmetrics.go @@ -190,6 +190,8 @@ func Agents(ctx context.Context, registerer prometheus.Registerer, db database.S connectionStatus := agent.Status(6 * time.Second) // FIXME AgentInactiveDisconnectTimeout + // ? connection_timeout_seconds + // obok latency lifecycle_state log.Println("with value " + agent.Name) agentsConnectionGauge.WithLabelValues(agent.Name, workspace.Name, string(connectionStatus.Status)).Set(1) @@ -225,6 +227,7 @@ func Agents(ctx context.Context, registerer prometheus.Registerer, db database.S // FIXME publish agent even if DERP is missing // FIXME IDE? + // FIXME agent connection zero } } } From 8764f8975d75ebb45d9e5996fac9b7509c953e33 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Wed, 5 Apr 2023 12:08:27 +0200 Subject: [PATCH 04/15] Agents --- cli/server.go | 33 ++--- coderd/prometheusmetrics/prometheusmetrics.go | 133 ++++++++++-------- 2 files changed, 92 insertions(+), 74 deletions(-) diff --git a/cli/server.go b/cli/server.go index b3961b52e4e53..c93064f34c8ef 100644 --- a/cli/server.go +++ b/cli/server.go @@ -849,16 +849,6 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd. defer options.Telemetry.Close() } - databaseStoreWithoutAuth := options.Database - - // We use a separate coderAPICloser so the Enterprise API - // can have it's own close functions. This is cleaner - // than abstracting the Coder API itself. - coderAPI, coderAPICloser, err := newAPI(ctx, options) - if err != nil { - return xerrors.Errorf("create coder API: %w", err) - } - // This prevents the pprof import from being accidentally deleted. _ = pprof.Handler if cfg.Pprof.Enable { @@ -881,12 +871,6 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd. } defer closeWorkspacesFunc() - closeAgentsFunc, err := prometheusmetrics.Agents(ctx, options.PrometheusRegistry, databaseStoreWithoutAuth, &coderAPI.TailnetCoordinator, options.DERPMap, 0) - if err != nil { - return xerrors.Errorf("register agents prometheus metric: %w", err) - } - defer closeAgentsFunc() - //nolint:revive defer serveHandler(ctx, logger, promhttp.InstrumentMetricHandler( options.PrometheusRegistry, promhttp.HandlerFor(options.PrometheusRegistry, promhttp.HandlerOpts{}), @@ -897,6 +881,23 @@ func (r *RootCmd) Server(newAPI func(context.Context, *coderd.Options) (*coderd. options.SwaggerEndpoint = cfg.Swagger.Enable.Value() } + // We use a separate coderAPICloser so the Enterprise API + // can have it's own close functions. This is cleaner + // than abstracting the Coder API itself. + coderAPI, coderAPICloser, err := newAPI(ctx, options) + if err != nil { + return xerrors.Errorf("create coder API: %w", err) + } + + if cfg.Prometheus.Enable { + // Agent metrics require reference to the tailnet coordinator, so must be initiated after Coder API. 
+ closeAgentsFunc, err := prometheusmetrics.Agents(ctx, logger, options.PrometheusRegistry, coderAPI.Database, &coderAPI.TailnetCoordinator, options.DERPMap, coderAPI.Options.AgentInactiveDisconnectTimeout, 0) + if err != nil { + return xerrors.Errorf("register agents prometheus metric: %w", err) + } + defer closeAgentsFunc() + } + client := codersdk.New(localURL) if localURL.Scheme == "https" && isLocalhost(localURL.Hostname()) { // The certificate will likely be self-signed or for a different diff --git a/coderd/prometheusmetrics/prometheusmetrics.go b/coderd/prometheusmetrics/prometheusmetrics.go index d1af83a4a4439..31a0b712a1511 100644 --- a/coderd/prometheusmetrics/prometheusmetrics.go +++ b/coderd/prometheusmetrics/prometheusmetrics.go @@ -3,7 +3,6 @@ package prometheusmetrics import ( "context" "fmt" - "log" "strconv" "strings" "sync/atomic" @@ -13,8 +12,11 @@ import ( "github.com/prometheus/client_golang/prometheus" "tailscale.com/tailcfg" + "cdr.dev/slog" + "github.com/coder/coder/coderd" "github.com/coder/coder/coderd/database" + "github.com/coder/coder/coderd/database/dbauthz" "github.com/coder/coder/tailnet" ) @@ -115,119 +117,134 @@ func Workspaces(ctx context.Context, registerer prometheus.Registerer, db databa } // Agents tracks the total number of workspaces with labels on status. -func Agents(ctx context.Context, registerer prometheus.Registerer, db database.Store, coordinator *atomic.Pointer[tailnet.Coordinator], derpMap *tailcfg.DERPMap, duration time.Duration) (context.CancelFunc, error) { +func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, coordinator *atomic.Pointer[tailnet.Coordinator], derpMap *tailcfg.DERPMap, agentInactiveDisconnectTimeout, duration time.Duration) (context.CancelFunc, error) { if duration == 0 { duration = 15 * time.Second // TODO 5 * time.Minute } - agentsConnectionGauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{ + workspaceAgentsGauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: "coderd", Subsystem: "agents", - Name: "connection", - Help: "The agent connection with a status.", - }, []string{"agent_name", "workspace_name", "status"}) - err := registerer.Register(agentsConnectionGauge) + Name: "up", + Help: "The number of active agents per workspace.", + }, []string{"username", "workspace_name"}) + err := registerer.Register(workspaceAgentsGauge) if err != nil { return nil, err } - agentsUserLatenciesGauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{ + agentsConnectionGauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: "coderd", Subsystem: "agents", - Name: "user_latencies_seconds", - Help: "The user's agent latency in seconds.", - }, []string{"agent_id", "workspace_name", "derp_region", "preferred"}) - err = registerer.Register(agentsUserLatenciesGauge) + Name: "connections", + Help: "Agent connections with statuses.", + }, []string{"agent_name", "username", "workspace_name", "status", "lifecycle_state", "tailnet_node"}) + err = registerer.Register(agentsConnectionGauge) if err != nil { return nil, err } - // FIXME connection_type ide + agentsConnectionLatenciesGauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: "coderd", + Subsystem: "agents", + Name: "connection_latencies_seconds", + Help: "Agent connection latencies in seconds.", + }, []string{"agent_id", "username", "workspace_name", "derp_region", "preferred"}) + err = registerer.Register(agentsConnectionLatenciesGauge) + if err != nil { + return nil, err + } - ctx, cancelFunc := 
context.WithCancel(ctx) + // nolint:gocritic // Prometheus must collect metrics for all Coder users. + ctx, cancelFunc := context.WithCancel(dbauthz.AsSystemRestricted(ctx)) ticker := time.NewTicker(duration) go func() { defer ticker.Stop() for { - log.Println("Agents!!!") - select { case <-ctx.Done(): return case <-ticker.C: } - // FIXME Optimize this routine: SQL db calls + logger.Info(ctx, "Collect agent metrics now") - builds, err := db.GetLatestWorkspaceBuilds(ctx) + workspaceRows, err := db.GetWorkspaces(ctx, database.GetWorkspacesParams{ + AgentInactiveDisconnectTimeoutSeconds: int64(agentInactiveDisconnectTimeout.Seconds()), + }) if err != nil { - log.Println("1", err) + logger.Error(ctx, "can't get workspace rows", slog.Error(err)) continue } + workspaceAgentsGauge.Reset() agentsConnectionGauge.Reset() - agentsUserLatenciesGauge.Reset() - for _, build := range builds { - workspace, err := db.GetWorkspaceByID(ctx, build.WorkspaceID) + agentsConnectionLatenciesGauge.Reset() + + for _, workspace := range workspaceRows { + user, err := db.GetUserByID(ctx, workspace.OwnerID) if err != nil { - log.Println("2", err) + logger.Error(ctx, "can't get user", slog.Error(err), slog.F("user_id", workspace.OwnerID)) + workspaceAgentsGauge.WithLabelValues(user.Username, workspace.Name).Add(0) continue } - agents, err := db.GetWorkspaceAgentsInLatestBuildByWorkspaceID(ctx, build.WorkspaceID) + agents, err := db.GetWorkspaceAgentsInLatestBuildByWorkspaceID(ctx, workspace.ID) if err != nil { - log.Println("3", err) + logger.Error(ctx, "can't get workspace agents", slog.F("workspace_name", workspace.Name), slog.Error(err)) + workspaceAgentsGauge.WithLabelValues(user.Username, workspace.Name).Add(0) continue } if len(agents) == 0 { + logger.Info(ctx, "workspace agents are unavailable", slog.F("workspace_name", workspace.Name)) + workspaceAgentsGauge.WithLabelValues(user.Username, workspace.Name).Add(0) continue } - // FIXME publish workspace even if no agents - for _, agent := range agents { - connectionStatus := agent.Status(6 * time.Second) - - // FIXME AgentInactiveDisconnectTimeout - // ? connection_timeout_seconds - // obok latency lifecycle_state - log.Println("with value " + agent.Name) - agentsConnectionGauge.WithLabelValues(agent.Name, workspace.Name, string(connectionStatus.Status)).Set(1) + // Collect information about agents + workspaceAgentsGauge.WithLabelValues(user.Username, workspace.Name).Add(1) + connectionStatus := agent.Status(agentInactiveDisconnectTimeout) node := (*coordinator.Load()).Node(agent.ID) + + tailnetNode := "unknown" if node != nil { - log.Println("coordinator") + tailnetNode = node.ID.String() + } - for rawRegion, latency := range node.DERPLatency { - log.Println(rawRegion, latency) + agentsConnectionGauge.WithLabelValues(agent.Name, user.Username, workspace.Name, string(connectionStatus.Status), string(agent.LifecycleState), tailnetNode).Set(1) - regionParts := strings.SplitN(rawRegion, "-", 2) - regionID, err := strconv.Atoi(regionParts[0]) - if err != nil { - continue // xerrors.Errorf("convert derp region id %q: %w", rawRegion, err) - } - region, found := derpMap.Regions[regionID] - if !found { - // It's possible that a workspace agent is using an old DERPMap - // and reports regions that do not exist. If that's the case, - // report the region as unknown! 
- region = &tailcfg.DERPRegion{ - RegionID: regionID, - RegionName: fmt.Sprintf("Unnamed %d", regionID), - } - } + if node == nil { + logger.Info(ctx, "can't read in-memory node for agent", slog.F("workspace_name", workspace.Name), slog.F("agent_name", agent.Name)) + continue + } - log.Println(region, latency) - agentsUserLatenciesGauge.WithLabelValues(agent.Name, workspace.Name, region.RegionName, fmt.Sprintf("%v", node.PreferredDERP == regionID)).Set(latency) + // Collect information about connection latencies + for rawRegion, latency := range node.DERPLatency { + regionParts := strings.SplitN(rawRegion, "-", 2) + regionID, err := strconv.Atoi(regionParts[0]) + if err != nil { + logger.Error(ctx, "can't convert DERP region", slog.Error(err), slog.F("agent_name", agent.Name), slog.F("raw_region", rawRegion)) + continue } - } else { - log.Println("node is null") + region, found := derpMap.Regions[regionID] + if !found { + // It's possible that a workspace agent is using an old DERPMap + // and reports regions that do not exist. If that's the case, + // report the region as unknown! + region = &tailcfg.DERPRegion{ + RegionID: regionID, + RegionName: fmt.Sprintf("Unnamed %d", regionID), + } + } + + agentsConnectionLatenciesGauge.WithLabelValues(agent.Name, user.Username, workspace.Name, region.RegionName, fmt.Sprintf("%v", node.PreferredDERP == regionID)).Set(latency) } - // FIXME publish agent even if DERP is missing // FIXME IDE? - // FIXME agent connection zero + // FIXME connection_type ide } } } From 663b5d5d7a78fce6f6d99943fc30759684a36618 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Wed, 5 Apr 2023 12:31:06 +0200 Subject: [PATCH 05/15] fix --- coderd/prometheusmetrics/prometheusmetrics.go | 93 ++++++++++++------- 1 file changed, 57 insertions(+), 36 deletions(-) diff --git a/coderd/prometheusmetrics/prometheusmetrics.go b/coderd/prometheusmetrics/prometheusmetrics.go index 31a0b712a1511..4e4d828981bc6 100644 --- a/coderd/prometheusmetrics/prometheusmetrics.go +++ b/coderd/prometheusmetrics/prometheusmetrics.go @@ -122,24 +122,24 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis duration = 15 * time.Second // TODO 5 * time.Minute } - workspaceAgentsGauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{ + agentsGauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: "coderd", Subsystem: "agents", Name: "up", Help: "The number of active agents per workspace.", }, []string{"username", "workspace_name"}) - err := registerer.Register(workspaceAgentsGauge) + err := registerer.Register(agentsGauge) if err != nil { return nil, err } - agentsConnectionGauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{ + agentsConnectionsGauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: "coderd", Subsystem: "agents", Name: "connections", Help: "Agent connections with statuses.", }, []string{"agent_name", "username", "workspace_name", "status", "lifecycle_state", "tailnet_node"}) - err = registerer.Register(agentsConnectionGauge) + err = registerer.Register(agentsConnectionsGauge) if err != nil { return nil, err } @@ -155,6 +155,17 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis return nil, err } + agentsAppsGauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: "coderd", + Subsystem: "agents", + Name: "apps", + Help: "Agent applications with statuses.", + }, []string{"agent_name", "username", "workspace_name", "app_name", "health"}) + err = registerer.Register(agentsAppsGauge) + if err != nil { + 
return nil, err + } + // nolint:gocritic // Prometheus must collect metrics for all Coder users. ctx, cancelFunc := context.WithCancel(dbauthz.AsSystemRestricted(ctx)) ticker := time.NewTicker(duration) @@ -167,7 +178,7 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis case <-ticker.C: } - logger.Info(ctx, "Collect agent metrics now") + logger.Debug(ctx, "Collect agent metrics now") workspaceRows, err := db.GetWorkspaces(ctx, database.GetWorkspacesParams{ AgentInactiveDisconnectTimeoutSeconds: int64(agentInactiveDisconnectTimeout.Seconds()), @@ -177,34 +188,35 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis continue } - workspaceAgentsGauge.Reset() - agentsConnectionGauge.Reset() + agentsGauge.Reset() + agentsConnectionsGauge.Reset() agentsConnectionLatenciesGauge.Reset() + agentsAppsGauge.Reset() for _, workspace := range workspaceRows { user, err := db.GetUserByID(ctx, workspace.OwnerID) if err != nil { - logger.Error(ctx, "can't get user", slog.Error(err), slog.F("user_id", workspace.OwnerID)) - workspaceAgentsGauge.WithLabelValues(user.Username, workspace.Name).Add(0) + logger.Error(ctx, "can't get user", slog.F("user_id", workspace.OwnerID), slog.Error(err)) + agentsGauge.WithLabelValues(user.Username, workspace.Name).Add(0) continue } agents, err := db.GetWorkspaceAgentsInLatestBuildByWorkspaceID(ctx, workspace.ID) if err != nil { - logger.Error(ctx, "can't get workspace agents", slog.F("workspace_name", workspace.Name), slog.Error(err)) - workspaceAgentsGauge.WithLabelValues(user.Username, workspace.Name).Add(0) + logger.Error(ctx, "can't get workspace agents", slog.F("workspace_id", workspace.ID), slog.Error(err)) + agentsGauge.WithLabelValues(user.Username, workspace.Name).Add(0) continue } if len(agents) == 0 { - logger.Info(ctx, "workspace agents are unavailable", slog.F("workspace_name", workspace.Name)) - workspaceAgentsGauge.WithLabelValues(user.Username, workspace.Name).Add(0) + logger.Debug(ctx, "workspace agents are unavailable", slog.F("workspace_id", workspace.ID)) + agentsGauge.WithLabelValues(user.Username, workspace.Name).Add(0) continue } for _, agent := range agents { // Collect information about agents - workspaceAgentsGauge.WithLabelValues(user.Username, workspace.Name).Add(1) + agentsGauge.WithLabelValues(user.Username, workspace.Name).Add(1) connectionStatus := agent.Status(agentInactiveDisconnectTimeout) node := (*coordinator.Load()).Node(agent.ID) @@ -214,37 +226,46 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis tailnetNode = node.ID.String() } - agentsConnectionGauge.WithLabelValues(agent.Name, user.Username, workspace.Name, string(connectionStatus.Status), string(agent.LifecycleState), tailnetNode).Set(1) + agentsConnectionsGauge.WithLabelValues(agent.Name, user.Username, workspace.Name, string(connectionStatus.Status), string(agent.LifecycleState), tailnetNode).Set(1) if node == nil { - logger.Info(ctx, "can't read in-memory node for agent", slog.F("workspace_name", workspace.Name), slog.F("agent_name", agent.Name)) + logger.Debug(ctx, "can't read in-memory node for agent", slog.F("agent_id", agent.ID)) continue - } + } else { + // Collect information about connection latencies + for rawRegion, latency := range node.DERPLatency { + regionParts := strings.SplitN(rawRegion, "-", 2) + regionID, err := strconv.Atoi(regionParts[0]) + if err != nil { + logger.Error(ctx, "can't convert DERP region", slog.F("agent_id", agent.ID), slog.F("raw_region", 
rawRegion), slog.Error(err)) + continue + } - // Collect information about connection latencies - for rawRegion, latency := range node.DERPLatency { - regionParts := strings.SplitN(rawRegion, "-", 2) - regionID, err := strconv.Atoi(regionParts[0]) - if err != nil { - logger.Error(ctx, "can't convert DERP region", slog.Error(err), slog.F("agent_name", agent.Name), slog.F("raw_region", rawRegion)) - continue - } - region, found := derpMap.Regions[regionID] - if !found { - // It's possible that a workspace agent is using an old DERPMap - // and reports regions that do not exist. If that's the case, - // report the region as unknown! - region = &tailcfg.DERPRegion{ - RegionID: regionID, - RegionName: fmt.Sprintf("Unnamed %d", regionID), + region, found := derpMap.Regions[regionID] + if !found { + // It's possible that a workspace agent is using an old DERPMap + // and reports regions that do not exist. If that's the case, + // report the region as unknown! + region = &tailcfg.DERPRegion{ + RegionID: regionID, + RegionName: fmt.Sprintf("Unnamed %d", regionID), + } } + + agentsConnectionLatenciesGauge.WithLabelValues(agent.Name, user.Username, workspace.Name, region.RegionName, fmt.Sprintf("%v", node.PreferredDERP == regionID)).Set(latency) } + } - agentsConnectionLatenciesGauge.WithLabelValues(agent.Name, user.Username, workspace.Name, region.RegionName, fmt.Sprintf("%v", node.PreferredDERP == regionID)).Set(latency) + // Collect information about registered applications + apps, err := db.GetWorkspaceAppsByAgentID(ctx, agent.ID) + if err != nil { + logger.Error(ctx, "can't get workspace apps", slog.F("agent_id", agent.ID), slog.Error(err)) + continue } - // FIXME IDE? - // FIXME connection_type ide + for _, app := range apps { + agentsAppsGauge.WithLabelValues(agent.Name, user.Username, workspace.Name, app.DisplayName, string(app.Health)).Add(1) + } } } } From 63aff5ebf2f2c0695c1289fa557b9bd5fbfdb80f Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Wed, 5 Apr 2023 12:32:49 +0200 Subject: [PATCH 06/15] 1min --- coderd/prometheusmetrics/prometheusmetrics.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/coderd/prometheusmetrics/prometheusmetrics.go b/coderd/prometheusmetrics/prometheusmetrics.go index 4e4d828981bc6..a9dc1933addc1 100644 --- a/coderd/prometheusmetrics/prometheusmetrics.go +++ b/coderd/prometheusmetrics/prometheusmetrics.go @@ -119,7 +119,7 @@ func Workspaces(ctx context.Context, registerer prometheus.Registerer, db databa // Agents tracks the total number of workspaces with labels on status. 
func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Registerer, db database.Store, coordinator *atomic.Pointer[tailnet.Coordinator], derpMap *tailcfg.DERPMap, agentInactiveDisconnectTimeout, duration time.Duration) (context.CancelFunc, error) { if duration == 0 { - duration = 15 * time.Second // TODO 5 * time.Minute + duration = 1 * time.Minute } agentsGauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{ From 390548159b6fc8cf9d9f10b02e276384ff974b7d Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Wed, 5 Apr 2023 12:45:16 +0200 Subject: [PATCH 07/15] fix --- coderd/prometheusmetrics/prometheusmetrics.go | 1 - 1 file changed, 1 deletion(-) diff --git a/coderd/prometheusmetrics/prometheusmetrics.go b/coderd/prometheusmetrics/prometheusmetrics.go index a9dc1933addc1..5dddd4d20e388 100644 --- a/coderd/prometheusmetrics/prometheusmetrics.go +++ b/coderd/prometheusmetrics/prometheusmetrics.go @@ -230,7 +230,6 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis if node == nil { logger.Debug(ctx, "can't read in-memory node for agent", slog.F("agent_id", agent.ID)) - continue } else { // Collect information about connection latencies for rawRegion, latency := range node.DERPLatency { From f8d6f464fe93d3f7799463be9c6e0c9c6c6f5135 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Wed, 5 Apr 2023 13:35:45 +0200 Subject: [PATCH 08/15] WIP --- .../prometheusmetrics_test.go | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/coderd/prometheusmetrics/prometheusmetrics_test.go b/coderd/prometheusmetrics/prometheusmetrics_test.go index a0b375ccf8622..7b828e0514fd8 100644 --- a/coderd/prometheusmetrics/prometheusmetrics_test.go +++ b/coderd/prometheusmetrics/prometheusmetrics_test.go @@ -6,16 +6,22 @@ import ( "testing" "time" + "sync/atomic" + "github.com/google/uuid" "github.com/prometheus/client_golang/prometheus" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "cdr.dev/slog/sloggers/slogtest" + "github.com/coder/coder/coderd/database" "github.com/coder/coder/coderd/database/dbfake" "github.com/coder/coder/coderd/database/dbgen" "github.com/coder/coder/coderd/prometheusmetrics" "github.com/coder/coder/codersdk" + "github.com/coder/coder/tailnet" + "github.com/coder/coder/tailnet/tailnettest" "github.com/coder/coder/testutil" ) @@ -239,3 +245,37 @@ func TestWorkspaces(t *testing.T) { }) } } + +func TestAgents(t *testing.T) { + t.Parallel() + + // given + db := dbfake.New() + + coordinator := tailnet.NewCoordinator() + coordinatorPtr := atomic.Pointer[tailnet.Coordinator]{} + coordinatorPtr.Store(&coordinator) + derpMap := tailnettest.RunDERPAndSTUN(t) + agentInactiveDisconnectTimeout := 1 * time.Hour + registry := prometheus.NewRegistry() + + // when + cancel, err := prometheusmetrics.Agents(context.Background(), slogtest.Make(t, nil), registry, db, &coordinatorPtr, derpMap, agentInactiveDisconnectTimeout, time.Millisecond) + t.Cleanup(cancel) + + // then + require.NoError(t, err) + require.Eventually(t, func() bool { + metrics, err := registry.Gather() + assert.NoError(t, err) + + if len(metrics) < 1 { + return false + } + + for _, metric := range metrics[0].Metric { + panic(metric) + } + return true + }, testutil.WaitShort, testutil.IntervalFast) +} From d487a77372e703abb9be64b38f4a8f8366bddf74 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Wed, 5 Apr 2023 14:18:06 +0200 Subject: [PATCH 09/15] Test --- coderd/prometheusmetrics/prometheusmetrics.go | 4 +- .../prometheusmetrics_test.go | 83 
+++++++++++++++++-- 2 files changed, 80 insertions(+), 7 deletions(-) diff --git a/coderd/prometheusmetrics/prometheusmetrics.go b/coderd/prometheusmetrics/prometheusmetrics.go index 5dddd4d20e388..222809db00bf8 100644 --- a/coderd/prometheusmetrics/prometheusmetrics.go +++ b/coderd/prometheusmetrics/prometheusmetrics.go @@ -2,6 +2,8 @@ package prometheusmetrics import ( "context" + "database/sql" + "errors" "fmt" "strconv" "strings" @@ -257,7 +259,7 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis // Collect information about registered applications apps, err := db.GetWorkspaceAppsByAgentID(ctx, agent.ID) - if err != nil { + if err != nil && !errors.Is(err, sql.ErrNoRows) { logger.Error(ctx, "can't get workspace apps", slog.F("agent_id", agent.ID), slog.Error(err)) continue } diff --git a/coderd/prometheusmetrics/prometheusmetrics_test.go b/coderd/prometheusmetrics/prometheusmetrics_test.go index 7b828e0514fd8..e7829d8a34a18 100644 --- a/coderd/prometheusmetrics/prometheusmetrics_test.go +++ b/coderd/prometheusmetrics/prometheusmetrics_test.go @@ -15,11 +15,14 @@ import ( "cdr.dev/slog/sloggers/slogtest" + "github.com/coder/coder/coderd/coderdtest" "github.com/coder/coder/coderd/database" "github.com/coder/coder/coderd/database/dbfake" "github.com/coder/coder/coderd/database/dbgen" "github.com/coder/coder/coderd/prometheusmetrics" "github.com/coder/coder/codersdk" + "github.com/coder/coder/provisioner/echo" + "github.com/coder/coder/provisionersdk/proto" "github.com/coder/coder/tailnet" "github.com/coder/coder/tailnet/tailnettest" "github.com/coder/coder/testutil" @@ -249,14 +252,53 @@ func TestWorkspaces(t *testing.T) { func TestAgents(t *testing.T) { t.Parallel() - // given - db := dbfake.New() + // Build a sample workspace with test agent and fake application + client, _, api := coderdtest.NewWithAPI(t, &coderdtest.Options{IncludeProvisionerDaemon: true}) + db := api.Database + + user := coderdtest.CreateFirstUser(t, client) + version := coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, &echo.Responses{ + Parse: echo.ParseComplete, + ProvisionPlan: echo.ProvisionComplete, + ProvisionApply: []*proto.Provision_Response{{ + Type: &proto.Provision_Response_Complete{ + Complete: &proto.Provision_Complete{ + Resources: []*proto.Resource{{ + Name: "example", + Type: "aws_instance", + Agents: []*proto.Agent{{ + Id: uuid.NewString(), + Name: "testagent", + Directory: t.TempDir(), + Auth: &proto.Agent_Token{ + Token: uuid.NewString(), + }, + Apps: []*proto.App{ + { + Slug: "fake-app", + DisplayName: "Fake application", + SharingLevel: proto.AppSharingLevel_OWNER, + // Hopefully this IP and port doesn't exist. 
+ Url: "http://127.1.0.1:65535", + }, + }, + }}, + }}, + }, + }, + }}, + }) + template := coderdtest.CreateTemplate(t, client, user.OrganizationID, version.ID) + coderdtest.AwaitTemplateVersionJob(t, client, version.ID) + workspace := coderdtest.CreateWorkspace(t, client, user.OrganizationID, template.ID) + coderdtest.AwaitWorkspaceBuildJob(t, client, workspace.LatestBuild.ID) + // given coordinator := tailnet.NewCoordinator() coordinatorPtr := atomic.Pointer[tailnet.Coordinator]{} coordinatorPtr.Store(&coordinator) derpMap := tailnettest.RunDERPAndSTUN(t) - agentInactiveDisconnectTimeout := 1 * time.Hour + agentInactiveDisconnectTimeout := 1 * time.Hour // don't need to focus on this value in tests registry := prometheus.NewRegistry() // when @@ -265,6 +307,10 @@ func TestAgents(t *testing.T) { // then require.NoError(t, err) + + var agentsUp bool + var agentsConnections bool + var agentsApps bool require.Eventually(t, func() bool { metrics, err := registry.Gather() assert.NoError(t, err) @@ -273,9 +319,34 @@ func TestAgents(t *testing.T) { return false } - for _, metric := range metrics[0].Metric { - panic(metric) + for _, metric := range metrics { + switch metric.GetName() { + case "coderd_agents_up": + assert.Equal(t, "testuser", metric.Metric[0].Label[0].GetValue()) // Username + assert.Equal(t, workspace.Name, metric.Metric[0].Label[1].GetValue()) // Workspace name + assert.Equal(t, 1, int(metric.Metric[0].Gauge.GetValue())) // Metric value + agentsUp = true + case "coderd_agents_connections": + assert.Equal(t, "testagent", metric.Metric[0].Label[0].GetValue()) // Agent name + assert.Equal(t, "created", metric.Metric[0].Label[1].GetValue()) // Lifecycle state + assert.Equal(t, "connecting", metric.Metric[0].Label[2].GetValue()) // Status + assert.Equal(t, "unknown", metric.Metric[0].Label[3].GetValue()) // Tailnet node + assert.Equal(t, "testuser", metric.Metric[0].Label[4].GetValue()) // Username + assert.Equal(t, workspace.Name, metric.Metric[0].Label[5].GetValue()) // Workspace name + assert.Equal(t, 1, int(metric.Metric[0].Gauge.GetValue())) // Metric value + agentsConnections = true + case "coderd_agents_apps": + assert.Equal(t, "testagent", metric.Metric[0].Label[0].GetValue()) // Agent name + assert.Equal(t, "Fake application", metric.Metric[0].Label[1].GetValue()) // App name + assert.Equal(t, "disabled", metric.Metric[0].Label[2].GetValue()) // Health + assert.Equal(t, "testuser", metric.Metric[0].Label[3].GetValue()) // Username + assert.Equal(t, workspace.Name, metric.Metric[0].Label[4].GetValue()) // Workspace name + assert.Equal(t, 1, int(metric.Metric[0].Gauge.GetValue())) // Metric value + agentsApps = true + default: + require.FailNowf(t, "unexpected metric collected", "metric: %s", metric.GetName()) + } } - return true + return agentsUp && agentsConnections && agentsApps }, testutil.WaitShort, testutil.IntervalFast) } From 7acbaf09ac86a9179d75949588099fecb708b3ad Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Wed, 5 Apr 2023 14:19:39 +0200 Subject: [PATCH 10/15] docs --- docs/admin/prometheus.md | 4 ++++ scripts/metricsdocgen/metrics | 20 ++++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/docs/admin/prometheus.md b/docs/admin/prometheus.md index f35ba5d1c5182..e6b23a12702f4 100644 --- a/docs/admin/prometheus.md +++ b/docs/admin/prometheus.md @@ -31,6 +31,10 @@ The environment variable `CODER_PROMETHEUS_ENABLE` will be enabled automatically | Name | Type | Description | Labels | | -------------------------------------------- | --------- | 
------------------------------------------------------------------ | ----------------------------------------------------------------------------------- | +| `coderd_agents_apps` | gauge | Agent applications with statuses. | `agent_name` `app_name` `health` `username` `workspace_name` | +| `coderd_agents_connection_latencies_seconds` | gauge | Agent connection latencies in seconds. | `agent_id` `derp_region` `preferred` `username` `workspace_name` | +| `coderd_agents_connections` | gauge | Agent connections with statuses. | `agent_name` `lifecycle_state` `status` `tailnet_node` `username` `workspace_name` | +| `coderd_agents_up` | gauge | The number of active agents per workspace. | `username` `workspace_name` | | `coderd_api_active_users_duration_hour` | gauge | The number of users that have been active within the last hour. | | | `coderd_api_concurrent_requests` | gauge | The number of concurrent API requests. | | | `coderd_api_concurrent_websockets` | gauge | The total number of concurrent API websockets. | | diff --git a/scripts/metricsdocgen/metrics b/scripts/metricsdocgen/metrics index 50bbc87990dda..9a5fc20dff8e1 100644 --- a/scripts/metricsdocgen/metrics +++ b/scripts/metricsdocgen/metrics @@ -1,3 +1,23 @@ +# HELP coderd_agents_apps Agent applications with statuses. +# TYPE coderd_agents_apps gauge +coderd_agents_apps{agent_name="main",app_name="code-server",health="healthy",username="admin",workspace_name="workspace-1"} 1 +coderd_agents_apps{agent_name="main",app_name="code-server",health="healthy",username="admin",workspace_name="workspace-2"} 1 +coderd_agents_apps{agent_name="main",app_name="code-server",health="healthy",username="admin",workspace_name="workspace-3"} 1 +# HELP coderd_agents_connection_latencies_seconds Agent connection latencies in seconds. +# TYPE coderd_agents_connection_latencies_seconds gauge +coderd_agents_connection_latencies_seconds{agent_id="main",derp_region="Coder Embedded Relay",preferred="true",username="admin",workspace_name="workspace-1"} 0.03018125 +coderd_agents_connection_latencies_seconds{agent_id="main",derp_region="Coder Embedded Relay",preferred="true",username="admin",workspace_name="workspace-2"} 0.028658416 +coderd_agents_connection_latencies_seconds{agent_id="main",derp_region="Coder Embedded Relay",preferred="true",username="admin",workspace_name="workspace-3"} 0.028041416 +# HELP coderd_agents_connections Agent connections with statuses. +# TYPE coderd_agents_connections gauge +coderd_agents_connections{agent_name="main",lifecycle_state="ready",status="connected",tailnet_node="nodeid:16966f7df70d8cc5",username="admin",workspace_name="workspace-3"} 1 +coderd_agents_connections{agent_name="main",lifecycle_state="start_timeout",status="connected",tailnet_node="nodeid:3237d00938be23e3",username="admin",workspace_name="workspace-2"} 1 +coderd_agents_connections{agent_name="main",lifecycle_state="start_timeout",status="connected",tailnet_node="nodeid:3779bd45d00be0eb",username="admin",workspace_name="workspace-1"} 1 +# HELP coderd_agents_up The number of active agents per workspace. +# TYPE coderd_agents_up gauge +coderd_agents_up{username="admin",workspace_name="workspace-1"} 1 +coderd_agents_up{username="admin",workspace_name="workspace-2"} 1 +coderd_agents_up{username="admin",workspace_name="workspace-3"} 1 # HELP coderd_api_websocket_durations_seconds Websocket duration distribution of requests in seconds. 
# TYPE coderd_api_websocket_durations_seconds histogram coderd_api_websocket_durations_seconds_bucket{path="/api/v2/workspaceagents/me/coordinate",le="0.001"} 0 From 7418779f7b202e2df48e0fbca70906488d6db9ca Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Wed, 5 Apr 2023 14:26:14 +0200 Subject: [PATCH 11/15] fmt --- coderd/prometheusmetrics/prometheusmetrics_test.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/coderd/prometheusmetrics/prometheusmetrics_test.go b/coderd/prometheusmetrics/prometheusmetrics_test.go index e7829d8a34a18..f7c5fc689e937 100644 --- a/coderd/prometheusmetrics/prometheusmetrics_test.go +++ b/coderd/prometheusmetrics/prometheusmetrics_test.go @@ -3,11 +3,10 @@ package prometheusmetrics_test import ( "context" "database/sql" + "sync/atomic" "testing" "time" - "sync/atomic" - "github.com/google/uuid" "github.com/prometheus/client_golang/prometheus" "github.com/stretchr/testify/assert" From 3a8e4e6ddf9df8987858f8c30ac8c7f1d10c5495 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Thu, 6 Apr 2023 11:52:01 +0200 Subject: [PATCH 12/15] Add timer to measure the metrics collection --- coderd/prometheusmetrics/prometheusmetrics.go | 18 ++- .../prometheusmetrics_test.go | 5 +- docs/admin/prometheus.md | 105 +++++++++--------- scripts/metricsdocgen/metrics | 16 +++ 4 files changed, 90 insertions(+), 54 deletions(-) diff --git a/coderd/prometheusmetrics/prometheusmetrics.go b/coderd/prometheusmetrics/prometheusmetrics.go index 222809db00bf8..94c47230a4579 100644 --- a/coderd/prometheusmetrics/prometheusmetrics.go +++ b/coderd/prometheusmetrics/prometheusmetrics.go @@ -168,6 +168,18 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis return nil, err } + metricsCollectorAgents := prometheus.NewHistogram(prometheus.HistogramOpts{ + Namespace: "coderd", + Subsystem: "prometheusmetrics", + Name: "agents_execution_seconds", + Help: "Histogram for duration of agents metrics collection in seconds.", + Buckets: []float64{0.001, 0.005, 0.010, 0.025, 0.050, 0.100, 0.500, 1, 5, 10, 30}, + }) + err = registerer.Register(metricsCollectorAgents) + if err != nil { + return nil, err + } + // nolint:gocritic // Prometheus must collect metrics for all Coder users. 
ctx, cancelFunc := context.WithCancel(dbauthz.AsSystemRestricted(ctx)) ticker := time.NewTicker(duration) @@ -180,7 +192,8 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis case <-ticker.C: } - logger.Debug(ctx, "Collect agent metrics now") + logger.Debug(ctx, "Agent metrics collection is starting") + timer := prometheus.NewTimer(metricsCollectorAgents) workspaceRows, err := db.GetWorkspaces(ctx, database.GetWorkspacesParams{ AgentInactiveDisconnectTimeoutSeconds: int64(agentInactiveDisconnectTimeout.Seconds()), @@ -269,6 +282,9 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis } } } + + logger.Debug(ctx, "Agent metrics collection is done") + metricsCollectorAgents.Observe(timer.ObserveDuration().Seconds()) } }() return cancelFunc, nil diff --git a/coderd/prometheusmetrics/prometheusmetrics_test.go b/coderd/prometheusmetrics/prometheusmetrics_test.go index f7c5fc689e937..e765c5f2a1128 100644 --- a/coderd/prometheusmetrics/prometheusmetrics_test.go +++ b/coderd/prometheusmetrics/prometheusmetrics_test.go @@ -310,6 +310,7 @@ func TestAgents(t *testing.T) { var agentsUp bool var agentsConnections bool var agentsApps bool + var agentsExecutionInSeconds bool require.Eventually(t, func() bool { metrics, err := registry.Gather() assert.NoError(t, err) @@ -342,10 +343,12 @@ func TestAgents(t *testing.T) { assert.Equal(t, workspace.Name, metric.Metric[0].Label[4].GetValue()) // Workspace name assert.Equal(t, 1, int(metric.Metric[0].Gauge.GetValue())) // Metric value agentsApps = true + case "coderd_prometheusmetrics_agents_execution_seconds": + agentsExecutionInSeconds = true default: require.FailNowf(t, "unexpected metric collected", "metric: %s", metric.GetName()) } } - return agentsUp && agentsConnections && agentsApps + return agentsUp && agentsConnections && agentsApps && agentsExecutionInSeconds }, testutil.WaitShort, testutil.IntervalFast) } diff --git a/docs/admin/prometheus.md b/docs/admin/prometheus.md index e6b23a12702f4..2898f8f4a469c 100644 --- a/docs/admin/prometheus.md +++ b/docs/admin/prometheus.md @@ -29,57 +29,58 @@ The environment variable `CODER_PROMETHEUS_ENABLE` will be enabled automatically -| Name | Type | Description | Labels | -| -------------------------------------------- | --------- | ------------------------------------------------------------------ | ----------------------------------------------------------------------------------- | -| `coderd_agents_apps` | gauge | Agent applications with statuses. | `agent_name` `app_name` `health` `username` `workspace_name` | -| `coderd_agents_connection_latencies_seconds` | gauge | Agent connection latencies in seconds. | `agent_id` `derp_region` `preferred` `username` `workspace_name` | -| `coderd_agents_connections` | gauge | Agent connections with statuses. | `agent_name` `lifecycle_state` `status` `tailnet_node` `username` `workspace_name` | -| `coderd_agents_up` | gauge | The number of active agents per workspace. | `username` `workspace_name` | -| `coderd_api_active_users_duration_hour` | gauge | The number of users that have been active within the last hour. | | -| `coderd_api_concurrent_requests` | gauge | The number of concurrent API requests. | | -| `coderd_api_concurrent_websockets` | gauge | The total number of concurrent API websockets. | | -| `coderd_api_request_latencies_seconds` | histogram | Latency distribution of requests in seconds. 
| `method` `path` | -| `coderd_api_requests_processed_total` | counter | The total number of processed API requests | `code` `method` `path` | -| `coderd_api_websocket_durations_seconds` | histogram | Websocket duration distribution of requests in seconds. | `path` | -| `coderd_api_workspace_latest_build_total` | gauge | The latest workspace builds with a status. | `status` | -| `coderd_provisionerd_job_timings_seconds` | histogram | The provisioner job time duration in seconds. | `provisioner` `status` | -| `coderd_provisionerd_jobs_current` | gauge | The number of currently running provisioner jobs. | `provisioner` | -| `coderd_workspace_builds_total` | counter | The number of workspaces started, updated, or deleted. | `action` `owner_email` `status` `template_name` `template_version` `workspace_name` | -| `go_gc_duration_seconds` | summary | A summary of the pause duration of garbage collection cycles. | | -| `go_goroutines` | gauge | Number of goroutines that currently exist. | | -| `go_info` | gauge | Information about the Go environment. | `version` | -| `go_memstats_alloc_bytes` | gauge | Number of bytes allocated and still in use. | | -| `go_memstats_alloc_bytes_total` | counter | Total number of bytes allocated, even if freed. | | -| `go_memstats_buck_hash_sys_bytes` | gauge | Number of bytes used by the profiling bucket hash table. | | -| `go_memstats_frees_total` | counter | Total number of frees. | | -| `go_memstats_gc_sys_bytes` | gauge | Number of bytes used for garbage collection system metadata. | | -| `go_memstats_heap_alloc_bytes` | gauge | Number of heap bytes allocated and still in use. | | -| `go_memstats_heap_idle_bytes` | gauge | Number of heap bytes waiting to be used. | | -| `go_memstats_heap_inuse_bytes` | gauge | Number of heap bytes that are in use. | | -| `go_memstats_heap_objects` | gauge | Number of allocated objects. | | -| `go_memstats_heap_released_bytes` | gauge | Number of heap bytes released to OS. | | -| `go_memstats_heap_sys_bytes` | gauge | Number of heap bytes obtained from system. | | -| `go_memstats_last_gc_time_seconds` | gauge | Number of seconds since 1970 of last garbage collection. | | -| `go_memstats_lookups_total` | counter | Total number of pointer lookups. | | -| `go_memstats_mallocs_total` | counter | Total number of mallocs. | | -| `go_memstats_mcache_inuse_bytes` | gauge | Number of bytes in use by mcache structures. | | -| `go_memstats_mcache_sys_bytes` | gauge | Number of bytes used for mcache structures obtained from system. | | -| `go_memstats_mspan_inuse_bytes` | gauge | Number of bytes in use by mspan structures. | | -| `go_memstats_mspan_sys_bytes` | gauge | Number of bytes used for mspan structures obtained from system. | | -| `go_memstats_next_gc_bytes` | gauge | Number of heap bytes when next garbage collection will take place. | | -| `go_memstats_other_sys_bytes` | gauge | Number of bytes used for other system allocations. | | -| `go_memstats_stack_inuse_bytes` | gauge | Number of bytes in use by the stack allocator. | | -| `go_memstats_stack_sys_bytes` | gauge | Number of bytes obtained from system for stack allocator. | | -| `go_memstats_sys_bytes` | gauge | Number of bytes obtained from system. | | -| `go_threads` | gauge | Number of OS threads created. | | -| `process_cpu_seconds_total` | counter | Total user and system CPU time spent in seconds. | | -| `process_max_fds` | gauge | Maximum number of open file descriptors. | | -| `process_open_fds` | gauge | Number of open file descriptors. 
| | -| `process_resident_memory_bytes` | gauge | Resident memory size in bytes. | | -| `process_start_time_seconds` | gauge | Start time of the process since unix epoch in seconds. | | -| `process_virtual_memory_bytes` | gauge | Virtual memory size in bytes. | | -| `process_virtual_memory_max_bytes` | gauge | Maximum amount of virtual memory available in bytes. | | -| `promhttp_metric_handler_requests_in_flight` | gauge | Current number of scrapes being served. | | -| `promhttp_metric_handler_requests_total` | counter | Total number of scrapes by HTTP status code. | `code` | +| Name | Type | Description | Labels | +| --------------------------------------------------- | --------- | ------------------------------------------------------------------ | ----------------------------------------------------------------------------------- | +| `coderd_agents_apps` | gauge | Agent applications with statuses. | `agent_name` `app_name` `health` `username` `workspace_name` | +| `coderd_agents_connection_latencies_seconds` | gauge | Agent connection latencies in seconds. | `agent_id` `derp_region` `preferred` `username` `workspace_name` | +| `coderd_agents_connections` | gauge | Agent connections with statuses. | `agent_name` `lifecycle_state` `status` `tailnet_node` `username` `workspace_name` | +| `coderd_agents_up` | gauge | The number of active agents per workspace. | `username` `workspace_name` | +| `coderd_api_active_users_duration_hour` | gauge | The number of users that have been active within the last hour. | | +| `coderd_api_concurrent_requests` | gauge | The number of concurrent API requests. | | +| `coderd_api_concurrent_websockets` | gauge | The total number of concurrent API websockets. | | +| `coderd_api_request_latencies_seconds` | histogram | Latency distribution of requests in seconds. | `method` `path` | +| `coderd_api_requests_processed_total` | counter | The total number of processed API requests | `code` `method` `path` | +| `coderd_api_websocket_durations_seconds` | histogram | Websocket duration distribution of requests in seconds. | `path` | +| `coderd_api_workspace_latest_build_total` | gauge | The latest workspace builds with a status. | `status` | +| `coderd_metrics_collector_agents_execution_seconds` | histogram | Histogram for duration of agents metrics collection in seconds. | | +| `coderd_provisionerd_job_timings_seconds` | histogram | The provisioner job time duration in seconds. | `provisioner` `status` | +| `coderd_provisionerd_jobs_current` | gauge | The number of currently running provisioner jobs. | `provisioner` | +| `coderd_workspace_builds_total` | counter | The number of workspaces started, updated, or deleted. | `action` `owner_email` `status` `template_name` `template_version` `workspace_name` | +| `go_gc_duration_seconds` | summary | A summary of the pause duration of garbage collection cycles. | | +| `go_goroutines` | gauge | Number of goroutines that currently exist. | | +| `go_info` | gauge | Information about the Go environment. | `version` | +| `go_memstats_alloc_bytes` | gauge | Number of bytes allocated and still in use. | | +| `go_memstats_alloc_bytes_total` | counter | Total number of bytes allocated, even if freed. | | +| `go_memstats_buck_hash_sys_bytes` | gauge | Number of bytes used by the profiling bucket hash table. | | +| `go_memstats_frees_total` | counter | Total number of frees. | | +| `go_memstats_gc_sys_bytes` | gauge | Number of bytes used for garbage collection system metadata. 
| | +| `go_memstats_heap_alloc_bytes` | gauge | Number of heap bytes allocated and still in use. | | +| `go_memstats_heap_idle_bytes` | gauge | Number of heap bytes waiting to be used. | | +| `go_memstats_heap_inuse_bytes` | gauge | Number of heap bytes that are in use. | | +| `go_memstats_heap_objects` | gauge | Number of allocated objects. | | +| `go_memstats_heap_released_bytes` | gauge | Number of heap bytes released to OS. | | +| `go_memstats_heap_sys_bytes` | gauge | Number of heap bytes obtained from system. | | +| `go_memstats_last_gc_time_seconds` | gauge | Number of seconds since 1970 of last garbage collection. | | +| `go_memstats_lookups_total` | counter | Total number of pointer lookups. | | +| `go_memstats_mallocs_total` | counter | Total number of mallocs. | | +| `go_memstats_mcache_inuse_bytes` | gauge | Number of bytes in use by mcache structures. | | +| `go_memstats_mcache_sys_bytes` | gauge | Number of bytes used for mcache structures obtained from system. | | +| `go_memstats_mspan_inuse_bytes` | gauge | Number of bytes in use by mspan structures. | | +| `go_memstats_mspan_sys_bytes` | gauge | Number of bytes used for mspan structures obtained from system. | | +| `go_memstats_next_gc_bytes` | gauge | Number of heap bytes when next garbage collection will take place. | | +| `go_memstats_other_sys_bytes` | gauge | Number of bytes used for other system allocations. | | +| `go_memstats_stack_inuse_bytes` | gauge | Number of bytes in use by the stack allocator. | | +| `go_memstats_stack_sys_bytes` | gauge | Number of bytes obtained from system for stack allocator. | | +| `go_memstats_sys_bytes` | gauge | Number of bytes obtained from system. | | +| `go_threads` | gauge | Number of OS threads created. | | +| `process_cpu_seconds_total` | counter | Total user and system CPU time spent in seconds. | | +| `process_max_fds` | gauge | Maximum number of open file descriptors. | | +| `process_open_fds` | gauge | Number of open file descriptors. | | +| `process_resident_memory_bytes` | gauge | Resident memory size in bytes. | | +| `process_start_time_seconds` | gauge | Start time of the process since unix epoch in seconds. | | +| `process_virtual_memory_bytes` | gauge | Virtual memory size in bytes. | | +| `process_virtual_memory_max_bytes` | gauge | Maximum amount of virtual memory available in bytes. | | +| `promhttp_metric_handler_requests_in_flight` | gauge | Current number of scrapes being served. | | +| `promhttp_metric_handler_requests_total` | counter | Total number of scrapes by HTTP status code. | `code` | diff --git a/scripts/metricsdocgen/metrics b/scripts/metricsdocgen/metrics index 9a5fc20dff8e1..7e598b17abe56 100644 --- a/scripts/metricsdocgen/metrics +++ b/scripts/metricsdocgen/metrics @@ -588,6 +588,22 @@ coderd_api_requests_processed_total{code="401",method="POST",path="/api/v2/files # HELP coderd_api_workspace_latest_build_total The latest workspace builds with a status. # TYPE coderd_api_workspace_latest_build_total gauge coderd_api_workspace_latest_build_total{status="succeeded"} 1 +# HELP coderd_metrics_collector_agents_execution_seconds Histogram for duration of agents metrics collection in seconds. 
+# TYPE coderd_metrics_collector_agents_execution_seconds histogram +coderd_metrics_collector_agents_execution_seconds_bucket{le="0.001"} 0 +coderd_metrics_collector_agents_execution_seconds_bucket{le="0.005"} 0 +coderd_metrics_collector_agents_execution_seconds_bucket{le="0.01"} 0 +coderd_metrics_collector_agents_execution_seconds_bucket{le="0.025"} 0 +coderd_metrics_collector_agents_execution_seconds_bucket{le="0.05"} 2 +coderd_metrics_collector_agents_execution_seconds_bucket{le="0.1"} 2 +coderd_metrics_collector_agents_execution_seconds_bucket{le="0.5"} 2 +coderd_metrics_collector_agents_execution_seconds_bucket{le="1"} 2 +coderd_metrics_collector_agents_execution_seconds_bucket{le="5"} 2 +coderd_metrics_collector_agents_execution_seconds_bucket{le="10"} 2 +coderd_metrics_collector_agents_execution_seconds_bucket{le="30"} 2 +coderd_metrics_collector_agents_execution_seconds_bucket{le="+Inf"} 2 +coderd_metrics_collector_agents_execution_seconds_sum 0.0592915 +coderd_metrics_collector_agents_execution_seconds_count 2 # HELP coderd_provisionerd_job_timings_seconds The provisioner job time duration in seconds. # TYPE coderd_provisionerd_job_timings_seconds histogram coderd_provisionerd_job_timings_seconds_bucket{provisioner="terraform",status="success",le="1"} 0 From b5d0581caec093a09ae40e0477af35e095202e82 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Thu, 6 Apr 2023 13:26:11 +0200 Subject: [PATCH 13/15] Use CachedGaugeVec --- coderd/prometheusmetrics/collector.go | 80 +++++++++++++++++++ coderd/prometheusmetrics/prometheusmetrics.go | 40 +++++----- 2 files changed, 100 insertions(+), 20 deletions(-) create mode 100644 coderd/prometheusmetrics/collector.go diff --git a/coderd/prometheusmetrics/collector.go b/coderd/prometheusmetrics/collector.go new file mode 100644 index 0000000000000..b92f992a6a7ec --- /dev/null +++ b/coderd/prometheusmetrics/collector.go @@ -0,0 +1,80 @@ +package prometheusmetrics + +import ( + "sync" + + "github.com/prometheus/client_golang/prometheus" +) + +type CachedGaugeVec struct { + m sync.Mutex + + gaugeVec *prometheus.GaugeVec + records []vectorRecord +} + +var _ prometheus.Collector = new(CachedGaugeVec) + +type VectorOperation int + +const ( + VectorOperationAdd VectorOperation = iota + VectorOperationSet +) + +type vectorRecord struct { + operation VectorOperation + value float64 + labelValues []string +} + +func NewCachedGaugeVec(gaugeVec *prometheus.GaugeVec) *CachedGaugeVec { + return &CachedGaugeVec{ + gaugeVec: gaugeVec, + } +} + +func (v *CachedGaugeVec) Describe(desc chan<- *prometheus.Desc) { + v.m.Lock() + defer v.m.Unlock() + + v.gaugeVec.Describe(desc) +} + +func (v *CachedGaugeVec) Collect(ch chan<- prometheus.Metric) { + v.m.Lock() + defer v.m.Unlock() + + v.gaugeVec.Collect(ch) +} + +func (v *CachedGaugeVec) WithLabelValues(operation VectorOperation, value float64, labelValues ...string) { + v.m.Lock() + defer v.m.Unlock() + + v.records = append(v.records, vectorRecord{ + operation: operation, + value: value, + labelValues: labelValues, + }) +} + +func (v *CachedGaugeVec) Commit() { + v.m.Lock() + defer v.m.Unlock() + + v.gaugeVec.Reset() + for _, record := range v.records { + g := v.gaugeVec.WithLabelValues(record.labelValues...) 
+ switch record.operation { + case VectorOperationAdd: + g.Add(record.value) + case VectorOperationSet: + g.Set(record.value) + default: + panic("unsupported vector operation") + } + } + + v.records = nil +} diff --git a/coderd/prometheusmetrics/prometheusmetrics.go b/coderd/prometheusmetrics/prometheusmetrics.go index 94c47230a4579..83e4af90d0765 100644 --- a/coderd/prometheusmetrics/prometheusmetrics.go +++ b/coderd/prometheusmetrics/prometheusmetrics.go @@ -124,45 +124,45 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis duration = 1 * time.Minute } - agentsGauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{ + agentsGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: "coderd", Subsystem: "agents", Name: "up", Help: "The number of active agents per workspace.", - }, []string{"username", "workspace_name"}) + }, []string{"username", "workspace_name"})) err := registerer.Register(agentsGauge) if err != nil { return nil, err } - agentsConnectionsGauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{ + agentsConnectionsGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: "coderd", Subsystem: "agents", Name: "connections", Help: "Agent connections with statuses.", - }, []string{"agent_name", "username", "workspace_name", "status", "lifecycle_state", "tailnet_node"}) + }, []string{"agent_name", "username", "workspace_name", "status", "lifecycle_state", "tailnet_node"})) err = registerer.Register(agentsConnectionsGauge) if err != nil { return nil, err } - agentsConnectionLatenciesGauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{ + agentsConnectionLatenciesGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: "coderd", Subsystem: "agents", Name: "connection_latencies_seconds", Help: "Agent connection latencies in seconds.", - }, []string{"agent_id", "username", "workspace_name", "derp_region", "preferred"}) + }, []string{"agent_id", "username", "workspace_name", "derp_region", "preferred"})) err = registerer.Register(agentsConnectionLatenciesGauge) if err != nil { return nil, err } - agentsAppsGauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{ + agentsAppsGauge := NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: "coderd", Subsystem: "agents", Name: "apps", Help: "Agent applications with statuses.", - }, []string{"agent_name", "username", "workspace_name", "app_name", "health"}) + }, []string{"agent_name", "username", "workspace_name", "app_name", "health"})) err = registerer.Register(agentsAppsGauge) if err != nil { return nil, err @@ -203,35 +203,30 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis continue } - agentsGauge.Reset() - agentsConnectionsGauge.Reset() - agentsConnectionLatenciesGauge.Reset() - agentsAppsGauge.Reset() - for _, workspace := range workspaceRows { user, err := db.GetUserByID(ctx, workspace.OwnerID) if err != nil { logger.Error(ctx, "can't get user", slog.F("user_id", workspace.OwnerID), slog.Error(err)) - agentsGauge.WithLabelValues(user.Username, workspace.Name).Add(0) + agentsGauge.WithLabelValues(VectorOperationAdd, 0, user.Username, workspace.Name) continue } agents, err := db.GetWorkspaceAgentsInLatestBuildByWorkspaceID(ctx, workspace.ID) if err != nil { logger.Error(ctx, "can't get workspace agents", slog.F("workspace_id", workspace.ID), slog.Error(err)) - agentsGauge.WithLabelValues(user.Username, workspace.Name).Add(0) + agentsGauge.WithLabelValues(VectorOperationAdd, 0, 
user.Username, workspace.Name) continue } if len(agents) == 0 { logger.Debug(ctx, "workspace agents are unavailable", slog.F("workspace_id", workspace.ID)) - agentsGauge.WithLabelValues(user.Username, workspace.Name).Add(0) + agentsGauge.WithLabelValues(VectorOperationAdd, 0, user.Username, workspace.Name) continue } for _, agent := range agents { // Collect information about agents - agentsGauge.WithLabelValues(user.Username, workspace.Name).Add(1) + agentsGauge.WithLabelValues(VectorOperationAdd, 1, user.Username, workspace.Name) connectionStatus := agent.Status(agentInactiveDisconnectTimeout) node := (*coordinator.Load()).Node(agent.ID) @@ -241,7 +236,7 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis tailnetNode = node.ID.String() } - agentsConnectionsGauge.WithLabelValues(agent.Name, user.Username, workspace.Name, string(connectionStatus.Status), string(agent.LifecycleState), tailnetNode).Set(1) + agentsConnectionsGauge.WithLabelValues(VectorOperationSet, 1, agent.Name, user.Username, workspace.Name, string(connectionStatus.Status), string(agent.LifecycleState), tailnetNode) if node == nil { logger.Debug(ctx, "can't read in-memory node for agent", slog.F("agent_id", agent.ID)) @@ -266,7 +261,7 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis } } - agentsConnectionLatenciesGauge.WithLabelValues(agent.Name, user.Username, workspace.Name, region.RegionName, fmt.Sprintf("%v", node.PreferredDERP == regionID)).Set(latency) + agentsConnectionLatenciesGauge.WithLabelValues(VectorOperationSet, latency, agent.Name, user.Username, workspace.Name, region.RegionName, fmt.Sprintf("%v", node.PreferredDERP == regionID)) } } @@ -278,11 +273,16 @@ func Agents(ctx context.Context, logger slog.Logger, registerer prometheus.Regis } for _, app := range apps { - agentsAppsGauge.WithLabelValues(agent.Name, user.Username, workspace.Name, app.DisplayName, string(app.Health)).Add(1) + agentsAppsGauge.WithLabelValues(VectorOperationAdd, 1, agent.Name, user.Username, workspace.Name, app.DisplayName, string(app.Health)) } } } + agentsGauge.Commit() + agentsConnectionsGauge.Commit() + agentsConnectionLatenciesGauge.Commit() + agentsAppsGauge.Commit() + logger.Debug(ctx, "Agent metrics collection is done") metricsCollectorAgents.Observe(timer.ObserveDuration().Seconds()) } From e4d708b8f36214712989605f1589ee0f1b9dc613 Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Thu, 6 Apr 2023 14:40:36 +0200 Subject: [PATCH 14/15] Unit tests --- coderd/prometheusmetrics/collector_test.go | 140 +++++++++++++++++++++ 1 file changed, 140 insertions(+) create mode 100644 coderd/prometheusmetrics/collector_test.go diff --git a/coderd/prometheusmetrics/collector_test.go b/coderd/prometheusmetrics/collector_test.go new file mode 100644 index 0000000000000..9d63f6669113d --- /dev/null +++ b/coderd/prometheusmetrics/collector_test.go @@ -0,0 +1,140 @@ +package prometheusmetrics_test + +import ( + "sort" + "testing" + + "github.com/prometheus/client_golang/prometheus" + dto "github.com/prometheus/client_model/go" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/coder/coder/coderd/prometheusmetrics" +) + +func TestCollector_Add(t *testing.T) { + t.Parallel() + + // given + agentsGauge := prometheusmetrics.NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: "coderd", + Subsystem: "agents", + Name: "up", + Help: "The number of active agents per workspace.", + }, []string{"username", "workspace_name"})) 
+ + // when + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 7, "first user", "my workspace") + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 23, "second user", "your workspace") + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 1, "first user", "my workspace") + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 25, "second user", "your workspace") + agentsGauge.Commit() + + // then + ch := make(chan prometheus.Metric, 2) + agentsGauge.Collect(ch) + + metrics := collectAndSortMetrics(t, agentsGauge, 2) + + assert.Equal(t, "first user", metrics[0].Label[0].GetValue()) // Username + assert.Equal(t, "my workspace", metrics[0].Label[1].GetValue()) // Workspace name + assert.Equal(t, 8, int(metrics[0].Gauge.GetValue())) // Metric value + + assert.Equal(t, "second user", metrics[1].Label[0].GetValue()) // Username + assert.Equal(t, "your workspace", metrics[1].Label[1].GetValue()) // Workspace name + assert.Equal(t, 48, int(metrics[1].Gauge.GetValue())) // Metric value +} + +func TestCollector_Set(t *testing.T) { + t.Parallel() + + // given + agentsGauge := prometheusmetrics.NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: "coderd", + Subsystem: "agents", + Name: "up", + Help: "The number of active agents per workspace.", + }, []string{"username", "workspace_name"})) + + // when + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationSet, 3, "first user", "my workspace") + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationSet, 4, "second user", "your workspace") + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationSet, 5, "first user", "my workspace") + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationSet, 6, "second user", "your workspace") + agentsGauge.Commit() + + // then + ch := make(chan prometheus.Metric, 2) + agentsGauge.Collect(ch) + + metrics := collectAndSortMetrics(t, agentsGauge, 2) + + assert.Equal(t, "first user", metrics[0].Label[0].GetValue()) // Username + assert.Equal(t, "my workspace", metrics[0].Label[1].GetValue()) // Workspace name + assert.Equal(t, 5, int(metrics[0].Gauge.GetValue())) // Metric value + + assert.Equal(t, "second user", metrics[1].Label[0].GetValue()) // Username + assert.Equal(t, "your workspace", metrics[1].Label[1].GetValue()) // Workspace name + assert.Equal(t, 6, int(metrics[1].Gauge.GetValue())) // Metric value +} + +func TestCollector_Set_Add(t *testing.T) { + t.Parallel() + + // given + agentsGauge := prometheusmetrics.NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: "coderd", + Subsystem: "agents", + Name: "up", + Help: "The number of active agents per workspace.", + }, []string{"username", "workspace_name"})) + + // when + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 9, "first user", "my workspace") + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 8, "second user", "your workspace") + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 7, "first user", "my workspace") + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 6, "second user", "your workspace") + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationSet, 5, "first user", "my workspace") + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationSet, 4, "second user", "your workspace") + agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 3, "first user", "my workspace") + 
agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 2, "second user", "your workspace") + agentsGauge.Commit() + + // then + ch := make(chan prometheus.Metric, 2) + agentsGauge.Collect(ch) + + metrics := collectAndSortMetrics(t, agentsGauge, 2) + + assert.Equal(t, "first user", metrics[0].Label[0].GetValue()) // Username + assert.Equal(t, "my workspace", metrics[0].Label[1].GetValue()) // Workspace name + assert.Equal(t, 8, int(metrics[0].Gauge.GetValue())) // Metric value + + assert.Equal(t, "second user", metrics[1].Label[0].GetValue()) // Username + assert.Equal(t, "your workspace", metrics[1].Label[1].GetValue()) // Workspace name + assert.Equal(t, 6, int(metrics[1].Gauge.GetValue())) // Metric value +} + +func collectAndSortMetrics(t *testing.T, collector prometheus.Collector, count int) []dto.Metric { + ch := make(chan prometheus.Metric, count) + defer close(ch) + + var metrics []dto.Metric + + collector.Collect(ch) + for i := 0; i < count; i++ { + m := <-ch + + var metric dto.Metric + err := m.Write(&metric) + require.NoError(t, err) + + metrics = append(metrics, metric) + } + + // Ensure a stable order of metrics + sort.Slice(metrics, func(i, j int) bool { + return metrics[i].Label[0].GetValue() < metrics[j].Label[0].GetValue() + }) + return metrics +} From e0669f013cc947f8719ddd53b49e18086b8eb54d Mon Sep 17 00:00:00 2001 From: Marcin Tojek Date: Fri, 7 Apr 2023 17:38:01 +0200 Subject: [PATCH 15/15] Address PR comments --- coderd/prometheusmetrics/collector.go | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/coderd/prometheusmetrics/collector.go b/coderd/prometheusmetrics/collector.go index b92f992a6a7ec..8839553a1ffdd 100644 --- a/coderd/prometheusmetrics/collector.go +++ b/coderd/prometheusmetrics/collector.go @@ -6,6 +6,16 @@ import ( "github.com/prometheus/client_golang/prometheus" ) +// CachedGaugeVec is a wrapper for the prometheus.GaugeVec which allows +// for staging changes in the metrics vector. Calling "WithLabelValues(...)" +// will update the internal gauge value, but it will not be returned by +// "Collect(...)" until the "Commit()" method is called. The "Commit()" method +// resets the internal gauge and applies all staged changes to it. +// +// The use of CachedGaugeVec is recommended for use cases when there is a risk +// that the Prometheus collector receives incomplete metrics, collected +// in the middle of metrics recalculation, between "Reset()" and the last +// "WithLabelValues()" call. type CachedGaugeVec struct { m sync.Mutex @@ -35,9 +45,6 @@ func NewCachedGaugeVec(gaugeVec *prometheus.GaugeVec) *CachedGaugeVec { } func (v *CachedGaugeVec) Describe(desc chan<- *prometheus.Desc) { - v.m.Lock() - defer v.m.Unlock() - v.gaugeVec.Describe(desc) } @@ -49,6 +56,13 @@ func (v *CachedGaugeVec) Collect(ch chan<- prometheus.Metric) { } func (v *CachedGaugeVec) WithLabelValues(operation VectorOperation, value float64, labelValues ...string) { + switch operation { + case VectorOperationAdd: + case VectorOperationSet: + default: + panic("unsupported vector operation") + } + v.m.Lock() defer v.m.Unlock() @@ -59,6 +73,9 @@ func (v *CachedGaugeVec) WithLabelValues(operation VectorOperation, value float6 }) } +// Commit will set the internal value as the cached value to return from "Collect()". +// The internal metric value is completely reset, so the caller should expect +// the gauge to be empty for the next 'WithLabelValues' calls.
func (v *CachedGaugeVec) Commit() { v.m.Lock() defer v.m.Unlock() @@ -71,8 +88,6 @@ func (v *CachedGaugeVec) Commit() { g.Add(record.value) case VectorOperationSet: g.Set(record.value) - default: - panic("unsupported vector operation") } }
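
For context, here is a minimal, self-contained sketch (not part of the patch series above) of how the CachedGaugeVec introduced in PATCH 13-15 is intended to be consumed: values are staged with WithLabelValues and then published atomically with Commit, so a concurrent scrape never observes a half-updated vector. The registry wiring, usernames, workspace names, and printed output below are illustrative assumptions only.

package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"

	"github.com/coder/coder/coderd/prometheusmetrics"
)

func main() {
	registry := prometheus.NewRegistry()

	// Wrap a plain GaugeVec; CachedGaugeVec implements prometheus.Collector,
	// so it can be registered directly.
	agentsGauge := prometheusmetrics.NewCachedGaugeVec(prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "coderd",
		Subsystem: "agents",
		Name:      "up",
		Help:      "The number of active agents per workspace.",
	}, []string{"username", "workspace_name"}))
	if err := registry.Register(agentsGauge); err != nil {
		panic(err)
	}

	// Stage changes; none of them are visible to Collect() yet.
	// The label values here are made up for the example.
	agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 1, "alice", "dev")
	agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationAdd, 1, "alice", "dev")
	agentsGauge.WithLabelValues(prometheusmetrics.VectorOperationSet, 3, "bob", "ci")

	// Commit resets the underlying gauge and applies the staged records,
	// so a scrape sees either the previous complete state or the new one.
	agentsGauge.Commit()

	mfs, err := registry.Gather()
	if err != nil {
		panic(err)
	}
	for _, mf := range mfs {
		for _, m := range mf.GetMetric() {
			labels := map[string]string{}
			for _, lp := range m.GetLabel() {
				labels[lp.GetName()] = lp.GetValue()
			}
			// Prints (order may vary):
			//   coderd_agents_up map[username:alice workspace_name:dev] 2
			//   coderd_agents_up map[username:bob workspace_name:ci] 3
			fmt.Println(mf.GetName(), labels, m.GetGauge().GetValue())
		}
	}
}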