Skip to content

Commit 795050b

Browse files
authored
chore: add prometheus monitoring of workspace traffic generation (#7583)
- Exposes reads/writes from scaletest traffic generation (default: 0.0.0.0:21112)
- Adds self-hosted prometheus with remote_write to loadtest terraform
- Adds convenience script to run a traffic generation test
1 parent 0fd2ea4 commit 795050b

18 files changed

+630
-120
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -59,3 +59,4 @@ site/stats/
5959
./scaletest/terraform/.terraform
6060
./scaletest/terraform/.terraform.lock.hcl
6161
terraform.tfstate.*
62+
**/*.tfvars

.prettierignore

+1
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ site/stats/
6262
./scaletest/terraform/.terraform
6363
./scaletest/terraform/.terraform.lock.hcl
6464
terraform.tfstate.*
65+
**/*.tfvars
6566
# .prettierignore.include:
6667
# Helm templates contain variables that are invalid YAML and can't be formatted
6768
# by Prettier.

cli/scaletest.go

+47-10
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,14 @@ import (
1414
"time"
1515

1616
"github.com/google/uuid"
17+
"github.com/prometheus/client_golang/prometheus"
18+
"github.com/prometheus/client_golang/prometheus/promhttp"
1719
"go.opentelemetry.io/otel/trace"
1820
"golang.org/x/xerrors"
1921

22+
"cdr.dev/slog"
23+
"cdr.dev/slog/sloggers/sloghuman"
24+
2025
"github.com/coder/coder/cli/clibase"
2126
"github.com/coder/coder/cli/cliui"
2227
"github.com/coder/coder/coderd/httpapi"
@@ -896,8 +901,11 @@ func (r *RootCmd) scaletestCreateWorkspaces() *clibase.Cmd {
896901

897902
func (r *RootCmd) scaletestWorkspaceTraffic() *clibase.Cmd {
898903
var (
899-
tickInterval time.Duration
900-
bytesPerTick int64
904+
tickInterval time.Duration
905+
bytesPerTick int64
906+
scaletestPrometheusAddress string
907+
scaletestPrometheusWait time.Duration
908+
901909
client = &codersdk.Client{}
902910
tracingFlags = &scaletestTracingFlags{}
903911
strategy = &scaletestStrategyFlags{}
@@ -913,6 +921,12 @@ func (r *RootCmd) scaletestWorkspaceTraffic() *clibase.Cmd {
913921
),
914922
Handler: func(inv *clibase.Invocation) error {
915923
ctx := inv.Context()
924+
reg := prometheus.NewRegistry()
925+
metrics := workspacetraffic.NewMetrics(reg, "username", "workspace_name", "agent_name")
926+
927+
logger := slog.Make(sloghuman.Sink(io.Discard))
928+
prometheusSrvClose := ServeHandler(ctx, logger, promhttp.HandlerFor(reg, promhttp.HandlerOpts{}), scaletestPrometheusAddress, "prometheus")
929+
defer prometheusSrvClose()
916930

917931
// Bypass rate limiting
918932
client.HTTPClient = &http.Client{
@@ -943,6 +957,9 @@ func (r *RootCmd) scaletestWorkspaceTraffic() *clibase.Cmd {
943957
_, _ = fmt.Fprintln(inv.Stderr, "\nUploading traces...")
944958
if err := closeTracing(ctx); err != nil {
945959
_, _ = fmt.Fprintf(inv.Stderr, "\nError uploading traces: %+v\n", err)
946960
}
961+
// Always wait for prometheus metrics to be scraped, not only when trace upload fails.
962+
_, _ = fmt.Fprintf(inv.Stderr, "Waiting %s for prometheus metrics to be scraped\n", scaletestPrometheusWait)
963+
<-time.After(scaletestPrometheusWait)
947964
}()
948965
tracer := tracerProvider.Tracer(scaletestTracerName)
@@ -955,16 +972,18 @@ func (r *RootCmd) scaletestWorkspaceTraffic() *clibase.Cmd {
955972
th := harness.NewTestHarness(strategy.toStrategy(), cleanupStrategy.toStrategy())
956973
for idx, ws := range workspaces {
957974
var (
958-
agentID uuid.UUID
959-
name = "workspace-traffic"
960-
id = strconv.Itoa(idx)
975+
agentID uuid.UUID
976+
agentName string
977+
name = "workspace-traffic"
978+
id = strconv.Itoa(idx)
961979
)
962980

963981
for _, res := range ws.LatestBuild.Resources {
964982
if len(res.Agents) == 0 {
965983
continue
966984
}
967985
agentID = res.Agents[0].ID
986+
agentName = res.Agents[0].Name
968987
}
969988

970989
if agentID == uuid.Nil {
@@ -974,16 +993,20 @@ func (r *RootCmd) scaletestWorkspaceTraffic() *clibase.Cmd {
974993

975994
// Setup our workspace agent connection.
976995
config := workspacetraffic.Config{
977-
AgentID: agentID,
978-
BytesPerTick: bytesPerTick,
979-
Duration: strategy.timeout,
980-
TickInterval: tickInterval,
996+
AgentID: agentID,
997+
AgentName: agentName,
998+
BytesPerTick: bytesPerTick,
999+
Duration: strategy.timeout,
1000+
TickInterval: tickInterval,
1001+
WorkspaceName: ws.Name,
1002+
WorkspaceOwner: ws.OwnerName,
1003+
Registry: reg,
9811004
}
9821005

9831006
if err := config.Validate(); err != nil {
9841007
return xerrors.Errorf("validate config: %w", err)
9851008
}
986-
var runner harness.Runnable = workspacetraffic.NewRunner(client, config)
1009+
var runner harness.Runnable = workspacetraffic.NewRunner(client, config, metrics)
9871010
if tracingEnabled {
9881011
runner = &runnableTraceWrapper{
9891012
tracer: tracer,
@@ -1034,6 +1057,20 @@ func (r *RootCmd) scaletestWorkspaceTraffic() *clibase.Cmd {
10341057
Description: "How often to send traffic.",
10351058
Value: clibase.DurationOf(&tickInterval),
10361059
},
1060+
{
1061+
Flag: "scaletest-prometheus-address",
1062+
Env: "CODER_SCALETEST_PROMETHEUS_ADDRESS",
1063+
Default: "0.0.0.0:21112",
1064+
Description: "Address on which to expose scaletest Prometheus metrics.",
1065+
Value: clibase.StringOf(&scaletestPrometheusAddress),
1066+
},
1067+
{
1068+
Flag: "scaletest-prometheus-wait",
1069+
Env: "CODER_SCALETEST_PROMETHEUS_WAIT",
1070+
Default: "5s",
1071+
Description: "How long to wait before exiting in order to allow Prometheus metrics to be scraped.",
1072+
Value: clibase.DurationOf(&scaletestPrometheusWait),
1073+
},
10371074
}
10381075

10391076
tracingFlags.attach(&cmd.Options)

cli/scaletest_test.go

+7-54
Original file line numberDiff line numberDiff line change
@@ -8,17 +8,12 @@ import (
88
"path/filepath"
99
"testing"
1010

11-
"github.com/google/uuid"
1211
"github.com/stretchr/testify/assert"
1312
"github.com/stretchr/testify/require"
1413

15-
"github.com/coder/coder/agent"
1614
"github.com/coder/coder/cli/clitest"
1715
"github.com/coder/coder/coderd/coderdtest"
1816
"github.com/coder/coder/codersdk"
19-
"github.com/coder/coder/codersdk/agentsdk"
20-
"github.com/coder/coder/provisioner/echo"
21-
"github.com/coder/coder/provisionersdk/proto"
2217
"github.com/coder/coder/pty/ptytest"
2318
"github.com/coder/coder/scaletest/harness"
2419
"github.com/coder/coder/testutil"
@@ -205,70 +200,28 @@ param3: 1
205200
})
206201
}
207202

208-
// This test pretends to stand up a workspace and run a no-op traffic generation test.
209-
// It's not a real test, but it's useful for debugging.
210-
// We do not perform any cleanup.
203+
// This test just validates that the CLI command accepts its known arguments.
204+
// A more comprehensive test is performed in workspacetraffic/run_test.go
211205
func TestScaleTestWorkspaceTraffic(t *testing.T) {
212206
t.Parallel()
213207

214208
ctx, cancelFunc := context.WithTimeout(context.Background(), testutil.WaitMedium)
215209
defer cancelFunc()
216210

217-
client := coderdtest.New(t, &coderdtest.Options{IncludeProvisionerDaemon: true})
218-
user := coderdtest.CreateFirstUser(t, client)
219-
220-
authToken := uuid.NewString()
221-
version := coderdtest.CreateTemplateVersion(t, client, user.OrganizationID, &echo.Responses{
222-
Parse: echo.ParseComplete,
223-
ProvisionPlan: echo.ProvisionComplete,
224-
ProvisionApply: []*proto.Provision_Response{{
225-
Type: &proto.Provision_Response_Complete{
226-
Complete: &proto.Provision_Complete{
227-
Resources: []*proto.Resource{{
228-
Name: "example",
229-
Type: "aws_instance",
230-
Agents: []*proto.Agent{{
231-
Id: uuid.NewString(),
232-
Name: "agent",
233-
Auth: &proto.Agent_Token{
234-
Token: authToken,
235-
},
236-
Apps: []*proto.App{},
237-
}},
238-
}},
239-
},
240-
},
241-
}},
242-
})
243-
template := coderdtest.CreateTemplate(t, client, user.OrganizationID, version.ID)
244-
coderdtest.AwaitTemplateVersionJob(t, client, version.ID)
245-
246-
ws := coderdtest.CreateWorkspace(t, client, user.OrganizationID, template.ID, func(cwr *codersdk.CreateWorkspaceRequest) {
247-
cwr.Name = "scaletest-test"
248-
})
249-
coderdtest.AwaitWorkspaceBuildJob(t, client, ws.LatestBuild.ID)
250-
251-
agentClient := agentsdk.New(client.URL)
252-
agentClient.SetSessionToken(authToken)
253-
agentCloser := agent.New(agent.Options{
254-
Client: agentClient,
255-
})
256-
t.Cleanup(func() {
257-
_ = agentCloser.Close()
258-
})
259-
260-
coderdtest.AwaitWorkspaceAgents(t, client, ws.ID)
211+
client := coderdtest.New(t, nil)
212+
_ = coderdtest.CreateFirstUser(t, client)
261213

262214
inv, root := clitest.New(t, "scaletest", "workspace-traffic",
263215
"--timeout", "1s",
264216
"--bytes-per-tick", "1024",
265217
"--tick-interval", "100ms",
218+
"--scaletest-prometheus-address", "127.0.0.1:0",
219+
"--scaletest-prometheus-wait", "0s",
266220
)
267221
clitest.SetupConfig(t, client, root)
268222
var stdout, stderr bytes.Buffer
269223
inv.Stdout = &stdout
270224
inv.Stderr = &stderr
271225
err := inv.WithContext(ctx).Run()
272-
require.NoError(t, err)
273-
require.Contains(t, stdout.String(), "Pass: 1")
226+
require.ErrorContains(t, err, "no scaletest workspaces exist")
274227
}

cli/testdata/coder_scaletest_workspace-traffic_--help.golden

+7
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,13 @@ Generate traffic to scaletest workspaces through coderd
2727
Output format specs in the format "<format>[:<path>]". Not specifying
2828
a path will default to stdout. Available formats: text, json.
2929

30+
--scaletest-prometheus-address string, $CODER_SCALETEST_PROMETHEUS_ADDRESS (default: 0.0.0.0:21112)
31+
Address on which to expose scaletest Prometheus metrics.
32+
33+
--scaletest-prometheus-wait duration, $CODER_SCALETEST_PROMETHEUS_WAIT (default: 5s)
34+
How long to wait before exiting in order to allow Prometheus metrics
35+
to be scraped.
36+
3037
--tick-interval duration, $CODER_SCALETEST_WORKSPACE_TRAFFIC_TICK_INTERVAL (default: 100ms)
3138
How often to send traffic.
3239

docs/cli/scaletest_workspace-traffic.md

+20
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,26 @@ Timeout per job. Jobs may take longer to complete under higher concurrency limit
8282

8383
Output format specs in the format "<format>[:<path>]". Not specifying a path will default to stdout. Available formats: text, json.
8484

85+
### --scaletest-prometheus-address
86+
87+
| | |
88+
| ----------- | ------------------------------------------------ |
89+
| Type | <code>string</code> |
90+
| Environment | <code>$CODER_SCALETEST_PROMETHEUS_ADDRESS</code> |
91+
| Default | <code>0.0.0.0:21112</code> |
92+
93+
Address on which to expose scaletest Prometheus metrics.
94+
95+
### --scaletest-prometheus-wait
96+
97+
| | |
98+
| ----------- | --------------------------------------------- |
99+
| Type | <code>duration</code> |
100+
| Environment | <code>$CODER_SCALETEST_PROMETHEUS_WAIT</code> |
101+
| Default | <code>5s</code> |
102+
103+
How long to wait before exiting in order to allow Prometheus metrics to be scraped.
104+
85105
### --tick-interval
86106

87107
| | |

scaletest/terraform/README.md

+5-2
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,12 @@ project_id = "some_google_project_id"
3232
1. Run `coder_init.sh <coder_url>` to setup an initial user and a pre-configured Kubernetes
3333
template. It will also download the Coder CLI from the Coder instance locally.
3434

35-
1. Do whatever you need to do with the Coder instance.
35+
1. Do whatever you need to do with the Coder instance:
3636

37-
> To run Coder commands against the instance, you can use `coder_shim.sh <command>`.
37+
> Note: To run Coder commands against the instance, you can use `coder_shim.sh <command>`.
3838
> You don't need to run `coder login` yourself.
3939
40+
- To create workspaces, run `./coder_shim.sh scaletest create-workspaces --template="kubernetes" --count=N`
41+
- To generate workspace traffic, run `./coder_trafficgen.sh <name of loadtest from your Terraform vars>`. This will keep running until you delete the pod `coder-scaletest-workspace-traffic`.
42+
4043
1. When you are finished, you can run `terraform destroy -var-file=override.tfvars`.

scaletest/terraform/coder.tf

-28
Original file line numberDiff line numberDiff line change
@@ -128,34 +128,6 @@ EOF
128128
]
129129
}
130130

131-
resource "local_file" "coder-monitoring-manifest" {
132-
filename = "${path.module}/.coderv2/coder-monitoring.yaml"
133-
content = <<EOF
134-
apiVersion: monitoring.googleapis.com/v1
135-
kind: PodMonitoring
136-
metadata:
137-
namespace: ${kubernetes_namespace.coder_namespace.metadata.0.name}
138-
name: coder-monitoring
139-
spec:
140-
selector:
141-
matchLabels:
142-
app.kubernetes.io/name: coder
143-
endpoints:
144-
- port: prometheus-http
145-
interval: 30s
146-
EOF
147-
}
148-
149-
resource "null_resource" "coder-monitoring-manifest_apply" {
150-
provisioner "local-exec" {
151-
working_dir = "${abspath(path.module)}/.coderv2"
152-
command = <<EOF
153-
KUBECONFIG=${var.name}-cluster.kubeconfig gcloud container clusters get-credentials ${var.name}-cluster --project=${var.project_id} --zone=${var.zone} && \
154-
KUBECONFIG=${var.name}-cluster.kubeconfig kubectl apply -f ${abspath(local_file.coder-monitoring-manifest.filename)}
155-
EOF
156-
}
157-
}
158-
159131
resource "local_file" "kubernetes_template" {
160132
filename = "${path.module}/.coderv2/templates/kubernetes/main.tf"
161133
content = <<EOF
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
#!/usr/bin/env bash
# Starts a workspace-traffic scaletest inside the loadtest cluster: applies a
# Pod that runs `coder scaletest workspace-traffic` together with a PodMonitor
# that scrapes the pod's prometheus-http port.
2+
3+
set -euo pipefail
4+
5+
# At least one argument (the loadtest name) is required.
if [[ $# -lt 1 ]]; then
6+
echo "Usage: $0 <loadtest name>"
7+
exit 1
8+
fi
9+
10+
# Allow toggling verbose output
11+
[[ -n ${VERBOSE:-} ]] && set -x
12+
13+
LOADTEST_NAME="$1"
14+
# Create a token through the shim; it is handed to the pod in the manifest below.
CODER_TOKEN=$(./coder_shim.sh tokens create)
15+
# Service URL of the Coder deployment, derived from the loadtest name.
CODER_URL="http://coder.coder-${LOADTEST_NAME}.svc.cluster.local"
16+
# Point kubectl at the kubeconfig stored under .coderv2 for this loadtest.
export KUBECONFIG="${PWD}/.coderv2/${LOADTEST_NAME}-cluster.kubeconfig"
17+
18+
cat <<EOF | kubectl apply -f -
19+
apiVersion: v1
20+
kind: Pod
21+
metadata:
22+
name: coder-scaletest-workspace-traffic
23+
namespace: coder-${LOADTEST_NAME}
24+
labels:
25+
app.kubernetes.io/name: coder-scaletest-workspace-traffic
26+
spec:
27+
affinity:
28+
nodeAffinity:
29+
requiredDuringSchedulingIgnoredDuringExecution:
30+
nodeSelectorTerms:
31+
- matchExpressions:
32+
- key: cloud.google.com/gke-nodepool
33+
operator: In
34+
values:
35+
- ${LOADTEST_NAME}-misc
36+
containers:
37+
- command:
38+
- sh
39+
- -c
40+
- "curl -fsSL $CODER_URL/bin/coder-linux-amd64 -o /tmp/coder && chmod +x /tmp/coder && /tmp/coder --url=$CODER_URL --token=$CODER_TOKEN scaletest workspace-traffic"
41+
env:
42+
- name: CODER_URL
43+
value: $CODER_URL
44+
- name: CODER_TOKEN
45+
value: $CODER_TOKEN
46+
- name: CODER_SCALETEST_PROMETHEUS_ADDRESS
47+
value: "0.0.0.0:21112"
48+
- name: CODER_SCALETEST_JOB_TIMEOUT
49+
value: "30m"
50+
- name: CODER_SCALETEST_CONCURRENCY
51+
value: "0"
52+
- name: CODER_SCALETEST_WORKSPACE_TRAFFIC_BYTES_PER_TICK
53+
value: "2048"
54+
ports:
55+
- containerPort: 21112
56+
name: prometheus-http
57+
protocol: TCP
58+
name: cli
59+
image: docker.io/codercom/enterprise-minimal:ubuntu
60+
---
61+
apiVersion: monitoring.coreos.com/v1
62+
kind: PodMonitor
63+
metadata:
64+
namespace: coder-${LOADTEST_NAME}
65+
name: coder-workspacetraffic-monitoring
66+
spec:
67+
selector:
68+
matchLabels:
69+
app.kubernetes.io/name: coder-scaletest-workspace-traffic
70+
podMetricsEndpoints:
71+
- port: prometheus-http
72+
interval: 15s
73+
EOF

0 commit comments

Comments
 (0)