Skip to content

Commit 0fcd543

Browse files
committed
feat(coderd): add prometheus metrics to servertailnet
1 parent c84a637 commit 0fcd543

File tree

3 files changed

+139
-2
lines changed

3 files changed

+139
-2
lines changed

coderd/coderd.go

+5-1
Original file line numberDiff line numberDiff line change
@@ -472,7 +472,7 @@ func New(options *Options) *API {
472472

473473
api.Auditor.Store(&options.Auditor)
474474
api.TailnetCoordinator.Store(&options.TailnetCoordinator)
475-
api.agentProvider, err = NewServerTailnet(api.ctx,
475+
stn, err := NewServerTailnet(api.ctx,
476476
options.Logger,
477477
options.DERPServer,
478478
api.DERPMap,
@@ -485,6 +485,10 @@ func New(options *Options) *API {
485485
if err != nil {
486486
panic("failed to setup server tailnet: " + err.Error())
487487
}
488+
api.agentProvider = stn
489+
if options.DeploymentValues.Prometheus.Enable {
490+
options.PrometheusRegistry.MustRegister(stn)
491+
}
488492
api.TailnetClientService, err = tailnet.NewClientService(
489493
api.Logger.Named("tailnetclient"),
490494
&api.TailnetCoordinator,

coderd/tailnet.go

+53-1
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ import (
1414
"time"
1515

1616
"github.com/google/uuid"
17+
"github.com/prometheus/client_golang/prometheus"
1718
"go.opentelemetry.io/otel/trace"
1819
"golang.org/x/xerrors"
1920
"tailscale.com/derp"
@@ -97,6 +98,18 @@ func NewServerTailnet(
9798
agentConnectionTimes: map[uuid.UUID]time.Time{},
9899
agentTickets: map[uuid.UUID]map[uuid.UUID]struct{}{},
99100
transport: tailnetTransport.Clone(),
101+
connsPerAgent: prometheus.NewGaugeVec(prometheus.GaugeOpts{
102+
Namespace: "coder",
103+
Subsystem: "servertailnet",
104+
Name: "open_conns",
105+
Help: "Total number of TCP connections currently open to workspace agents.",
106+
}, []string{"agent_id"}),
107+
totalConns: prometheus.NewCounterVec(prometheus.CounterOpts{
108+
Namespace: "coder",
109+
Subsystem: "servertailnet",
110+
Name: "total_conns",
111+
Help: "Total number of TCP connections made to workspace agents.",
112+
}, []string{"agent_id"}),
100113
}
101114
tn.transport.DialContext = tn.dialContext
102115
// These options are mostly just picked at random, and they can likely be
@@ -170,6 +183,16 @@ func NewServerTailnet(
170183
return tn, nil
171184
}
172185

186+
func (s *ServerTailnet) Describe(descs chan<- *prometheus.Desc) {
187+
s.connsPerAgent.Describe(descs)
188+
s.totalConns.Describe(descs)
189+
}
190+
191+
func (s *ServerTailnet) Collect(metrics chan<- prometheus.Metric) {
192+
s.connsPerAgent.Collect(metrics)
193+
s.totalConns.Collect(metrics)
194+
}
195+
173196
func (s *ServerTailnet) expireOldAgents() {
174197
const (
175198
tick = 5 * time.Minute
@@ -304,6 +327,9 @@ type ServerTailnet struct {
304327
agentTickets map[uuid.UUID]map[uuid.UUID]struct{}
305328

306329
transport *http.Transport
330+
331+
connsPerAgent *prometheus.GaugeVec
332+
totalConns *prometheus.CounterVec
307333
}
308334

309335
func (s *ServerTailnet) ReverseProxy(targetURL, dashboardURL *url.URL, agentID uuid.UUID) *httputil.ReverseProxy {
@@ -349,7 +375,18 @@ func (s *ServerTailnet) dialContext(ctx context.Context, network, addr string) (
349375
return nil, xerrors.Errorf("no agent id attached")
350376
}
351377

352-
return s.DialAgentNetConn(ctx, agentID, network, addr)
378+
nc, err := s.DialAgentNetConn(ctx, agentID, network, addr)
379+
if err != nil {
380+
return nil, err
381+
}
382+
383+
s.connsPerAgent.With(prometheus.Labels{"agent_id": agentID.String()}).Inc()
384+
s.totalConns.With(prometheus.Labels{"agent_id": agentID.String()}).Inc()
385+
return &instrumentedConn{
386+
Conn: nc,
387+
agentID: agentID,
388+
connsPerAgent: s.connsPerAgent,
389+
}, nil
353390
}
354391

355392
func (s *ServerTailnet) ensureAgent(agentID uuid.UUID) error {
@@ -455,3 +492,18 @@ func (s *ServerTailnet) Close() error {
455492
<-s.derpMapUpdaterClosed
456493
return nil
457494
}
495+
496+
type instrumentedConn struct {
497+
net.Conn
498+
499+
agentID uuid.UUID
500+
closeOnce sync.Once
501+
connsPerAgent *prometheus.GaugeVec
502+
}
503+
504+
func (c *instrumentedConn) Close() error {
505+
c.closeOnce.Do(func() {
506+
c.connsPerAgent.With(prometheus.Labels{"agent_id": c.agentID.String()}).Dec()
507+
})
508+
return c.Conn.Close()
509+
}

coderd/tailnet_test.go

+81
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@ import (
1313
"testing"
1414

1515
"github.com/google/uuid"
16+
"github.com/prometheus/client_golang/prometheus"
17+
dto "github.com/prometheus/client_model/go"
1618
"github.com/spf13/afero"
1719
"github.com/stretchr/testify/assert"
1820
"github.com/stretchr/testify/require"
@@ -79,6 +81,43 @@ func TestServerTailnet_ReverseProxy(t *testing.T) {
7981
assert.Equal(t, http.StatusOK, res.StatusCode)
8082
})
8183

84+
t.Run("Metrics", func(t *testing.T) {
85+
t.Parallel()
86+
87+
ctx, cancel := context.WithTimeout(context.Background(), testutil.WaitLong)
88+
defer cancel()
89+
90+
agents, serverTailnet := setupServerTailnetAgent(t, 1)
91+
a := agents[0]
92+
93+
registry := prometheus.NewRegistry()
94+
require.NoError(t, registry.Register(serverTailnet))
95+
96+
u, err := url.Parse(fmt.Sprintf("http://127.0.0.1:%d", codersdk.WorkspaceAgentHTTPAPIServerPort))
97+
require.NoError(t, err)
98+
99+
rp := serverTailnet.ReverseProxy(u, u, a.id)
100+
101+
rw := httptest.NewRecorder()
102+
req := httptest.NewRequest(
103+
http.MethodGet,
104+
u.String(),
105+
nil,
106+
).WithContext(ctx)
107+
108+
rp.ServeHTTP(rw, req)
109+
res := rw.Result()
110+
defer res.Body.Close()
111+
112+
assert.Equal(t, http.StatusOK, res.StatusCode)
113+
require.Eventually(t, func() bool {
114+
metrics, err := registry.Gather()
115+
assert.NoError(t, err)
116+
return counterHasValue(t, metrics, 1, "coder_servertailnet_total_conns", a.id.String()) &&
117+
gaugeHasValue(t, metrics, 1, "coder_servertailnet_open_conns", a.id.String())
118+
}, testutil.WaitShort, testutil.IntervalFast)
119+
})
120+
82121
t.Run("HostRewrite", func(t *testing.T) {
83122
t.Parallel()
84123

@@ -328,3 +367,45 @@ func setupServerTailnetAgent(t *testing.T, agentNum int) ([]agentWithID, *coderd
328367

329368
return agents, serverTailnet
330369
}
370+
371+
func gaugeHasValue(t testing.TB, metrics []*dto.MetricFamily, value float64, name string, label ...string) bool {
372+
t.Helper()
373+
for _, family := range metrics {
374+
if family.GetName() != name {
375+
continue
376+
}
377+
ms := family.GetMetric()
378+
metricsLoop:
379+
for _, m := range ms {
380+
require.Equal(t, len(label), len(m.GetLabel()))
381+
for i, lv := range label {
382+
if lv != m.GetLabel()[i].GetValue() {
383+
continue metricsLoop
384+
}
385+
}
386+
return value == m.GetGauge().GetValue()
387+
}
388+
}
389+
return false
390+
}
391+
392+
func counterHasValue(t testing.TB, metrics []*dto.MetricFamily, value float64, name string, label ...string) bool {
393+
t.Helper()
394+
for _, family := range metrics {
395+
if family.GetName() != name {
396+
continue
397+
}
398+
ms := family.GetMetric()
399+
metricsLoop:
400+
for _, m := range ms {
401+
require.Equal(t, len(label), len(m.GetLabel()))
402+
for i, lv := range label {
403+
if lv != m.GetLabel()[i].GetValue() {
404+
continue metricsLoop
405+
}
406+
}
407+
return value == m.GetCounter().GetValue()
408+
}
409+
}
410+
return false
411+
}

0 commit comments

Comments
 (0)