Skip to content

Commit a97fd53

Browse files
committed
chore: increase fileCache hit rate in autobuilds lifecycle
1 parent 129235b commit a97fd53

File tree

5 files changed

+37
-12
lines changed

5 files changed

+37
-12
lines changed

coderd/autobuild/lifecycle_executor.go

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@ import (
55
"database/sql"
66
"fmt"
77
"net/http"
8+
"slices"
9+
"strings"
810
"sync"
911
"sync/atomic"
1012
"time"
@@ -155,6 +157,22 @@ func (e *Executor) runOnce(t time.Time) Stats {
155157
return stats
156158
}
157159

160+
// Sort the workspaces by build template version ID so that we can group
161+
// identical template versions together. This is a slight (and imperfect)
162+
// optimization.
163+
//
164+
// `wsbuilder` needs to load the terraform files for a given template version
165+
// into memory. If 2 workspaces are using the same template version, they will
166+
// share the same files in the FileCache. This only happens if the builds happen
167+
// in parallel.
168+
// TODO: Actually make sure the cache has the files in the cache for the full
169+
// set of identical template versions. Then unload the files when the builds
170+
// are done. Right now, this relies on luck for the 10 goroutine workers to
171+
// overlap and keep the file reference in the cache alive.
172+
slices.SortFunc(workspaces, func(a, b database.GetWorkspacesEligibleForTransitionRow) int {
173+
return strings.Compare(a.BuildTemplateVersionID.UUID.String(), b.BuildTemplateVersionID.UUID.String())
174+
})
175+
158176
// We only use errgroup here for convenience of API, not for early
159177
// cancellation. This means we only return nil errors in th eg.Go.
160178
eg := errgroup.Group{}

coderd/database/queries.sql.go

Lines changed: 6 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

coderd/database/queries/workspaces.sql

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -642,7 +642,8 @@ FROM pending_workspaces, building_workspaces, running_workspaces, failed_workspa
642642
-- name: GetWorkspacesEligibleForTransition :many
643643
SELECT
644644
workspaces.id,
645-
workspaces.name
645+
workspaces.name,
646+
workspace_builds.template_version_id as build_template_version_id
646647
FROM
647648
workspaces
648649
LEFT JOIN

coderd/files/cache.go

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -71,12 +71,12 @@ func (c *Cache) registerMetrics(registerer prometheus.Registerer) *Cache {
7171
Help: "The count of file references currently open in the file cache. Multiple references can be held for the same file.",
7272
})
7373

74-
c.totalOpenFileReferences = f.NewCounter(prometheus.CounterOpts{
74+
c.totalOpenFileReferences = f.NewCounterVec(prometheus.CounterOpts{
7575
Namespace: "coderd",
7676
Subsystem: subsystem,
7777
Name: "open_file_refs_total",
78-
Help: "The total number of file references ever opened in the file cache.",
79-
})
78+
Help: "The total number of file references ever opened in the file cache. The label 'hit' indicates whether the file was already in the cache ('true') or not ('false').",
79+
}, []string{"hit"})
8080

8181
return c
8282
}
@@ -97,7 +97,7 @@ type Cache struct {
9797

9898
type cacheMetrics struct {
9999
currentOpenFileReferences prometheus.Gauge
100-
totalOpenFileReferences prometheus.Counter
100+
totalOpenFileReferences *prometheus.CounterVec
101101

102102
currentOpenFiles prometheus.Gauge
103103
totalOpenedFiles prometheus.Counter
@@ -173,6 +173,7 @@ func (c *Cache) prepare(ctx context.Context, db database.Store, fileID uuid.UUID
173173
c.lock.Lock()
174174
defer c.lock.Unlock()
175175

176+
hitLabel := "true"
176177
entry, ok := c.data[fileID]
177178
if !ok {
178179
value := lazy.NewWithError(func() (CacheEntryValue, error) {
@@ -194,10 +195,11 @@ func (c *Cache) prepare(ctx context.Context, db database.Store, fileID uuid.UUID
194195
c.data[fileID] = entry
195196
c.currentOpenFiles.Inc()
196197
c.totalOpenedFiles.Inc()
198+
hitLabel = "false"
197199
}
198200

199201
c.currentOpenFileReferences.Inc()
200-
c.totalOpenFileReferences.Inc()
202+
c.totalOpenFileReferences.WithLabelValues(hitLabel).Inc()
201203
entry.refCount++
202204
return entry.value
203205
}

coderd/files/cache_test.go

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -161,7 +161,10 @@ func TestConcurrency(t *testing.T) {
161161
require.Equal(t, batches, promhelp.GaugeValue(t, reg, cachePromMetricName("open_files_current"), nil))
162162
require.Equal(t, batches, promhelp.CounterValue(t, reg, cachePromMetricName("open_files_total"), nil))
163163
require.Equal(t, batches*batchSize, promhelp.GaugeValue(t, reg, cachePromMetricName("open_file_refs_current"), nil))
164-
require.Equal(t, batches*batchSize, promhelp.CounterValue(t, reg, cachePromMetricName("open_file_refs_total"), nil))
164+
hit, miss := promhelp.CounterValue(t, reg, cachePromMetricName("open_file_refs_total"), prometheus.Labels{"hit": "false"}),
165+
promhelp.CounterValue(t, reg, cachePromMetricName("open_file_refs_total"), prometheus.Labels{"hit": "true"})
166+
require.Equal(t, batches*batchSize, hit+miss)
167+
165168
}
166169

167170
func TestRelease(t *testing.T) {
@@ -245,7 +248,6 @@ func TestRelease(t *testing.T) {
245248
// Total counts remain
246249
require.Equal(t, batches*fileSize, promhelp.CounterValue(t, reg, cachePromMetricName("open_files_size_bytes_total"), nil))
247250
require.Equal(t, batches, promhelp.CounterValue(t, reg, cachePromMetricName("open_files_total"), nil))
248-
require.Equal(t, batches*batchSize, promhelp.CounterValue(t, reg, cachePromMetricName("open_file_refs_total"), nil))
249251
}
250252

251253
func cacheAuthzSetup(t *testing.T) (database.Store, *files.Cache, *coderdtest.RecordingAuthorizer) {

0 commit comments

Comments
 (0)