Skip to content

chore: fix concurrent CommitQuota transactions for unrelated users/orgs #15261

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 19 commits into from
Nov 1, 2024
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 20 additions & 5 deletions coderd/database/db.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,13 +48,26 @@ type DBTX interface {
GetContext(ctx context.Context, dest interface{}, query string, args ...interface{}) error
}

// WithSerialRetryCount returns an option for New that sets the querier's
// serialRetryCount, i.e. how many times a transaction is attempted when it
// fails with a serialization error.
func WithSerialRetryCount(count int) func(*sqlQuerier) {
	setRetryCount := func(querier *sqlQuerier) {
		querier.serialRetryCount = count
	}
	return setRetryCount
}

// New creates a new database store using a SQL database connection.
// Each opt is applied, in order, after the defaults have been set, so an
// option such as WithSerialRetryCount overrides the default value.
func New(sdb *sql.DB) Store {
func New(sdb *sql.DB, opts ...func(*sqlQuerier)) Store {
dbx := sqlx.NewDb(sdb, "postgres")
return &sqlQuerier{
q := &sqlQuerier{
db: dbx,
sdb: dbx,
// Default retry count for serialization failures. This is an arbitrary number.
serialRetryCount: 3,
}

// Apply caller-supplied options on top of the defaults above.
for _, opt := range opts {
opt(q)
}
return q
}

// TxOptions is used to pass some execution metadata to the callers.
Expand Down Expand Up @@ -104,6 +117,10 @@ type querier interface {
// sqlQuerier is the Store implementation returned by New.
type sqlQuerier struct {
// sdb is the sqlx database handle created by New.
sdb *sqlx.DB
// db is the DBTX handle; New initializes it with the same sqlx database as sdb.
db DBTX

// serialRetryCount is the number of times to retry a transaction
// if it fails with a serialization error.
serialRetryCount int
}

func (*sqlQuerier) Wrappers() []string {
Expand Down Expand Up @@ -143,11 +160,9 @@ func (q *sqlQuerier) InTx(function func(Store) error, txOpts *TxOptions) error {
// If we are in a transaction already, the parent InTx call will handle the retry.
// We do not want to duplicate those retries.
if !inTx && sqlOpts.Isolation == sql.LevelSerializable {
// This is an arbitrarily chosen number.
const retryAmount = 3
var err error
attempts := 0
for attempts = 0; attempts < retryAmount; attempts++ {
for attempts = 0; attempts < q.serialRetryCount; attempts++ {
txOpts.executionCount++
err = q.runTx(function, sqlOpts)
if err == nil {
Expand Down
127 changes: 127 additions & 0 deletions coderd/database/dbfake/builder.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
package dbfake

import (
"testing"

"github.com/google/uuid"
"github.com/stretchr/testify/require"

"github.com/coder/coder/v2/coderd/database"
"github.com/coder/coder/v2/coderd/database/dbauthz"
"github.com/coder/coder/v2/coderd/database/dbgen"
"github.com/coder/coder/v2/coderd/database/dbtime"
"github.com/coder/coder/v2/testutil"
)

// OrganizationBuilder accumulates the settings that Do uses to create an
// organization, its all-users group, its members and its groups in a test
// database.
type OrganizationBuilder struct {
// t and db are the test handle and store passed to Organization.
t *testing.T
db database.Store
// seed is handed to dbgen.Organization when Do creates the organization.
seed database.Organization
// allUsersAllowance, when greater than zero, is written as the
// QuotaAllowance of the all-users group.
allUsersAllowance int32
// members holds the IDs of the users Do inserts as organization members.
members []uuid.UUID
// groups maps each group seed to the IDs of the users Do adds to that group.
groups map[database.Group][]uuid.UUID
}

// Organization returns an OrganizationBuilder bound to the given test and
// store, with an empty member list and an empty group map. Call Do on the
// result to insert the organization.
func Organization(t *testing.T, db database.Store) OrganizationBuilder {
	var builder OrganizationBuilder
	builder.t = t
	builder.db = db
	builder.members = make([]uuid.UUID, 0)
	builder.groups = map[database.Group][]uuid.UUID{}
	return builder
}

// OrganizationResponse is the result of OrganizationBuilder.Do.
type OrganizationResponse struct {
// Org is the organization created by Do.
Org database.Organization
// AllUsersGroup is the group returned by InsertAllUsersGroup, or by
// UpdateGroupByID when an allowance was configured.
AllUsersGroup database.Group
// Members are the organization members inserted by Do.
Members []database.OrganizationMember
// Groups are the groups inserted by Do. They are collected while ranging
// over a map, so their order is not deterministic.
Groups []database.Group
}

// EveryoneAllowance returns a copy of the builder whose all-users group
// allowance is set to allowance (converted to int32). Do only applies the
// allowance when it is greater than zero.
func (b OrganizationBuilder) EveryoneAllowance(allowance int) OrganizationBuilder {
	updated := b
	updated.allUsersAllowance = int32(allowance)
	return updated
}

// Seed returns a copy of the builder that uses seed as the organization
// passed to dbgen.Organization in Do.
func (b OrganizationBuilder) Seed(seed database.Organization) OrganizationBuilder {
	updated := b
	updated.seed = seed
	return updated
}

// Members appends the IDs of the given users to the builder's member list
// and returns the updated builder. Do inserts one organization member per ID.
func (b OrganizationBuilder) Members(users ...database.User) OrganizationBuilder {
	for i := range users {
		//nolint: revive // returns modified struct
		b.members = append(b.members, users[i].ID)
	}
	return b
}

// Group registers a group seed together with the IDs of the users that
// should belong to it, and returns the builder. Registering the same seed
// again replaces its previous member list. The entry is stored in the
// builder's groups map.
func (b OrganizationBuilder) Group(seed database.Group, members ...database.User) OrganizationBuilder {
	userIDs := make([]uuid.UUID, 0, len(members))
	for i := range members {
		userIDs = append(userIDs, members[i].ID)
	}
	//nolint: revive // returns modified struct
	b.groups[seed] = userIDs
	return b
}

// Do inserts everything the builder describes and returns the created rows:
//
//  1. the organization, generated from the builder's seed;
//  2. the organization's all-users group, whose quota allowance is updated
//     when a positive allowance was configured;
//  3. one organization member per registered user ID;
//  4. every registered group along with its group members.
//
// Any database error fails the test through require.NoError.
func (b OrganizationBuilder) Do() OrganizationResponse {
	org := dbgen.Organization(b.t, b.db, b.seed)

	baseCtx := testutil.Context(b.t, testutil.WaitShort)
	//nolint:gocritic // builder code needs perms
	ctx := dbauthz.AsSystemRestricted(baseCtx)

	allUsers, err := b.db.InsertAllUsersGroup(ctx, org.ID)
	require.NoError(b.t, err)

	// Only touch the group again when an allowance was requested.
	if b.allUsersAllowance > 0 {
		update := database.UpdateGroupByIDParams{
			Name:           allUsers.Name,
			DisplayName:    allUsers.DisplayName,
			AvatarURL:      allUsers.AvatarURL,
			QuotaAllowance: b.allUsersAllowance,
			ID:             allUsers.ID,
		}
		allUsers, err = b.db.UpdateGroupByID(ctx, update)
		require.NoError(b.t, err)
	}

	orgMembers := make([]database.OrganizationMember, 0, len(b.members))
	for _, userID := range b.members {
		orgMembers = append(orgMembers, dbgen.OrganizationMember(b.t, b.db, database.OrganizationMember{
			UserID:         userID,
			OrganizationID: org.ID,
			CreatedAt:      dbtime.Now(),
			UpdatedAt:      dbtime.Now(),
			Roles:          nil,
		}))
	}

	createdGroups := make([]database.Group, 0, len(b.groups))
	for groupSeed, userIDs := range b.groups {
		// Every group is created inside the organization built above.
		groupSeed.OrganizationID = org.ID
		created := dbgen.Group(b.t, b.db, groupSeed)
		createdGroups = append(createdGroups, created)

		for _, userID := range userIDs {
			dbgen.GroupMember(b.t, b.db, database.GroupMemberTable{
				UserID:  userID,
				GroupID: created.ID,
			})
		}
	}

	return OrganizationResponse{
		Org:           org,
		AllUsersGroup: allUsers,
		Members:       orgMembers,
		Groups:        createdGroups,
	}
}
2 changes: 2 additions & 0 deletions coderd/database/dbgen/dbgen.go
Original file line number Diff line number Diff line change
Expand Up @@ -407,6 +407,8 @@ func OrganizationMember(t testing.TB, db database.Store, orig database.Organizat
}

func Group(t testing.TB, db database.Store, orig database.Group) database.Group {
t.Helper()

name := takeFirst(orig.Name, testutil.GetRandomName(t))
group, err := db.InsertGroup(genCtx, database.InsertGroupParams{
ID: takeFirst(orig.ID, uuid.New()),
Expand Down
3 changes: 2 additions & 1 deletion coderd/database/dbtestutil/db.go
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,8 @@ func NewDB(t testing.TB, opts ...Option) (database.Store, pubsub.Pubsub) {
if o.dumpOnFailure {
t.Cleanup(func() { DumpOnFailure(t, connectionURL) })
}
db = database.New(sqlDB)
// Unit tests should not retry serial transaction failures.
db = database.New(sqlDB, database.WithSerialRetryCount(1))

ps, err = pubsub.New(context.Background(), o.logger, sqlDB, connectionURL)
require.NoError(t, err)
Expand Down
73 changes: 73 additions & 0 deletions coderd/database/dbtestutil/tx.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
package dbtestutil

import (
"sync"
"testing"

"github.com/stretchr/testify/assert"
"golang.org/x/xerrors"

"github.com/coder/coder/v2/coderd/database"
)

// DBTx is a handle to a transaction started by StartTx. Queries issued
// through the embedded Store go to the store that InTx handed to its
// callback.
type DBTx struct {
// Store is the store passed to the InTx callback.
database.Store
// mu is held for the duration of Done.
mu sync.Mutex
// done is closed by Done to let the InTx callback return.
done chan error
// finalErr carries the error returned by InTx back to Done.
finalErr chan error
}

// StartTx starts a transaction and returns a DBTx object. This allows running
// 2 transactions concurrently in a test more easily.
// Example:
//
//	a := StartTx(t, db, opts)
//	b := StartTx(t, db, opts)
//
//	a.GetUsers(...)
//	b.GetUsers(...)
//
//	require.NoError(t, a.Done())
//
// The transaction stays open until Done is called on the returned DBTx.
// If InTx returns before it ever invokes the transaction function (for
// example because the transaction could not be started), the test is failed
// immediately instead of blocking forever. Because that path calls t.Fatalf,
// StartTx must be called from the goroutine running the test.
func StartTx(t *testing.T, db database.Store, opts *database.TxOptions) *DBTx {
	t.Helper()

	done := make(chan error)
	finalErr := make(chan error)
	txC := make(chan database.Store)

	go func() {
		once := sync.Once{}
		count := 0

		err := db.InTx(func(store database.Store) error {
			// InTx can be retried, but the store is only handed out once.
			once.Do(func() {
				txC <- store
			})
			count++
			if count > 1 {
				// If you recursively call InTx, then don't use this.
				t.Logf("InTx called more than once: %d", count)
				assert.NoError(t, xerrors.New("InTx called more than once, this is not allowed with the StartTx helper"))
			}

			// Keep the transaction open until the caller invokes Done.
			<-done
			// Just return nil. The caller should be checking their own errors.
			return nil
		}, opts)
		finalErr <- err
	}()

	select {
	case txStore := <-txC:
		close(txC)
		return &DBTx{Store: txStore, done: done, finalErr: finalErr}
	case err := <-finalErr:
		// InTx finished without ever running the transaction function, so
		// no store will arrive on txC. Fail now rather than hang the test.
		t.Fatalf("StartTx: InTx returned before running the transaction function: %v", err)
		return nil
	}
}

// Done closes the done channel, which lets the function passed to InTx
// return, then waits for and returns the error InTx produced.
//
// Done can only be called once. If you call it twice, it will panic.
func (tx *DBTx) Done() error {
	tx.mu.Lock()
	defer tx.mu.Unlock()

	close(tx.done)
	txErr := <-tx.finalErr
	return txErr
}
3 changes: 3 additions & 0 deletions coderd/database/querier.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

25 changes: 18 additions & 7 deletions coderd/database/queries.sql.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

25 changes: 18 additions & 7 deletions coderd/database/queries/quotas.sql
Original file line number Diff line number Diff line change
Expand Up @@ -18,23 +18,34 @@ INNER JOIN groups ON
WITH latest_builds AS (
SELECT
DISTINCT ON
(workspace_id) id,
workspace_id,
daily_cost
(wb.workspace_id) wb.id,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think the build ID is needed.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I didn't try and change the query too much beyond the seq scan.

It looks like it was never used at all:

-- name: GetQuotaConsumedForUser :one
WITH latest_builds AS (
SELECT
DISTINCT ON
(workspace_id) id,
workspace_id,
daily_cost
FROM
workspace_builds wb
ORDER BY
workspace_id,
created_at DESC
)
SELECT
coalesce(SUM(daily_cost), 0)::BIGINT
FROM
workspaces
JOIN latest_builds ON
latest_builds.workspace_id = workspaces.id
WHERE NOT deleted AND workspaces.owner_id = $1;

wb.workspace_id,
wb.daily_cost
FROM
workspace_builds wb
-- This INNER JOIN prevents a seq scan of the workspace_builds table.
-- Limit the rows to the absolute minimum required, which is all workspaces
-- in a given organization for a given user.
INNER JOIN
workspaces on wb.workspace_id = workspaces.id
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, so in the query plan you have now, it's doing this nested loop where it finds the workspace IDs, then for each workspace ID it's doing this bitmap query:

                          ->  Bitmap Heap Scan on workspace_builds wb  (cost=4.16..9.50 rows=2 width=28)
                                Recheck Cond: (workspace_id = workspaces_1.id)
                                ->  Bitmap Index Scan on workspace_builds_workspace_id_build_number_key  (cost=0.00..4.16 rows=2 width=0)
                                      Index Cond: (workspace_id = workspaces_1.id)

That is, it first scans the index to find the pages to load, then scans the pages with the Recheck Cond.

Do you know whether this results in page locks for the transaction, or tuple locks (I'm assuming these are row-level locks)? Page locks have a greater likelihood of catching unrelated transactions.

And, some suggestions:

  1. Can we just move the daily_cost directly to the workspace table? We only ever do computations with the most recent cost. If we really needed to keep it on the workspace_build for compatibility or querying history, we could put it in both places, and have this query only look at workspaces. That would remove a join as well. If we built an index of (org_id, owner_id) then we'd also be very unlikely to ever need to Seq scan the workspaces table for quotas.

  2. If we don't want to go that far, we could add an index ON workspace_builds (workspace_id, build_number, daily_cost) that would allow the quota query to compute the results right from the index, so it'd never have to do a bitmap scan and read whole pages.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The particular locks acquired during execution of a query will depend on the plan used by the query, and multiple finer-grained locks (e.g., tuple locks) may be combined into fewer coarser-grained locks (e.g., page locks) during the course of the transaction to prevent exhaustion of the memory used to track the locks.

I was under the impression it was row-level locks, but I admit that was a result of my experiments with very few rows. The docs seem to imply it could take out larger locks.

Can we just move the daily_cost directly to the workspace table? We only ever do computations with the most recent cost. If we really needed to keep it on the workspace_build for compatibility or querying history, we could put it in both places, and have this query only look at workspaces. That would remove a join as well. If we built an index of (org_id, owner_id) then we'd also be very unlikely to ever need to Seq scan the workspaces table for quotas.

Originally there was a suspicion we were getting this error from different transactions interfering with CommitQuota. But the error reported can only occur between 2 serializable transactions, meaning it is CommitQuota interfering with itself.

I thought about moving daily_cost to its own table entirely, but I'm not sure that would actually improve much.

I think this index + sorting by build number would be a large win: workspace_builds (workspace_id, build_number, daily_cost)

As I look at this query more, I don't see why we need to inner join workspaces again. All the information can be pulled from the latest_build subquery.

If we add the index (org_id, owner_id) on workspaces as well that is another win here.

According to the quote I pulled though, is the goal just to reduce the number of rows touched to prevent the lock from having to go more "coarse" to the page lock? The text implies the behavior depends on the memory availability.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree that we don't need to join to workspaces twice.

And yeah, my understanding is that with page locks, updating quota for an unrelated user could cause a serialization error if the builds happen to share a page. So it's undesirable to use page locks if we could get away with finer grained locking.

I realize that postgres can use page locks for memory reasons, and maybe there isn't anything we can do about that, but I'm also wondering whether it automatically uses page locks when it does a bitmap query, rather than tuple locks if we were able to use the index.

WHERE
workspaces.owner_id = @owner_id AND
workspaces.organization_id = @organization_id
ORDER BY
workspace_id,
created_at DESC
wb.workspace_id,
wb.created_at DESC
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It might be faster to use wb.build_number, since the query planner has access to the workspace_builds_workspace_id_build_number_key index. That could be why it's having to do the

              ->  Sort  (cost=22.66..22.66 rows=2 width=44)
"                    Sort Key: wb.workspace_id, wb.created_at DESC"

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wondered about that too. I had a specific goal and did not want to start mutating the query too much to solicit feedback on different things.

Can I throw this up in another PR?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah, that works

)
SELECT
coalesce(SUM(daily_cost), 0)::BIGINT
FROM
workspaces
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like it's still doing a Seq scan on workspaces

                          ->  Seq Scan on workspaces workspaces_1  (cost=0.00..13.12 rows=1 width=16)
                                Filter: (owner_id = 'b4ed5c8a-725e-482d-b5a7-368a1dd7cd77'::uuid)

Doesn't that mean any update to the workspaces table will still cause a serialization error?

However! I think the query planner takes into account cardinality of the columns relative to the rows in the table, so it might make a different plan in a "real" deployment.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, we still do a seq scan on the workspaces.

To guarantee true serializability PostgreSQL uses predicate locking, which means that it keeps locks which allow it to determine when a write would have had an impact on the result of a previous read from a concurrent transaction, had it run first.

The serializable lock only affects tables (rows) that are written to. The CommitQuota transaction only writes to workspace_builds. I have a test case that writes to the workspaces table, and it does not cause a serialization error.

Copy link
Contributor

@spikecurtis spikecurtis Oct 31, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How do you square this with the observation that a Seq Scan on workspace_builds caused a serialization error? I think the PG docs say Seq Scan locks the entire relation.

Maybe it depends on the number of workspaces and the assumed cardinality.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Experimentally, I am running this:

  1. Begin TX
  2. GetQuotaConsumedForUser(user, org)
  3. UpdateWorkspaceBuildCostByID(workspace, 10)

The TX mode that is important here is SIReadLock.

Predicate locks in PostgreSQL, like in most other database systems, are based on data actually accessed by a transaction. These will show up in the pg_locks system view with a mode of SIReadLock.

What I see my change doing is GetQuotaConsumedForUser used to do:

992-<nil> [granted] workspace_builds/relation/SIReadLock: 
992-<nil> [granted] workspace_builds_pkey/page/SIReadLock: page=1
992-<nil> [granted] workspaces/relation/SIReadLock: 
992-<nil> [granted] workspaces_owner_id_lower_idx/page/SIReadLock: page=1

And now does:

929-<nil> [granted] workspace_builds_pkey/page/SIReadLock: page=1
929-<nil> [granted] workspace_builds_workspace_id_build_number_key/page/SIReadLock: page=1
929-<nil> [granted] workspaces/relation/SIReadLock: 
929-<nil> [granted] workspaces_owner_id_lower_idx/page/SIReadLock: page=1

So we went from a full relation lock on the table to one on the pkey (was there before too) and build number.

The workspaces table still has the same locks, and would be improved by your index suggestions. For this specific error mode, we insert into the workspace_builds table though, so I was honestly not even looking at other tables at this point.

With my query changes raw
930-<nil> [granted] <nil>/virtualxid/ExclusiveLock: waiting to acquire virtual tx id lock
929- 3848 [granted] <nil>/transactionid/ExclusiveLock: ???
929-<nil> [granted] <nil>/virtualxid/ExclusiveLock: waiting to acquire virtual tx id lock
930-<nil> [granted] pg_locks/relation/AccessShareLock: 
929-<nil> [granted] workspace_builds/relation/AccessShareLock: 
929-<nil> [granted] workspace_builds/relation/RowExclusiveLock: 
929-<nil> [granted] workspace_builds_job_id_key/relation/AccessShareLock: 
929-<nil> [granted] workspace_builds_job_id_key/relation/RowExclusiveLock: 
929-<nil> [granted] workspace_builds_pkey/relation/RowExclusiveLock: 
929-<nil> [granted] workspace_builds_pkey/relation/AccessShareLock: 
929-<nil> [granted] workspace_builds_pkey/page/SIReadLock: page=1
929-<nil> [granted] workspace_builds_workspace_id_build_number_key/relation/RowExclusiveLock: 
929-<nil> [granted] workspace_builds_workspace_id_build_number_key/relation/AccessShareLock: 
929-<nil> [granted] workspace_builds_workspace_id_build_number_key/page/SIReadLock: page=1
929-<nil> [granted] workspaces/relation/AccessShareLock: 
929-<nil> [granted] workspaces/relation/SIReadLock: 
929-<nil> [granted] workspaces_owner_id_lower_idx/relation/AccessShareLock: 
929-<nil> [granted] workspaces_owner_id_lower_idx/page/SIReadLock: page=1
929-<nil> [granted] workspaces_pkey/relation/AccessShareLock: 
Before my changes raw
993-<nil> [granted] <nil>/virtualxid/ExclusiveLock: waiting to acquire virtual tx id lock
992- 4039 [granted] <nil>/transactionid/ExclusiveLock: ???
992-<nil> [granted] <nil>/virtualxid/ExclusiveLock: waiting to acquire virtual tx id lock
993-<nil> [granted] pg_locks/relation/AccessShareLock: 
992-<nil> [granted] workspace_builds/relation/AccessShareLock: 
992-<nil> [granted] workspace_builds/relation/SIReadLock: 
992-<nil> [granted] workspace_builds/relation/RowExclusiveLock: 
992-<nil> [granted] workspace_builds_job_id_key/relation/AccessShareLock: 
992-<nil> [granted] workspace_builds_job_id_key/relation/RowExclusiveLock: 
992-<nil> [granted] workspace_builds_pkey/relation/RowExclusiveLock: 
992-<nil> [granted] workspace_builds_pkey/relation/AccessShareLock: 
992-<nil> [granted] workspace_builds_pkey/page/SIReadLock: page=1
992-<nil> [granted] workspace_builds_workspace_id_build_number_key/relation/RowExclusiveLock: 
992-<nil> [granted] workspace_builds_workspace_id_build_number_key/relation/AccessShareLock: 
992-<nil> [granted] workspaces/relation/AccessShareLock: 
992-<nil> [granted] workspaces/relation/SIReadLock: 
992-<nil> [granted] workspaces_owner_id_lower_idx/relation/AccessShareLock: 
992-<nil> [granted] workspaces_owner_id_lower_idx/page/SIReadLock: page=1
992-<nil> [granted] workspaces_pkey/relation/AccessShareLock: 

Disclaimer: I recognize the docs suggest these lock patterns can change with table size and memory availability. This is a very small dataset.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Interesting that serializable transactions only interfere with other serializable transactions --- OK for now, but I feel like this puts us on shaky ground.

Today we insert new builds with repeatable read isolation, but do not set the daily_cost. If we ever started setting the cost but didn't up the isolation to serializable, we'd start seeing quota anomalies.

Or, if we ever introduce a new update query on workspaces that runs at serializable isolation, it will start interfering with every quota transaction.

Not necessarily for this PR, but I think we should still aim to remove the Seq Scan on workspaces.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Or, if we ever introduce a new update query on workspaces that runs at serializable isolation, it will start interfering with every quota transaction.

Yes.

Not necessarily for this PR, but I think we should still aim to remove the Seq Scan on workspaces.

I agree. I will make a follow-up PR (or issue if it takes too long) to address all these extra concerns.

In this PR I did disable retries, so a unit test is more likely to fail if we hit this (would be a valid flake).


I do not see a way to implement forward-thinking protections if someone adds a new serializable TX. So you are correct, a change in the future can break things further, especially because of the seq scan on the workspaces table.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Interesting that serializable transactions only interfere with other serializable transactions --- OK for now, but I feel like this puts us on shaky ground.

Just to clarify, I specifically meant the error pq: could not serialize access due to read/write dependencies among transactions, which was reported.

There is another error you can get that is not between 2 serializable transactions.

pq: could not serialize access due to concurrent update

JOIN latest_builds ON
latest_builds.workspace_id = workspaces.id
WHERE NOT
deleted AND
WHERE
NOT deleted AND
-- We can likely remove these conditions since we check above.
-- But it does not hurt to be defensive and make sure future query changes
-- do not break anything.
workspaces.owner_id = @owner_id AND
workspaces.organization_id = @organization_id
;
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the fix. Everything else is just tests

Loading
Loading