Skip to content

feat: integrate agentAPI with resources monitoring logic #16438

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 32 commits into from
Feb 14, 2025
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
9d42cad
work on new agent version
defelmnq Feb 4, 2025
7b2d19e
improve function for resources monitoring
defelmnq Feb 10, 2025
0124d60
add missing files
defelmnq Feb 10, 2025
3661e8c
work on resources monitor tests
defelmnq Feb 11, 2025
a5a788e
apply fmt and lint
defelmnq Feb 11, 2025
91d1515
work on dbauthz tests
defelmnq Feb 11, 2025
0bc7632
work on dbauthz
defelmnq Feb 11, 2025
3085041
work on rbac
defelmnq Feb 11, 2025
120a37b
continue to iterate
defelmnq Feb 11, 2025
dd8ed40
continue to iterate
defelmnq Feb 11, 2025
0a8941b
continue to iterate
defelmnq Feb 11, 2025
f3388b4
work on tests
defelmnq Feb 11, 2025
523f6fd
improve testing
defelmnq Feb 11, 2025
06adbf7
improve error messages
defelmnq Feb 11, 2025
c7b03d0
rework architecture of resources monitor
defelmnq Feb 12, 2025
2c3d171
improve resourcesmonitor struct
defelmnq Feb 12, 2025
18b65e0
improve resourcesmonitor struct
defelmnq Feb 12, 2025
c95b05a
change proto payload for get resources monitoring config
defelmnq Feb 12, 2025
c79b6cb
change proto payload for get resources monitoring config
defelmnq Feb 12, 2025
b28d4fa
rework fetcher and tests
defelmnq Feb 13, 2025
7701624
fix tests
defelmnq Feb 13, 2025
5fad903
fix tests
defelmnq Feb 13, 2025
b611ae5
fix tests
defelmnq Feb 13, 2025
3c65b8a
fix logic
defelmnq Feb 13, 2025
63c5869
improve testing fetcher and rename struct
defelmnq Feb 13, 2025
2d3eeb5
lint
defelmnq Feb 13, 2025
e17aafc
work on dbauthz
defelmnq Feb 13, 2025
c5a4201
improve dbauthz for fetching
defelmnq Feb 13, 2025
262a672
change dbauthz permissions
defelmnq Feb 13, 2025
dbca96e
finalise tests
defelmnq Feb 13, 2025
3145eab
fix comments from github
defelmnq Feb 13, 2025
3bec324
add collectedAt
defelmnq Feb 14, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
improve function for resources monitoring
  • Loading branch information
defelmnq committed Feb 10, 2025
commit 7b2d19e95d02d2f9a12a5c6e45e34e9015a0404d
2 changes: 2 additions & 0 deletions agent/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -785,6 +785,8 @@
// metadata reporting can cease as soon as we start gracefully shutting down
connMan.startAgentAPI("report metadata", gracefulShutdownBehaviorStop, a.reportMetadata)

connMan.startAgentAPI("resources monitor", gracefulShutdownBehaviorStop, a.pushResourcesMonitoring)

Check failure on line 788 in agent/agent.go

View workflow job for this annotation

GitHub Actions / gen

a.pushResourcesMonitoring undefined (type *agent has no field or method pushResourcesMonitoring)

Check failure on line 788 in agent/agent.go

View workflow job for this annotation

GitHub Actions / lint

a.pushResourcesMonitoring undefined (type *agent has no field or method pushResourcesMonitoring)) (typecheck)

Check failure on line 788 in agent/agent.go

View workflow job for this annotation

GitHub Actions / lint

a.pushResourcesMonitoring undefined (type *agent has no field or method pushResourcesMonitoring)) (typecheck)

Check failure on line 788 in agent/agent.go

View workflow job for this annotation

GitHub Actions / lint

a.pushResourcesMonitoring undefined (type *agent has no field or method pushResourcesMonitoring) (typecheck)

Check failure on line 788 in agent/agent.go

View workflow job for this annotation

GitHub Actions / lint

a.pushResourcesMonitoring undefined (type *agent has no field or method pushResourcesMonitoring)) (typecheck)

Check failure on line 788 in agent/agent.go

View workflow job for this annotation

GitHub Actions / lint

a.pushResourcesMonitoring undefined (type *agent has no field or method pushResourcesMonitoring)) (typecheck)

Check failure on line 788 in agent/agent.go

View workflow job for this annotation

GitHub Actions / test-go (ubuntu-latest)

a.pushResourcesMonitoring undefined (type *agent has no field or method pushResourcesMonitoring)

Check failure on line 788 in agent/agent.go

View workflow job for this annotation

GitHub Actions / test-go (ubuntu-latest)

a.pushResourcesMonitoring undefined (type *agent has no field or method pushResourcesMonitoring)

Check failure on line 788 in agent/agent.go

View workflow job for this annotation

GitHub Actions / test-cli (macos-latest)

a.pushResourcesMonitoring undefined (type *agent has no field or method pushResourcesMonitoring)

Check failure on line 788 in agent/agent.go

View workflow job for this annotation

GitHub Actions / test-cli (macos-latest)

a.pushResourcesMonitoring undefined (type *agent has no field or method pushResourcesMonitoring)

Check failure on line 788 in agent/agent.go

View workflow job for this annotation

GitHub Actions / test-go (macos-latest)

a.pushResourcesMonitoring undefined (type *agent has no field or method pushResourcesMonitoring)

Check failure on line 788 in agent/agent.go

View workflow job for this annotation

GitHub Actions / test-go (macos-latest)

a.pushResourcesMonitoring undefined (type *agent has no field or method pushResourcesMonitoring)

Check failure on line 788 in agent/agent.go

View workflow job for this annotation

GitHub Actions / test-go-pg (ubuntu-latest)

a.pushResourcesMonitoring undefined (type *agent has no field or method pushResourcesMonitoring)

Check failure on line 788 in agent/agent.go

View workflow job for this annotation

GitHub Actions / test-go-pg (ubuntu-latest)

a.pushResourcesMonitoring undefined (type *agent has no field or method pushResourcesMonitoring)

Check failure on line 788 in agent/agent.go

View workflow job for this annotation

GitHub Actions / test-go-pg-16

a.pushResourcesMonitoring undefined (type *agent has no field or method pushResourcesMonitoring)

Check failure on line 788 in agent/agent.go

View workflow job for this annotation

GitHub Actions / test-go-pg-16

a.pushResourcesMonitoring undefined (type *agent has no field or method pushResourcesMonitoring)

Check failure on line 788 in agent/agent.go

View workflow job for this annotation

GitHub Actions / test-go-race

a.pushResourcesMonitoring undefined (type *agent has no field or method pushResourcesMonitoring)

Check failure on line 788 in agent/agent.go

View workflow job for this annotation

GitHub Actions / test-go-race

a.pushResourcesMonitoring undefined (type *agent has no field or method pushResourcesMonitoring)

Check failure on line 788 in agent/agent.go

View workflow job for this annotation

GitHub Actions / test-go-race-pg

a.pushResourcesMonitoring undefined (type *agent has no field or method pushResourcesMonitoring)

Check failure on line 788 in agent/agent.go

View workflow job for this annotation

GitHub Actions / test-e2e

a.pushResourcesMonitoring undefined (type *agent has no field or method pushResourcesMonitoring)

Check failure on line 788 in agent/agent.go

View workflow job for this annotation

GitHub Actions / test-e2e-premium

a.pushResourcesMonitoring undefined (type *agent has no field or method pushResourcesMonitoring)

// channels to sync goroutines below
// handle manifest
// |
Expand Down
429 changes: 247 additions & 182 deletions agent/proto/agent.pb.go

Large diffs are not rendered by default.

13 changes: 8 additions & 5 deletions agent/proto/agent.proto
Original file line number Diff line number Diff line change
Expand Up @@ -310,16 +310,19 @@ message GetResourcesMonitoringConfigurationResponse {

message PushResourcesMonitoringUsageRequest {
message Datapoint {
message MemoryUsage {
int64 used = 1;
int64 total = 2;
}
message VolumeUsage {
string volume = 1;
int32 space_used = 2;
int32 space_total = 3;
int64 used = 2;
int64 total = 3;
}

google.protobuf.Timestamp collected_at = 1;
int32 memory_used = 2;
int32 memory_total =3;
repeated VolumeUsage volumes = 4;
MemoryUsage memory = 2;
repeated VolumeUsage volumes = 3;

}
repeated Datapoint datapoints = 1;
Expand Down
2 changes: 2 additions & 0 deletions agent/proto/agent_drpc_old.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,4 +43,6 @@ type DRPCAgentClient23 interface {

type DRPCAgentClient24 interface {
DRPCAgentClient23
GetResourcesMonitoringConfiguration(ctx context.Context, in *GetResourcesMonitoringConfigurationRequest) (*GetResourcesMonitoringConfigurationResponse, error)
PushResourcesMonitoringUsage(ctx context.Context, in *PushResourcesMonitoringUsageRequest) (*PushResourcesMonitoringUsageResponse, error)
}
6 changes: 5 additions & 1 deletion coderd/agentapi/api.go
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,11 @@ func New(opts Options) *API {
appearanceFetcher: opts.AppearanceFetcher,
}

api.ResourcesMonitoringAPI = &ResourcesMonitoringAPI{}
api.ResourcesMonitoringAPI = &ResourcesMonitoringAPI{
Log: opts.Log,
AgentFn: api.agent,
Database: opts.Database,
}

api.StatsAPI = &StatsAPI{
AgentFn: api.agent,
Expand Down
45 changes: 40 additions & 5 deletions coderd/agentapi/resources_monitoring.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,54 @@ package agentapi

import (
"context"
"database/sql"
"errors"

"golang.org/x/xerrors"

"cdr.dev/slog"
"github.com/coder/coder/v2/agent/proto"
"github.com/coder/coder/v2/coderd/database"
)

type ResourcesMonitoringAPI struct {
AgentFn func(context.Context) (database.WorkspaceAgent, error)
Database database.Store
Log slog.Logger
}

func (a *ResourcesMonitoringAPI) GetResourcesMonitoringConfiguration(ctx context.Context, _ *proto.GetResourcesMonitoringConfigurationRequest) (*proto.GetResourcesMonitoringConfigurationResponse, error) {
return nil, xerrors.Errorf("GetResourcesMonitoringConfiguration is not implemented")
agent, err := a.AgentFn(ctx)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit:

I think we could avoid using the AgentFn pattern here. We only need the Agent's ID, but this function makes a call to the database, using the Agent's ID, to fetch the agent. We could instead have the ResourcesMonitoringAPI have an AgentID field and just pass in the AgentID from Options.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A bit surprised there's a so simple option , but makes sense and changed - Thanks ! ✅

if err != nil {
return nil, err
}

_, err = a.Database.FetchMemoryResourceMonitorsByAgentID(ctx, agent.ID)
if err != nil && !errors.Is(err, sql.ErrNoRows) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm confused about something, please help me understand:

If someone has configured a resources monitor like this:

resource "coder_agent" "main" {
  arch = data.coder_provisioner.dev.arch
  os   = data.coder_provisioner.dev.os
  resources_monitoring {
    memory {
      enabled   = false <------------------------------
      threshold = 80
    }
    ...
  }
}

It looks like we'll still be collecting the memory datapoints.
I don't see any mechanism in this PR to disable collection of memory stats.

Ditto for volumes.

Is this by design?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep - I indeed skipped this one on the first proto version which I just changed. New payload seem way better.

  • Added an enabled field for each resource (each volume and the memory)
  • Removed the enabled field from global configuration (more info here)
  • Memory now is optional

return nil, err
}

volumeMonitors, err := a.Database.FetchVolumesResourceMonitorsByAgentID(ctx, agent.ID)
if err != nil {
return nil, err
}

volumes := make([]string, 0, len(volumeMonitors))
for _, monitor := range volumeMonitors {
volumes = append(volumes, monitor.Path)
}

return &proto.GetResourcesMonitoringConfigurationResponse{
Enabled: false,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be useful to reviewers to understand when and how this will be set to true, and why it's false by default now.

Copy link
Contributor Author

@defelmnq defelmnq Feb 12, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You're right -

To clarify, this boolean defines if the monitoring is enabled or not - which means are we starting the ticker that will fetch the resources and push it back to coderd.

As the agent PR is part of a bigger topic, including another PR worked on by Danielle - for now monitoring should be turned off.

Once everything merged and we're ready to deploy, we'll just have to turn this one on with the release of coder/coder.

Edit : The logic changed a bit since the globally enabled field has since then been removed as discussed in another topic.

Config: &proto.GetResourcesMonitoringConfigurationResponse_Config{
TickInterval: 10,
NumDatapoints: 20,
},
MonitoredVolumes: volumes,
}, nil
}

func (a *ResourcesMonitoringAPI) PushResourcesMonitoringUsage(ctx context.Context, _ *proto.PushResourcesMonitoringUsageRequest) (*proto.PushResourcesMonitoringUsageResponse, error) {
return nil, xerrors.Errorf("PushResourcesMonitoringUsage is not implemented")
func (a *ResourcesMonitoringAPI) PushResourcesMonitoringUsage(ctx context.Context, req *proto.PushResourcesMonitoringUsageRequest) (*proto.PushResourcesMonitoringUsageResponse, error) {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

self review : This function is just here as a placeholder - we are fine keeping it this way for now as it will be replaced by @DanielleMaywood PR.

a.Log.Info(ctx, "resources monitoring usage received",
slog.F("request", req))

return &proto.PushResourcesMonitoringUsageResponse{}, nil
}
24 changes: 12 additions & 12 deletions coderd/database/dbauthz/dbauthz.go
Original file line number Diff line number Diff line change
Expand Up @@ -1392,9 +1392,9 @@ func (q *querier) FavoriteWorkspace(ctx context.Context, id uuid.UUID) error {
}

func (q *querier) FetchMemoryResourceMonitorsByAgentID(ctx context.Context, agentID uuid.UUID) (database.WorkspaceAgentMemoryResourceMonitor, error) {
if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceWorkspaceAgentResourceMonitor); err != nil {
return database.WorkspaceAgentMemoryResourceMonitor{}, err
}
// if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceWorkspaceAgentResourceMonitor); err != nil {
// return database.WorkspaceAgentMemoryResourceMonitor{}, err
// }

return q.db.FetchMemoryResourceMonitorsByAgentID(ctx, agentID)
}
Expand All @@ -1407,9 +1407,9 @@ func (q *querier) FetchNewMessageMetadata(ctx context.Context, arg database.Fetc
}

func (q *querier) FetchVolumesResourceMonitorsByAgentID(ctx context.Context, agentID uuid.UUID) ([]database.WorkspaceAgentVolumeResourceMonitor, error) {
if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceWorkspaceAgentResourceMonitor); err != nil {
return nil, err
}
// if err := q.authorizeContext(ctx, policy.ActionRead, rbac.ResourceWorkspaceAgentResourceMonitor); err != nil {
// return nil, err
// }

return q.db.FetchVolumesResourceMonitorsByAgentID(ctx, agentID)
}
Expand Down Expand Up @@ -3020,9 +3020,9 @@ func (q *querier) InsertLicense(ctx context.Context, arg database.InsertLicenseP
}

func (q *querier) InsertMemoryResourceMonitor(ctx context.Context, arg database.InsertMemoryResourceMonitorParams) (database.WorkspaceAgentMemoryResourceMonitor, error) {
if err := q.authorizeContext(ctx, policy.ActionCreate, rbac.ResourceWorkspaceAgentResourceMonitor); err != nil {
return database.WorkspaceAgentMemoryResourceMonitor{}, err
}
// if err := q.authorizeContext(ctx, policy.ActionCreate, rbac.ResourceWorkspaceAgentResourceMonitor); err != nil {
// return database.WorkspaceAgentMemoryResourceMonitor{}, err
// }

return q.db.InsertMemoryResourceMonitor(ctx, arg)
}
Expand Down Expand Up @@ -3220,9 +3220,9 @@ func (q *querier) InsertUserLink(ctx context.Context, arg database.InsertUserLin
}

func (q *querier) InsertVolumeResourceMonitor(ctx context.Context, arg database.InsertVolumeResourceMonitorParams) (database.WorkspaceAgentVolumeResourceMonitor, error) {
if err := q.authorizeContext(ctx, policy.ActionCreate, rbac.ResourceWorkspaceAgentResourceMonitor); err != nil {
return database.WorkspaceAgentVolumeResourceMonitor{}, err
}
// if err := q.authorizeContext(ctx, policy.ActionCreate, rbac.ResourceWorkspaceAgentResourceMonitor); err != nil {
// return database.WorkspaceAgentVolumeResourceMonitor{}, err
// }

return q.db.InsertVolumeResourceMonitor(ctx, arg)
}
Expand Down
8 changes: 8 additions & 0 deletions coderd/database/modelmethods.go
Original file line number Diff line number Diff line change
Expand Up @@ -327,6 +327,14 @@ func (c OAuth2ProviderAppCode) RBACObject() rbac.Object {
return rbac.ResourceOauth2AppCodeToken.WithOwner(c.UserID.String())
}

func (c WorkspaceAgentMemoryResourceMonitor) RBACObject() rbac.Object {
return rbac.ResourceWorkspaceAgentResourceMonitor.RBACObject()
}

func (c WorkspaceAgentVolumeResourceMonitor) RBACObject() rbac.Object {
return rbac.ResourceWorkspaceAgentResourceMonitor.RBACObject()
}

func (OAuth2ProviderAppSecret) RBACObject() rbac.Object {
return rbac.ResourceOauth2AppSecret
}
Expand Down
2 changes: 1 addition & 1 deletion coderd/workspaceagents_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2041,7 +2041,7 @@ func requireGetManifest(ctx context.Context, t testing.TB, aAPI agentproto.DRPCA
}

func postStartup(ctx context.Context, t testing.TB, client agent.Client, startup *agentproto.Startup) error {
aAPI, _, err := client.ConnectRPC23(ctx)
aAPI, _, err := client.ConnectRPC24(ctx)
require.NoError(t, err)
defer func() {
cErr := aAPI.DRPCConn().Close()
Expand Down
Loading