Skip to content

Commit cd110aa

Browse files
authored
Enforce minimum cpu and memory limits (zalando#731)
* add validation for PG resources and volume size * check resource requests also on UPDATE and SYNC + update docs * if cluster was running don't error on sync
1 parent 0628439 commit cd110aa

File tree

8 files changed

+117
-18
lines changed

8 files changed

+117
-18
lines changed

docs/user.md

Lines changed: 32 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ spec:
3030
databases:
3131
foo: zalando
3232
postgresql:
33-
version: "10"
33+
version: "11"
3434
```
3535
3636
Once you cloned the Postgres Operator [repository](https://github.com/zalando/postgres-operator)
@@ -40,6 +40,9 @@ you can find this example also in the manifests folder:
4040
kubectl create -f manifests/minimal-postgres-manifest.yaml
4141
```
4242

43+
Note, that the minimum volume size to run the `postgresql` resource on Elastic
44+
Block Storage (EBS) is `1Gi`.
45+
4346
## Watch pods being created
4447

4548
```bash
@@ -182,6 +185,32 @@ See [infrastructure roles secret](../manifests/infrastructure-roles.yaml)
182185
and [infrastructure roles configmap](../manifests/infrastructure-roles-configmap.yaml)
183186
for the examples.
184187

188+
## Resource definition
189+
190+
The compute resources to be used for the Postgres containers in the pods can be
191+
specified in the postgresql cluster manifest.
192+
193+
```yaml
194+
apiVersion: "acid.zalan.do/v1"
195+
kind: postgresql
196+
metadata:
197+
name: acid-minimal-cluster
198+
spec:
199+
resources:
200+
requests:
201+
cpu: 10m
202+
memory: 100Mi
203+
limits:
204+
cpu: 300m
205+
memory: 300Mi
206+
```
207+
208+
The minimum limit to properly run the `postgresql` resource is `256m` for `cpu`
209+
and `256Mi` for `memory`. If a lower value is set in the manifest the operator
210+
will cancel ADD or UPDATE events on this resource with an error. If no
211+
resources are defined in the manifest the operator will obtain the configured
212+
[default requests](reference/operator_parameters.md#kubernetes-resource-requests).
213+
185214
## Use taints and tolerations for dedicated PostgreSQL nodes
186215

187216
To ensure Postgres pods are running on nodes without any other application pods,
@@ -305,7 +334,7 @@ Things to note:
305334
- There is no way to transform a non-standby cluster to a standby cluster
306335
through the operator. Adding the standby section to the manifest of a running
307336
Postgres cluster will have no effect. However, it can be done through Patroni
308-
by adding the [standby_cluster] (https://github.com/zalando/patroni/blob/bd2c54581abb42a7d3a3da551edf0b8732eefd27/docs/replica_bootstrap.rst#standby-cluster)
337+
by adding the [standby_cluster](https://github.com/zalando/patroni/blob/bd2c54581abb42a7d3a3da551edf0b8732eefd27/docs/replica_bootstrap.rst#standby-cluster)
309338
section using `patronictl edit-config`. Note that the transformed standby
310339
cluster will not be doing any streaming. It will be in standby mode and allow
311340
read-only transactions only.
@@ -384,7 +413,7 @@ specified but globally disabled in the configuration. The
384413

385414
## Increase volume size
386415

387-
PostgreSQL operator supports statefulset volume resize if you're using the
416+
Postgres operator supports statefulset volume resize if you're using the
388417
operator on top of AWS. For that you need to change the size field of the
389418
volume description in the cluster manifest and apply the change:
390419

manifests/standby-manifest.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ spec:
99
size: 1Gi
1010
numberOfInstances: 1
1111
postgresql:
12-
version: "10"
12+
version: "11"
1313
# Make this a standby cluster and provide the s3 bucket path of source cluster for continuous streaming.
1414
standby:
1515
s3_wal_path: "s3://path/to/bucket/containing/wal/of/source/cluster/"

pkg/cluster/cluster.go

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -227,6 +227,10 @@ func (c *Cluster) Create() error {
227227

228228
c.setStatus(acidv1.ClusterStatusCreating)
229229

230+
if err = c.validateResources(&c.Spec); err != nil {
231+
return fmt.Errorf("insufficient resource limits specified: %v", err)
232+
}
233+
230234
for _, role := range []PostgresRole{Master, Replica} {
231235

232236
if c.Endpoints[role] != nil {
@@ -491,6 +495,44 @@ func compareResourcesAssumeFirstNotNil(a *v1.ResourceRequirements, b *v1.Resourc
491495

492496
}
493497

498+
func (c *Cluster) validateResources(spec *acidv1.PostgresSpec) error {
499+
500+
// setting limits too low can cause unnecessary evictions / OOM kills
501+
const (
502+
cpuMinLimit = "256m"
503+
memoryMinLimit = "256Mi"
504+
)
505+
506+
var (
507+
isSmaller bool
508+
err error
509+
)
510+
511+
cpuLimit := spec.Resources.ResourceLimits.CPU
512+
if cpuLimit != "" {
513+
isSmaller, err = util.IsSmallerQuantity(cpuLimit, cpuMinLimit)
514+
if err != nil {
515+
return fmt.Errorf("error validating CPU limit: %v", err)
516+
}
517+
if isSmaller {
518+
return fmt.Errorf("defined CPU limit %s is below required minimum %s to properly run postgresql resource", cpuLimit, cpuMinLimit)
519+
}
520+
}
521+
522+
memoryLimit := spec.Resources.ResourceLimits.Memory
523+
if memoryLimit != "" {
524+
isSmaller, err = util.IsSmallerQuantity(memoryLimit, memoryMinLimit)
525+
if err != nil {
526+
return fmt.Errorf("error validating memory limit: %v", err)
527+
}
528+
if isSmaller {
529+
return fmt.Errorf("defined memory limit %s is below required minimum %s to properly run postgresql resource", memoryLimit, memoryMinLimit)
530+
}
531+
}
532+
533+
return nil
534+
}
535+
494536
// Update changes Kubernetes objects according to the new specification. Unlike the sync case, the missing object
495537
// (i.e. service) is treated as an error
496538
// logical backup cron jobs are an exception: a user-initiated Update can enable a logical backup job
@@ -501,6 +543,7 @@ func (c *Cluster) Update(oldSpec, newSpec *acidv1.Postgresql) error {
501543
c.mu.Lock()
502544
defer c.mu.Unlock()
503545

546+
oldStatus := c.Status
504547
c.setStatus(acidv1.ClusterStatusUpdating)
505548
c.setSpec(newSpec)
506549

@@ -512,6 +555,22 @@ func (c *Cluster) Update(oldSpec, newSpec *acidv1.Postgresql) error {
512555
}
513556
}()
514557

558+
if err := c.validateResources(&newSpec.Spec); err != nil {
559+
err = fmt.Errorf("insufficient resource limits specified: %v", err)
560+
561+
// cancel update only when (already too low) pod resources were edited
562+
// if cluster was successfully running before the update, continue but log a warning
563+
isCPULimitSmaller, err2 := util.IsSmallerQuantity(newSpec.Spec.Resources.ResourceLimits.CPU, oldSpec.Spec.Resources.ResourceLimits.CPU)
564+
isMemoryLimitSmaller, err3 := util.IsSmallerQuantity(newSpec.Spec.Resources.ResourceLimits.Memory, oldSpec.Spec.Resources.ResourceLimits.Memory)
565+
566+
if oldStatus.Running() && !isCPULimitSmaller && !isMemoryLimitSmaller && err2 == nil && err3 == nil {
567+
c.logger.Warning(err)
568+
} else {
569+
updateFailed = true
570+
return err
571+
}
572+
}
573+
515574
if oldSpec.Spec.PgVersion != newSpec.Spec.PgVersion { // PG versions comparison
516575
c.logger.Warningf("postgresql version change(%q -> %q) has no effect", oldSpec.Spec.PgVersion, newSpec.Spec.PgVersion)
517576
//we need that hack to generate statefulset with the old version

pkg/cluster/k8sres.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -741,7 +741,7 @@ func (c *Cluster) generateStatefulSet(spec *acidv1.PostgresSpec) (*appsv1.Statef
741741
limit = c.OpConfig.DefaultMemoryLimit
742742
}
743743

744-
isSmaller, err := util.RequestIsSmallerThanLimit(request, limit)
744+
isSmaller, err := util.IsSmallerQuantity(request, limit)
745745
if err != nil {
746746
return nil, err
747747
}
@@ -768,7 +768,7 @@ func (c *Cluster) generateStatefulSet(spec *acidv1.PostgresSpec) (*appsv1.Statef
768768
limit = c.OpConfig.DefaultMemoryLimit
769769
}
770770

771-
isSmaller, err := util.RequestIsSmallerThanLimit(sidecarRequest, sidecarLimit)
771+
isSmaller, err := util.IsSmallerQuantity(sidecarRequest, sidecarLimit)
772772
if err != nil {
773773
return nil, err
774774
}

pkg/cluster/sync.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ func (c *Cluster) Sync(newSpec *acidv1.Postgresql) error {
2323
c.mu.Lock()
2424
defer c.mu.Unlock()
2525

26+
oldStatus := c.Status
2627
c.setSpec(newSpec)
2728

2829
defer func() {
@@ -34,6 +35,16 @@ func (c *Cluster) Sync(newSpec *acidv1.Postgresql) error {
3435
}
3536
}()
3637

38+
if err = c.validateResources(&c.Spec); err != nil {
39+
err = fmt.Errorf("insufficient resource limits specified: %v", err)
40+
if oldStatus.Running() {
41+
c.logger.Warning(err)
42+
err = nil
43+
} else {
44+
return err
45+
}
46+
}
47+
3748
if err = c.initUsers(); err != nil {
3849
err = fmt.Errorf("could not init users: %v", err)
3950
return err

pkg/controller/controller.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,7 @@ func (c *Controller) initOperatorConfig() {
111111

112112
if c.opConfig.SetMemoryRequestToLimit {
113113

114-
isSmaller, err := util.RequestIsSmallerThanLimit(c.opConfig.DefaultMemoryRequest, c.opConfig.DefaultMemoryLimit)
114+
isSmaller, err := util.IsSmallerQuantity(c.opConfig.DefaultMemoryRequest, c.opConfig.DefaultMemoryLimit)
115115
if err != nil {
116116
panic(err)
117117
}
@@ -120,7 +120,7 @@ func (c *Controller) initOperatorConfig() {
120120
c.opConfig.DefaultMemoryRequest = c.opConfig.DefaultMemoryLimit
121121
}
122122

123-
isSmaller, err = util.RequestIsSmallerThanLimit(c.opConfig.ScalyrMemoryRequest, c.opConfig.ScalyrMemoryLimit)
123+
isSmaller, err = util.IsSmallerQuantity(c.opConfig.ScalyrMemoryRequest, c.opConfig.ScalyrMemoryLimit)
124124
if err != nil {
125125
panic(err)
126126
}

pkg/util/util.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -141,17 +141,17 @@ func Coalesce(val, defaultVal string) string {
141141
return val
142142
}
143143

144-
// RequestIsSmallerThanLimit : ...
145-
func RequestIsSmallerThanLimit(requestStr, limitStr string) (bool, error) {
144+
// IsSmallerQuantity : checks if first resource is of a smaller quantity than the second
145+
func IsSmallerQuantity(requestStr, limitStr string) (bool, error) {
146146

147147
request, err := resource.ParseQuantity(requestStr)
148148
if err != nil {
149-
return false, fmt.Errorf("could not parse memory request %v : %v", requestStr, err)
149+
return false, fmt.Errorf("could not parse request %v : %v", requestStr, err)
150150
}
151151

152152
limit, err2 := resource.ParseQuantity(limitStr)
153153
if err2 != nil {
154-
return false, fmt.Errorf("could not parse memory limit %v : %v", limitStr, err2)
154+
return false, fmt.Errorf("could not parse limit %v : %v", limitStr, err2)
155155
}
156156

157157
return request.Cmp(limit) == -1, nil

pkg/util/util_test.go

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ var substringMatch = []struct {
6969
{regexp.MustCompile(`aaaa (\d+) bbbb`), "aaaa 123 bbbb", nil},
7070
}
7171

72-
var requestIsSmallerThanLimitTests = []struct {
72+
var requestIsSmallerQuantityTests = []struct {
7373
request string
7474
limit string
7575
out bool
@@ -155,14 +155,14 @@ func TestMapContains(t *testing.T) {
155155
}
156156
}
157157

158-
func TestRequestIsSmallerThanLimit(t *testing.T) {
159-
for _, tt := range requestIsSmallerThanLimitTests {
160-
res, err := RequestIsSmallerThanLimit(tt.request, tt.limit)
158+
func TestIsSmallerQuantity(t *testing.T) {
159+
for _, tt := range requestIsSmallerQuantityTests {
160+
res, err := IsSmallerQuantity(tt.request, tt.limit)
161161
if err != nil {
162-
t.Errorf("RequestIsSmallerThanLimit returned unexpected error: %#v", err)
162+
t.Errorf("IsSmallerQuantity returned unexpected error: %#v", err)
163163
}
164164
if res != tt.out {
165-
t.Errorf("RequestIsSmallerThanLimit expected: %#v, got: %#v", tt.out, res)
165+
t.Errorf("IsSmallerQuantity expected: %#v, got: %#v", tt.out, res)
166166
}
167167
}
168168
}

0 commit comments

Comments
 (0)