Skip to content

Commit 40a91c5

Browse files
benjaminjbtjmoore4
authored and committed
Update exporter dashboards (CrunchyData#158)
Issue: [sc-15707]
1 parent 1b921f7 commit 40a91c5

12 files changed

+67
-29
lines changed

kustomize/monitoring/alertmanager-config.yaml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,7 @@ data:
33
alertmanager.yml: |
44
###
55
#
6-
# Copyright 2017-2022 Crunchy Data Solutions, Inc. All Rights Reserved.
6+
# Copyright © 2017-2022 Crunchy Data Solutions, Inc. All Rights Reserved.
77
#
88
###
99
@@ -31,7 +31,7 @@ data:
3131
receivers:
3232
- name: 'default-receiver'
3333
email_configs:
34-
- to: 'example@yourcompany.com'
34+
- to: 'example@crunchydata.com'
3535
send_resolved: true
3636
3737
## Examples of alternative alert receivers. See documentation for more info on how to configure these fully

kustomize/monitoring/alertmanager-rules-config.yaml

Lines changed: 48 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -23,6 +23,19 @@ data:
2323
summary: 'Postgres Exporter running on {{ $labels.job }} (instance: {{ $labels.instance }}) is encountering scrape errors processing queries. Error count: ( {{ $value }} )'
2424
2525
26+
########## SYSTEM RULES ##########
27+
- alert: ExporterDown
28+
expr: avg_over_time(up[5m]) < 0.5
29+
for: 10s
30+
labels:
31+
service: system
32+
severity: critical
33+
severity_num: 300
34+
annotations:
35+
description: 'Metrics exporter service for {{ $labels.job }} running on {{ $labels.instance }} has been down at least 50% of the time for the last 5 minutes. Service may be flapping or down.'
36+
summary: 'Prometheus Exporter Service Down'
37+
38+
2639
########## POSTGRESQL RULES ##########
2740
- alert: PGIsUp
2841
expr: pg_up < 1
@@ -173,6 +186,27 @@ data:
173186
description: '{{ $labels.job }} is using 90% or more of available connections ({{ $value }}%)'
174187
summary: 'PGSQL Instance connections'
175188
189+
- alert: DiskFillPredict
190+
expr: predict_linear(ccp_nodemx_data_disk_available_bytes{mount_point!~"tmpfs"}[1h], 24 * 3600) < 0 and 100 * ((ccp_nodemx_data_disk_total_bytes - ccp_nodemx_data_disk_available_bytes) / ccp_nodemx_data_disk_total_bytes) > 70
191+
for: 5m
192+
labels:
193+
service: postgresql
194+
severity: warning
195+
severity_num: 200
196+
annotations:
197+
summary: 'Disk predicted to be full in 24 hours'
198+
description: 'Disk on {{ $labels.pg_cluster }}:{{ $labels.kubernetes_pod_name }} is predicted to fill in 24 hrs based on current usage'
199+
200+
- alert: PGClusterRoleChange
201+
expr: count by (pg_cluster) (ccp_is_in_recovery_status != ignoring(instance,ip,pod,role) (ccp_is_in_recovery_status offset 5m)) >= 1
202+
for: 60s
203+
labels:
204+
service: postgresql
205+
severity: critical
206+
severity_num: 300
207+
annotations:
208+
summary: '{{ $labels.pg_cluster }} has had a switchover/failover event. Please check this cluster for more details'
209+
176210
- alert: PGDiskSize
177211
expr: 100 * ((ccp_nodemx_data_disk_total_bytes - ccp_nodemx_data_disk_available_bytes) / ccp_nodemx_data_disk_total_bytes) > 75
178212
for: 60s
@@ -196,7 +230,7 @@ data:
196230
summary: 'PGSQL Instance size critical'
197231
198232
- alert: PGReplicationByteLag
199-
expr: ccp_replication_status_byte_lag > 5.24288e+07
233+
expr: ccp_replication_lag_size_bytes > 5.24288e+07
200234
for: 60s
201235
labels:
202236
service: postgresql
@@ -207,7 +241,7 @@ data:
207241
summary: 'PGSQL Instance replica lag warning'
208242
209243
- alert: PGReplicationByteLag
210-
expr: ccp_replication_status_byte_lag > 1.048576e+08
244+
expr: ccp_replication_lag_size_bytes > 1.048576e+08
211245
for: 60s
212246
labels:
213247
service: postgresql
@@ -313,12 +347,15 @@ data:
313347
# Otherwise rule will be applied to all stanzas returned on target system if not set.
314348
#
315349
# Relevant metric names are:
316-
# ccp_backrest_last_full_time_since_completion_seconds
317-
# ccp_backrest_last_incr_time_since_completion_seconds
318-
# ccp_backrest_last_diff_time_since_completion_seconds
350+
# ccp_backrest_last_full_backup_time_since_completion_seconds
351+
# ccp_backrest_last_incr_backup_time_since_completion_seconds
352+
# ccp_backrest_last_diff_backup_time_since_completion_seconds
353+
#
354+
# To avoid false positives on backup time alerts, 12 hours are added onto each threshold to allow a buffer if the backup runtime varies from day to day.
355+
# Further adjustment may be needed depending on your backup runtimes/schedule.
319356
#
320357
# - alert: PGBackRestLastCompletedFull_main
321-
# expr: ccp_backrest_last_full_backup_time_since_completion_seconds{stanza="main"} > 604800
358+
# expr: ccp_backrest_last_full_backup_time_since_completion_seconds{stanza="main"} > 648000
322359
# for: 60s
323360
# labels:
324361
# service: postgresql
@@ -328,7 +365,7 @@ data:
328365
# summary: 'Full backup for stanza [main] on system {{ $labels.job }} has not completed in the last week.'
329366
#
330367
# - alert: PGBackRestLastCompletedIncr_main
331-
# expr: ccp_backrest_last_incr_backup_time_since_completion_seconds{stanza="main"} > 86400
368+
# expr: ccp_backrest_last_incr_backup_time_since_completion_seconds{stanza="main"} > 129600
332369
# for: 60s
333370
# labels:
334371
# service: postgresql
@@ -340,14 +377,14 @@ data:
340377
#
341378
# Runtime monitoring is handled with a single metric:
342379
#
343-
# ccp_backrest_last_runtime_backup_runtime_seconds
380+
# ccp_backrest_last_info_backup_runtime_seconds
344381
#
345382
# Runtime monitoring should have the "backup_type" label set.
346383
# Otherwise the rule will apply to the last run of all backup types returned (full, diff, incr)
347384
# Stanza should also be set if runtimes per stanza have different expected times
348385
#
349386
# - alert: PGBackRestLastRuntimeFull_main
350-
# expr: ccp_backrest_last_runtime_backup_runtime_seconds{backup_type="full", stanza="main"} > 14400
387+
# expr: ccp_backrest_last_info_backup_runtime_seconds{backup_type="full", stanza="main"} > 14400
351388
# for: 60s
352389
# labels:
353390
# service: postgresql
@@ -357,7 +394,7 @@ data:
357394
# summary: 'Expected runtime of full backup for stanza [main] has exceeded 4 hours'
358395
#
359396
# - alert: PGBackRestLastRuntimeDiff_main
360-
# expr: ccp_backrest_last_runtime_backup_runtime_seconds{backup_type="diff", stanza="main"} > 3600
397+
# expr: ccp_backrest_last_info_backup_runtime_seconds{backup_type="diff", stanza="main"} > 3600
361398
# for: 60s
362399
# labels:
363400
# service: postgresql
@@ -382,6 +419,7 @@ data:
382419
# severity_num: 300
383420
# annotations:
384421
# description: 'Backup Full status missing for Prod. Check that pgbackrest info command is working on target system.'
422+
385423
kind: ConfigMap
386424
metadata:
387425
labels:

kustomize/monitoring/crunchy_grafana_dashboards.yml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
###
22
#
3-
# Copyright 2017-2022 Crunchy Data Solutions, Inc. All Rights Reserved.
3+
# Copyright © 2017-2022 Crunchy Data Solutions, Inc. All Rights Reserved.
44
#
55
###
66
apiVersion: 1
@@ -13,4 +13,4 @@ providers:
1313
disableDeletion: false
1414
updateIntervalSeconds: 3 #how often Grafana will scan for changed dashboards
1515
options:
16-
path: $GF_PATHS_PROVISIONING/dashboards
16+
path: /etc/grafana/provisioning/dashboards

kustomize/monitoring/dashboards/pgbackrest.json

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -52,7 +52,7 @@
5252
"gnetId": null,
5353
"graphTooltip": 0,
5454
"id": null,
55-
"iteration": 1624546649377,
55+
"iteration": 1625069660860,
5656
"links": [
5757
{
5858
"asDropdown": false,
@@ -664,7 +664,7 @@
664664
]
665665
},
666666
"time": {
667-
"from": "now-30m",
667+
"from": "now-2w",
668668
"to": "now"
669669
},
670670
"timepicker": {

kustomize/monitoring/dashboards/pod_details.json

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -42,11 +42,11 @@
4242
}
4343
]
4444
},
45-
"editable": true,
45+
"editable": false,
4646
"gnetId": null,
4747
"graphTooltip": 0,
4848
"id": null,
49-
"iteration": 1624647381559,
49+
"iteration": 1625069717503,
5050
"links": [
5151
{
5252
"icon": "external link",

kustomize/monitoring/dashboards/postgres_overview.json

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -46,7 +46,7 @@
4646
"gnetId": null,
4747
"graphTooltip": 0,
4848
"id": null,
49-
"iteration": 1624491413218,
49+
"iteration": 1625069480601,
5050
"links": [],
5151
"panels": [
5252
{

kustomize/monitoring/dashboards/postgresql_details.json

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -54,11 +54,11 @@
5454
}
5555
]
5656
},
57-
"editable": true,
57+
"editable": false,
5858
"gnetId": null,
5959
"graphTooltip": 0,
6060
"id": null,
61-
"iteration": 1624495934950,
61+
"iteration": 1625069813048,
6262
"links": [
6363
{
6464
"asDropdown": false,
@@ -2143,6 +2143,6 @@
21432143
},
21442144
"timezone": "browser",
21452145
"title": "PostgreSQLDetails",
2146-
"uid": "pc4NNgknk",
2146+
"uid": "fMip0cuMk",
21472147
"version": 1
21482148
}

kustomize/monitoring/dashboards/postgresql_service_health.json

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -42,11 +42,11 @@
4242
}
4343
]
4444
},
45-
"editable": true,
45+
"editable": false,
4646
"gnetId": null,
4747
"graphTooltip": 0,
4848
"id": null,
49-
"iteration": 1624491530019,
49+
"iteration": 1625069909806,
5050
"links": [
5151
{
5252
"asDropdown": false,
@@ -626,7 +626,7 @@
626626
]
627627
},
628628
"time": {
629-
"from": "now-30m",
629+
"from": "now-1h",
630630
"to": "now"
631631
},
632632
"timepicker": {

kustomize/monitoring/dashboards/prometheus_alerts.json

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -938,7 +938,7 @@
938938
"list": []
939939
},
940940
"time": {
941-
"from": "now-30m",
941+
"from": "now-1h",
942942
"to": "now"
943943
},
944944
"timepicker": {

kustomize/monitoring/dashboards/query_statistics.json

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -59,7 +59,7 @@
5959
"gnetId": null,
6060
"graphTooltip": 0,
6161
"id": null,
62-
"iteration": 1624501789811,
62+
"iteration": 1625070004605,
6363
"links": [
6464
{
6565
"icon": "external link",

kustomize/monitoring/deploy-grafana.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -37,7 +37,7 @@ spec:
3737
value: crunchy-prometheus
3838
- name: PROM_PORT
3939
value: "9090"
40-
image: grafana/grafana:7.4.5
40+
image: grafana/grafana:8.5.10
4141
imagePullPolicy: IfNotPresent
4242
livenessProbe:
4343
failureThreshold: 3

kustomize/monitoring/deploy-prometheus.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -27,7 +27,7 @@ spec:
2727
name: crunchy-prometheus
2828
spec:
2929
containers:
30-
- image: prom/prometheus:v2.27.1
30+
- image: prom/prometheus:v2.33.5
3131
imagePullPolicy: IfNotPresent
3232
livenessProbe:
3333
failureThreshold: 3

0 commit comments

Comments
 (0)