@@ -13,7 +13,8 @@ class Replication(Plugin):
13
13
AgentPluginType = "pg"
14
14
# key: (macro, value)
15
15
plugin_macros = {
16
- "critical_lag_seconds" : [("macro" , "{$CRITICAL_LAG_SECONDS}" ), ("value" , 60 * 5 )]
16
+ "critical_lag_seconds" : [("macro" , "{$CRITICAL_LAG_SECONDS}" ), ("value" , 60 * 5 )],
17
+ "critical_bytes_held_by_none_active_slot" : [("macro" , "{$CRITICAL_BYTES_HELD_BY_NON_ACTIVE_SLOT}" ), ("value" , 1024 * 1024 * 1024 )]
17
18
}
18
19
19
20
# get time of replication lag
@@ -30,8 +31,15 @@ class Replication(Plugin):
30
31
WHERE active = 'false';
31
32
"""
32
33
34
+ query_bytes_held_by_non_active_slot = """
35
+ SELECT slot_name, coalesce(pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)::bigint, 0) AS wal_size_bytes
36
+ FROM pg_replication_slots
37
+ WHERE active = 'false';
38
+ """
39
+
33
40
# for the discovery rule that reports the name of each replica
34
41
key_lsn_replication_discovery = "pgsql.replication.discovery{0}"
42
+ key_replication_non_active_slots_discovery = "pgsql.replication.non_active_slots_discovery{0}"
35
43
key_total_lag = "pgsql.replication.total_lag{0}"
36
44
# for PG 10 and higher
37
45
key_flush = "pgsql.replication.flush_lag{0}"
@@ -42,6 +50,7 @@ class Replication(Plugin):
42
50
43
51
key_replication = "pgsql.replication_lag{0}"
44
52
key_non_active_slots = "pgsql.replication.non_active_slots{0}"
53
+ key_non_active_slots_held_bytes = "pgsql.replication.non_active_slots_held_bytes{0}"
45
54
46
55
def run (self , zbx ):
47
56
@@ -79,6 +88,14 @@ def run(self, zbx):
79
88
zbx .send ("pgsql.replication.replay_lag[{0}]" .format (info [0 ]), float (info [5 ]))
80
89
zbx .send ("pgsql.replication.discovery[]" , zbx .json ({"data" : lags }))
81
90
del lags
91
+ bytes_held_by_non_active_slot = Pooler .run_sql_type ("wal_held_bytes_master" , args = [])
92
+ if bytes_held_by_non_active_slot :
93
+ discovery = []
94
+ for info in bytes_held_by_non_active_slot :
95
+ discovery .append ({"{#NON_ACTIVE_SLOT_NAME}" : info [0 ]})
96
+ zbx .send ("pgsql.replication.non_active_slots_held_bytes[{0}]" .format (info [0 ]), int (info [1 ]))
97
+ zbx .send ("pgsql.replication.non_active_slots_discovery[]" , zbx .json ({"data" : discovery }))
98
+ del discovery
82
99
elif Pooler .is_superuser () or Pooler .is_bootstraped ():
83
100
result_lags = Pooler .run_sql_type ("wal_lag_lsn" , args = [" " , "xlog" , "location" ])
84
101
if result_lags :
@@ -90,7 +107,15 @@ def run(self, zbx):
90
107
del lags
91
108
else :
92
109
self .disable_and_exit_if_not_superuser ()
93
-
110
+ else :
111
+ bytes_held_by_non_active_slot = Pooler .run_sql_type ("wal_held_bytes_replica" , args = [])
112
+ if bytes_held_by_non_active_slot :
113
+ discovery = []
114
+ for info in bytes_held_by_non_active_slot :
115
+ discovery .append ({"{#NON_ACTIVE_SLOT_NAME}" : info [0 ]})
116
+ zbx .send ("pgsql.replication.non_active_slots_held_bytes[{0}]" .format (info [0 ]), int (info [1 ]))
117
+ zbx .send ("pgsql.replication.non_active_slots_discovery[]" , zbx .json ({"data" : discovery }))
118
+ del discovery
94
119
non_active_slots = Pooler .query (self .query_non_active_slots )
95
120
zbx .send (self .key_non_active_slots .format ("[]" ), int (non_active_slots [0 ][0 ]))
96
121
@@ -132,7 +157,8 @@ def triggers(self, template, dashboard=False):
132
157
}) + template .trigger ({
133
158
"name" : "PostgreSQL Replication: number of non-active replication slots on {HOSTNAME} (value={ITEM.LASTVALUE})" ,
134
159
"expression" : "{#TEMPLATE:" + self .right_type (self .key_non_active_slots ) + ".last()}>" + str (
135
- NUMBER_NON_ACTIVE_SLOTS )
160
+ NUMBER_NON_ACTIVE_SLOTS ),
161
+ "status" : 1
136
162
})
137
163
return triggers
138
164
@@ -198,7 +224,42 @@ def discovery_rules(self, template, dashboard=False):
198
224
]
199
225
}
200
226
]
201
- return template .discovery_rule (rule = rule , conditions = conditions , items = items , graphs = graphs )
227
+ active_slots_discovery_rule = template .discovery_rule (rule = rule , conditions = conditions , items = items , graphs = graphs )
228
+
229
+ rule = {
230
+ "name" : "PostgreSQL Replication: Non Active Slots Discovery" ,
231
+ "key" : self .key_replication_non_active_slots_discovery .format ("[{0}]" .format (self .Macros [self .Type ]))
232
+ }
233
+ if Plugin .old_zabbix :
234
+ conditions = []
235
+ rule ["filter" ] = "{#NON_ACTIVE_SLOT_NAME}:.*"
236
+ else :
237
+ conditions = [{
238
+ "condition" : [
239
+ {"macro" : "{#NON_ACTIVE_SLOT_NAME}" ,
240
+ "value" : ".*" ,
241
+ "operator" : 8 ,
242
+ "formulaid" : "A" }
243
+ ]
244
+ }]
245
+ items = [
246
+ {"key" : self .right_type (self .key_non_active_slots_held_bytes , var_discovery = "{#NON_ACTIVE_SLOT_NAME}," ),
247
+ "name" : "PostgreSQL Replication: Bytes held by non-active slot {#NON_ACTIVE_SLOT_NAME}" ,
248
+ "value_type" : Plugin .VALUE_TYPE .numeric_float ,
249
+ "delay" : self .plugin_config ("interval" ),
250
+ "drawtype" : 2 }
251
+ ]
252
+ graphs = []
253
+ triggers = [
254
+ {
255
+ "name" : "PostgreSQL Replication: bytes held by slot {#NON_ACTIVE_SLOT_NAME} is too high (value={ITEM.LASTVALUE})" ,
256
+ "expression" : "{#TEMPLATE:" + self .right_type (self .key_non_active_slots_held_bytes , var_discovery = "{#NON_ACTIVE_SLOT_NAME}," ) + ".last()}>" +
257
+ self .plugin_macros ["critical_bytes_held_by_none_active_slot" ][0 ][1 ]
258
+ }
259
+ ]
260
+ non_active_slots_discovery_rule = template .discovery_rule (rule = rule , conditions = conditions , items = items , graphs = graphs , triggers = triggers )
261
+
262
+ return active_slots_discovery_rule + non_active_slots_discovery_rule
202
263
203
264
def keys_and_queries (self , template_zabbix ):
204
265
result = []
0 commit comments