@@ -190,32 +190,59 @@ $j = @()
190
190
$j += start-job - Name ' Basic Health Checks' {
191
191
192
192
# nodes up
193
-
194
193
$cn = Get-ClusterNode
195
194
196
195
if ($cn.count -eq $ ($cn | ? State -eq Up).count) {
197
- write-host - fore green All cluster nodes Up
196
+ write-host - ForegroundColor Green All cluster nodes Up
198
197
} else {
199
- write-host - fore red Following cluster nodes not Up:
198
+ write-host - ForegroundColor Red Following cluster nodes not Up:
200
199
$cn | ? State -ne Up
201
200
}
202
201
202
+ # node uptime
203
+ $o = icm ($cn | ? State -eq Up) {
204
+ $w = gwmi win32_operatingsystem
205
+ $w.ConvertToDateTime ($w.localdatetime ) - $w.ConvertToDateTime ($w.lastbootuptime )
206
+ }
207
+
208
+ $reboots = @($o | ? TotalHours -lt 1)
209
+ # Warn only when some, but not all, nodes report under one hour of uptime; @() keeps .Count reliable for zero or one result.
210
+ if ($reboots.Count -and $reboots.Count -ne @($o).Count) {
211
+ write-host -ForegroundColor Yellow WARNING: $reboots.Count nodes have rebooted in the last hour. Ensure that
212
+ write-host -ForegroundColor Yellow `t no unexpected events are occurring in the cluster.
213
+ }
214
+
215
+ write-host - ForegroundColor Green Cluster node uptime:
216
+ $o | sort PsComputerName | ft PsComputerName, @ { Label = " Uptime" ; Expression = {" {0}d:{1:00}h:{2:00}m.{3:00}s" -f $_.Days , $_.Hours , $_.Minutes , $_.Seconds }}
217
+
218
+ # subsystem check
203
219
$ss = Get-StorageSubSystem | ? Model -eq ' Clustered Windows Storage'
204
220
205
221
if (($ss | measure).count -ne 1 ) {
206
- write-host - fore red Expected single clustered storage subsystem, found:
222
+ write-host - ForegroundColor Red Expected single clustered storage subsystem, found:
207
223
$ss | ft - autosize
208
224
return
209
225
}
210
226
211
- # pool health
227
+ $ssuh = $ss | ? HealthStatus -ne Healthy
212
228
229
+ if ($ssuh ) {
230
+ write-host - ForegroundColor Red WARNING: clustered storage subsystem is not healthy
231
+ $ssuh | ft - AutoSize
232
+
233
+ write-host - ForegroundColor Red Output of Debug-StorageSubSystem follows
234
+ $ssuh | Debug-StorageSubSystem
235
+ } else {
236
+ write-host - ForegroundColor Green Clustered storage subsystem Healthy
237
+ }
238
+
239
+ # pool health
213
240
$p = $ss | Get-StoragePool | ? IsPrimordial -ne $true | ? HealthStatus -ne Healthy
214
241
215
242
if ($p -eq $null ) {
216
- write-host - fore green All operational pools Healthy
243
+ write-host - ForegroundColor Green All pools Healthy
217
244
} else {
218
- write-host - fore red Following pools not Healthy:
245
+ write-host - ForegroundColor Red Following pools not Healthy:
219
246
$p | ft - autosize
220
247
}
221
248
}
@@ -309,9 +336,9 @@ $j += start-job -name 'Operational Issues and Storage Jobs' -ArgumentList $Clean
309
336
# Storage Jobs
310
337
311
338
$sj = get-storagejob
312
- if ($sj ) {
339
+ if ($sj | ? JobState -ne Completed ) {
313
340
write-host - ForegroundColor red WARNING: there are active storage jobs running. Investigate the root cause before continuing.
314
- get-storagejob | ft - autosize
341
+ $sj | ft - autosize
315
342
} else {
316
343
write-host - fore green No storage rebuild or regeneration jobs are active
317
344
}
@@ -393,6 +420,9 @@ if ($roce) {
393
420
394
421
$j += start-job - name ' RoCE: Mellanox Error Check' {
395
422
423
+ $r = $null
424
+ $pc = $null
425
+
396
426
switch ($using :drvdesc ) {
397
427
" Mellanox ConnectX-3 Pro Ethernet Adapter" {
398
428
$pc = @ {
@@ -401,8 +431,8 @@ if ($roce) {
401
431
' \Mellanox Adapter Traffic Counters(_Total)\Packets Received Frame Length Error' = ' Rec FrmLenErr' ;
402
432
' \Mellanox Adapter Traffic Counters(_Total)\Packets Received Symbol Error' = ' Rec SymlErr' ;
403
433
' \Mellanox Adapter Traffic Counters(_Total)\Packets Received Discarded' = ' Rec Discard' ;
404
- ' \Mellanox Adapter Traffic Counters(_Total)\Packets Outbound Discarded' = ' Outbnd Err '
405
- ' \Mellanox Adapter Traffic Counters(_Total)\Packets Outbound Errors' = ' Outbnd Discard ' ;
434
+ ' \Mellanox Adapter Traffic Counters(_Total)\Packets Outbound Discarded' = ' Outbnd Discard '
435
+ ' \Mellanox Adapter Traffic Counters(_Total)\Packets Outbound Errors' = ' Outbnd Err ' ;
406
436
}
407
437
}
408
438
" Mellanox ConnectX-4 Adapter" {
@@ -412,30 +442,35 @@ if ($roce) {
412
442
' \Mellanox WinOF-2 Port Traffic(_Total)\Packets Received Frame Length Error' = ' Rec FrmLenErr' ;
413
443
' \Mellanox WinOF-2 Port Traffic(_Total)\Packets Received Symbol Error' = ' Rec SymlErr' ;
414
444
' \Mellanox WinOF-2 Port Traffic(_Total)\Packets Received Discarded' = ' Rec Discard' ;
415
- ' \Mellanox WinOF-2 Port Traffic(_Total)\Packets Outbound Discarded' = ' Outbnd Err '
416
- ' \Mellanox WinOF-2 Port Traffic(_Total)\Packets Outbound Errors' = ' Outbnd Discard ' ;
445
+ ' \Mellanox WinOF-2 Port Traffic(_Total)\Packets Outbound Discarded' = ' Outbnd Discard '
446
+ ' \Mellanox WinOF-2 Port Traffic(_Total)\Packets Outbound Errors' = ' Outbnd Err ' ;
417
447
}
418
448
}
419
449
default {
420
450
write-host - ForegroundColor Red " Unknown adapter type: $ ( $using :drvdesc ) "
421
451
}
422
452
}
423
453
424
- $r = icm (get-clusternode | ? State -eq Up) - ArgumentList $pc {
454
+ # no counters, no results
455
+
456
+ if ($pc -ne $null ) {
457
+
458
+ $r = icm (get-clusternode | ? State -eq Up) - ArgumentList $pc {
425
459
426
- param ( $pc )
460
+ param ($pc )
427
461
428
- $c = get-counter ($pc.Keys | % { $_ }) - ErrorAction SilentlyContinue
429
- if ($c ) {
462
+ $c = get-counter ($pc.Keys | % { $_ }) - ErrorAction SilentlyContinue
463
+ if ($c ) {
430
464
431
- $o = new-object psobject - Property @ { ' Errors' = $false }
432
- $c.CounterSamples | sort - Property Path | % {
433
- if ($_.path -match ' \\\\[^\\]+(\\.*$)' ) {
434
- $o | Add-Member - NotePropertyName $pc [$matches [1 ]] - NotePropertyValue $_.CookedValue
435
- if ($_.CookedValue -ne 0 ) { $o.Errors = $true }
465
+ $o = new-object psobject - Property @ { ' Errors' = $false }
466
+ $c.CounterSamples | sort - Property Path | % {
467
+ if ($_.path -match ' \\\\[^\\]+(\\.*$)' ) {
468
+ $o | Add-Member - NotePropertyName $pc [$matches [1 ]] - NotePropertyValue $_.CookedValue
469
+ if ($_.CookedValue -ne 0 ) { $o.Errors = $true }
470
+ }
436
471
}
472
+ $o
437
473
}
438
- $o
439
474
}
440
475
}
441
476
@@ -501,7 +536,7 @@ $f += ,(new-namedblock 'Selected & Non-Failed' { $_.Selected -and -not $_.Failed
501
536
$j += start-job - InitializationScript $fns - Name $t.name { do - clustersymmetry $using :t $using :f }
502
537
503
538
# ##
504
- $t = new-namedblock ' SMB CSV Multichannel Symmetry Check' { Get-SmbMultichannelConnection - SmbInstance SBL }
539
+ $t = new-namedblock ' SMB CSV Multichannel Symmetry Check' { Get-SmbMultichannelConnection - SmbInstance CSV }
505
540
$f = @ ($totalf )
506
541
$f += , (new-namedblock ' RDMA Capable' { $_.ClientRdmaCapable -and $_.ServerRdmaCapable } - nullpass:$ (-not $rdma ))
507
542
$f += , (new-namedblock ' Selected & Non-Failed' { $_.Selected -and -not $_.Failed } - nullpass:$ (-not $rdma ))
0 commit comments