Skip to content

Commit 48974eb

Browse files
committed
get-cluspc: handle pre-existing destinations, work on single node clusters
run & run-sweeptemplate, and start-sweep: introduce rate limited run capability; test-clusterhealth: cluster uptime, make mellanox roce perfctr checks resilient to counter absence; watch-cluster: add more ctrs to hyperv, allow aggregate average of appropriate counters (like hyperv av cpu)
1 parent 41d8079 commit 48974eb

File tree

6 files changed

+151
-47
lines changed

6 files changed

+151
-47
lines changed

Frameworks/VMFleet/get-cluspc.ps1

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,19 @@ param(
3232
[ValidateRange(1,[int]::MaxValue)]
3333
[int] $seconds = $(throw "please specify a number of seconds to capture for"),
3434
[string] $destination = $(throw "please specify a destination for the blg zip"),
35-
[int] $sampleinterval = 1
35+
[int] $sampleinterval = 1,
36+
[switch] $force = $false
3637
)
3738

39+
if (gi -ErrorAction SilentlyContinue $destination) {
40+
if (-not $force) {
41+
Write-Error "$destination already exists, please delete or use -Force to overwrite"
42+
return
43+
} else {
44+
del -ErrorAction SilentlyContinue $destination
45+
}
46+
}
47+
3848
$sets = @{
3949
'PhysicalDisk' = '\PhysicalDisk(*)\*','+getclusport';
4050
'CSVFS' = '\Cluster CSVFS(*)\*','\Cluster CSV Volume Cache(*)\*','\Cluster CSV Volume Manager(*)\*','\Cluster CSVFS Block Cache(*)\*','\Cluster CSVFS Direct IO(*)\*','\Cluster CSVFS Redirected IO(*)\*','+getcsv';
@@ -131,20 +141,26 @@ icm (get-clusternode) -ArgumentList (get-command start-logman) {
131141

132142
sleep $seconds
133143

134-
$f = icm (get-clusternode) -ArgumentList (get-command stop-logman) {
144+
# now capture all counter files
145+
$f = @()
146+
$f += icm (get-clusternode) -ArgumentList (get-command stop-logman) {
135147

136148
param($fn)
137149
set-item -path function:\$($fn.name) -value $fn.definition
138150

139151
stop-logman $env:COMPUTERNAME $using:addspec $using:destination
140152
}
153+
154+
# add all counter files to the cleanup step
141155
$cleanup += $f
142156

143157
#--
144158
# specials
145159
#--
146160

147-
# make capture directory
161+
# make capture directory, and add to cleanup list
162+
# note that all specials are generated into this directory,
163+
# and will be automatically cleaned up when it is deleted
148164
$t = New-TemporaryFile
149165
del $t
150166
$null = md $t

Frameworks/VMFleet/run-sweeptemplate.ps1

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,10 @@ SOFTWARE.
3030
# buffer size/alignment, threads/target, outstanding/thread, write%
3131
$b = __b__; $t = __t__; $o = __o__; $w = __w__
3232

33+
# optional - specify rate limit in iops, translated to bytes/ms for DISKSPD
34+
$iops = __iops__
35+
if ($iops -ne $null) { $g = $iops * $b * 1KB / 1000 }
36+
3337
# io pattern, (r)andom or (s)equential (si as needed for multithread)
3438
$p = '__p__'
3539

@@ -38,7 +42,9 @@ $d = __d__; $cool = __Cool__; $warm = __Warm__
3842

3943
# sweep template always captures
4044
$addspec = '__AddSpec__'
41-
$result = "result-b$($b)t$($t)o$($o)w$($w)p$($p)-$($addspec)-$(gc c:\vmspec.txt).xml"
45+
$gspec = $null
46+
if ($g -ne $null) { $gspec = "g$($g)" }
47+
$result = "result-b$($b)t$($t)o$($o)w$($w)p$($p)$($gspec)-$($addspec)-$(gc c:\vmspec.txt).xml"
4248
$dresult = "l:\result"
4349
$lresultf = join-path "c:\run" $result
4450
$dresultf = join-path $dresult $result
@@ -48,7 +54,10 @@ $dresultf = join-path $dresult $result
4854
if (-not (gi $dresultf -ErrorAction SilentlyContinue)) {
4955

5056
$res = 'xml'
51-
C:\run\diskspd.exe -Z20M -z -h `-t$t `-o$o `-b$($b)k `-$($p)$($b)k `-w$w `-W$warm `-C$cool `-d$($d) -D -L `-R$res (dir C:\run\testfile?.dat) > $lresultf
57+
$gspec = $null
58+
if ($g -ne $null) { $gspec = "-g$($g)" }
59+
60+
C:\run\diskspd.exe -Z20M -z -h `-t$t `-o$o $gspec `-b$($b)k `-$($p)$($b)k `-w$w `-W$warm `-C$cool `-d$($d) -D -L `-R$res (dir C:\run\testfile?.dat) > $lresultf
5261

5362
# export result and indicate done flag to master
5463
# use unbuffered copy to force IO downstream

Frameworks/VMFleet/run.ps1

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,11 @@ SOFTWARE.
2828
[string](get-date)
2929

3030
# buffer size/alignment, threads/target, outstanding/thread, write%
31-
$b = 4; $t = 1; $o = 8; $w = 0
31+
$b = 4; $t = 1; $o = 8; $w = 10
32+
33+
# optional - specify rate limit in iops, translated to bytes/ms for DISKSPD
34+
$iops = 500
35+
if ($iops -ne $null) { $g = $iops * $b * 1KB / 1000 }
3236

3337
# io pattern, (r)andom or (s)equential (si as needed for multithread)
3438
$p = 'r'
@@ -37,7 +41,9 @@ $p = 'r'
3741
$d = 30*60; $cool = 30; $warm = 60
3842

3943
$addspec = 'base'
40-
$result = "result-b$($b)t$($t)o$($o)w$($w)p$($p)-$($addspec)-$(gc c:\vmspec.txt).xml"
44+
$gspec = $null
45+
if ($g -ne $null) { $gspec = "g$($g)" }
46+
$result = "result-b$($b)t$($t)o$($o)w$($w)p$($p)$($gspec)-$($addspec)-$(gc c:\vmspec.txt).xml"
4147
$dresult = "l:\result"
4248
$lresultf = join-path "c:\run" $result
4349
$dresultf = join-path $dresult $result
@@ -55,7 +61,9 @@ if (-not (gi $dresultf -ErrorAction SilentlyContinue)) {
5561
$res = 'text'
5662
}
5763

58-
$o = C:\run\diskspd.exe -Z20M -z -h `-t$t `-o$o `-b$($b)k `-$($p)$($b)k `-w$w `-W$warm `-C$cool `-d$($d) -D -L `-R$res (dir C:\run\testfile?.dat)
64+
$gspec = $null
65+
if ($g -ne $null) { $gspec = "-g$($g)" }
66+
$o = C:\run\diskspd.exe -Z20M -z -h `-t$t `-o$o $gspec `-b$($b)k `-$($p)$($b)k `-w$w `-W$warm `-C$cool `-d$($d) -D -L `-R$res (dir C:\run\testfile?.dat)
5967

6068
if ($cap) {
6169

Frameworks/VMFleet/start-sweep.ps1

Lines changed: 30 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ param(
3838
[int[]] $o,
3939
[Parameter(Mandatory =$true)]
4040
[int[]] $w,
41+
[int[]] $iops = $null,
4142
[ValidateSet('r','s','si')]
4243
[string[]] $p = 'r',
4344
[ValidateRange(1,[int]::MaxValue)]
@@ -69,7 +70,11 @@ class variable {
6970
# current value of the variable ("red"/"blue"/"green")
7071
[object] value()
7172
{
72-
return $this._list[$this._ordinal]
73+
if ($this._list.count -gt 0) {
74+
return $this._list[$this._ordinal]
75+
} else {
76+
return $null
77+
}
7378
}
7479

7580
# label/name of the variable ("color")
@@ -82,6 +87,12 @@ class variable {
8287
# has occurred (overflow)
8388
[bool] increment()
8489
{
90+
# empty list passes through
91+
if ($this._list.Count -eq 0) {
92+
return $true
93+
}
94+
95+
# non-empty list, increment
8596
$this._ordinal += 1
8697
if ($this._ordinal -eq $this._list.Count) {
8798
$this._ordinal = 0
@@ -172,7 +183,10 @@ class variableset {
172183
$str = ''
173184
}
174185

175-
"$pfx$str" + $this._set[$lookstr].value()
186+
# only produce labels for non-null values
187+
if ($this._set[$lookstr].value() -ne $null) {
188+
"$pfx$str" + $this._set[$lookstr].value()
189+
}
176190
}) -join $null
177191
}
178192
}
@@ -217,7 +231,13 @@ function new-runfile(
217231
$line = $_
218232

219233
foreach ($v in $vs.getset()) {
220-
$line = $line -replace "__$($v.label())__",$v.value()
234+
# non-null goes in as is, null goes in as evaluatable $null
235+
if ($v.value() -ne $null) {
236+
$vsub = $v.value()
237+
} else {
238+
$vsub = '$null'
239+
}
240+
$line = $line -replace "__$($v.label())__",$vsub
221241
}
222242

223243
$line
@@ -232,7 +252,12 @@ function show-run(
232252
# show current substitutions (minus the underscore bracketing)
233253
write-host -fore green RUN SPEC `@ (get-date)
234254
foreach ($v in $vs.getset()) {
235-
write-host -fore green "`t$($v.label()) = $($v.value())"
255+
if ($v.value() -ne $null) {
256+
$vsub = $v.value()
257+
} else {
258+
$vsub = '$null'
259+
}
260+
write-host -fore green "`t$($v.label()) = $($vsub)"
236261
}
237262
}
238263

@@ -460,6 +485,7 @@ $v += [variable]::new('t',$t)
460485
$v += [variable]::new('o',$o)
461486
$v += [variable]::new('w',$w)
462487
$v += [variable]::new('p',$p)
488+
$v += [variable]::new('iops',$iops)
463489
$v += [variable]::new('d',$d)
464490
$v += [variable]::new('Warm',$warm)
465491
$v += [variable]::new('Cool',$cool)

Frameworks/VMFleet/test-clusterhealth.ps1

Lines changed: 59 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -190,32 +190,59 @@ $j = @()
190190
$j += start-job -Name 'Basic Health Checks' {
191191

192192
# nodes up
193-
194193
$cn = Get-ClusterNode
195194

196195
if ($cn.count -eq $($cn |? State -eq Up).count) {
197-
write-host -fore green All cluster nodes Up
196+
write-host -ForegroundColor Green All cluster nodes Up
198197
} else {
199-
write-host -fore red Following cluster nodes not Up:
198+
write-host -ForegroundColor Red Following cluster nodes not Up:
200199
$cn |? State -ne Up
201200
}
202201

202+
# node uptime
203+
$o = icm ($cn |? State -eq Up) {
204+
$w = gwmi win32_operatingsystem
205+
$w.ConvertToDateTime($w.localdatetime) - $w.ConvertToDateTime($w.lastbootuptime)
206+
}
207+
208+
$reboots = $o |? TotalHours -lt 1
209+
210+
if ($reboots.length -and $reboots.length -ne $o.length) {
211+
write-host -ForegroundColor Yellow WARNING: $reboots.length nodes have rebooted in the last hour. Ensure that
212+
write-host -ForegroundColor Yellow `t no unexpected events are occuring in the cluster.
213+
}
214+
215+
write-host -ForegroundColor Green Cluster node uptime:
216+
$o | sort PsComputerName | ft PsComputerName,@{ Label="Uptime"; Expression={"{0}d:{1:00}h:{2:00}m.{3:00}s" -f $_.Days,$_.Hours,$_.Minutes,$_.Seconds}}
217+
218+
# subsystem check
203219
$ss = Get-StorageSubSystem |? Model -eq 'Clustered Windows Storage'
204220

205221
if (($ss | measure).count -ne 1) {
206-
write-host -fore red Expected single clustered storage subsystem, found:
222+
write-host -ForegroundColor Red Expected single clustered storage subsystem, found:
207223
$ss | ft -autosize
208224
return
209225
}
210226

211-
# pool health
227+
$ssuh = $ss |? HealthStatus -ne Healthy
212228

229+
if ($ssuh) {
230+
write-host -ForegroundColor Red WARNING: clustered storage subsystem is not healthy
231+
$ssuh | ft -AutoSize
232+
233+
write-host -ForegroundColor Red Output of Debug-StorageSubSystem follows
234+
$ssuh | Debug-StorageSubSystem
235+
} else {
236+
write-host -ForegroundColor Green Clustered storage subsystem Healthy
237+
}
238+
239+
# pool health
213240
$p = $ss | Get-StoragePool |? IsPrimordial -ne $true |? HealthStatus -ne Healthy
214241

215242
if ($p -eq $null) {
216-
write-host -fore green All operational pools Healthy
243+
write-host -ForegroundColor Green All pools Healthy
217244
} else {
218-
write-host -fore red Following pools not Healthy:
245+
write-host -ForegroundColor Red Following pools not Healthy:
219246
$p | ft -autosize
220247
}
221248
}
@@ -309,9 +336,9 @@ $j += start-job -name 'Operational Issues and Storage Jobs' -ArgumentList $Clean
309336
# Storage Jobs
310337

311338
$sj = get-storagejob
312-
if ($sj) {
339+
if ($sj |? JobState -ne Completed) {
313340
write-host -ForegroundColor red WARNING: there are active storage jobs running. Investigate the root cause before continuing.
314-
get-storagejob | ft -autosize
341+
$sj | ft -autosize
315342
} else {
316343
write-host -fore green No storage rebuild or regeneration jobs are active
317344
}
@@ -393,6 +420,9 @@ if ($roce) {
393420

394421
$j += start-job -name 'RoCE: Mellanox Error Check' {
395422

423+
$r = $null
424+
$pc = $null
425+
396426
switch ($using:drvdesc) {
397427
"Mellanox ConnectX-3 Pro Ethernet Adapter" {
398428
$pc = @{
@@ -401,8 +431,8 @@ if ($roce) {
401431
'\Mellanox Adapter Traffic Counters(_Total)\Packets Received Frame Length Error' = 'Rec FrmLenErr';
402432
'\Mellanox Adapter Traffic Counters(_Total)\Packets Received Symbol Error' = 'Rec SymlErr';
403433
'\Mellanox Adapter Traffic Counters(_Total)\Packets Received Discarded' = 'Rec Discard';
404-
'\Mellanox Adapter Traffic Counters(_Total)\Packets Outbound Discarded' = 'Outbnd Err'
405-
'\Mellanox Adapter Traffic Counters(_Total)\Packets Outbound Errors' = 'Outbnd Discard';
434+
'\Mellanox Adapter Traffic Counters(_Total)\Packets Outbound Discarded' = 'Outbnd Discard'
435+
'\Mellanox Adapter Traffic Counters(_Total)\Packets Outbound Errors' = 'Outbnd Err';
406436
}
407437
}
408438
"Mellanox ConnectX-4 Adapter" {
@@ -412,30 +442,35 @@ if ($roce) {
412442
'\Mellanox WinOF-2 Port Traffic(_Total)\Packets Received Frame Length Error' = 'Rec FrmLenErr';
413443
'\Mellanox WinOF-2 Port Traffic(_Total)\Packets Received Symbol Error' = 'Rec SymlErr';
414444
'\Mellanox WinOF-2 Port Traffic(_Total)\Packets Received Discarded' = 'Rec Discard';
415-
'\Mellanox WinOF-2 Port Traffic(_Total)\Packets Outbound Discarded' = 'Outbnd Err'
416-
'\Mellanox WinOF-2 Port Traffic(_Total)\Packets Outbound Errors' = 'Outbnd Discard';
445+
'\Mellanox WinOF-2 Port Traffic(_Total)\Packets Outbound Discarded' = 'Outbnd Discard'
446+
'\Mellanox WinOF-2 Port Traffic(_Total)\Packets Outbound Errors' = 'Outbnd Err';
417447
}
418448
}
419449
default {
420450
write-host -ForegroundColor Red "Unknown adapter type: $($using:drvdesc)"
421451
}
422452
}
423453

424-
$r = icm (get-clusternode |? State -eq Up) -ArgumentList $pc {
454+
# no counters, no results
455+
456+
if ($pc -ne $null) {
457+
458+
$r = icm (get-clusternode |? State -eq Up) -ArgumentList $pc {
425459

426-
param( $pc )
460+
param($pc)
427461

428-
$c = get-counter ($pc.Keys |% { $_ }) -ErrorAction SilentlyContinue
429-
if ($c) {
462+
$c = get-counter ($pc.Keys |% { $_ }) -ErrorAction SilentlyContinue
463+
if ($c) {
430464

431-
$o = new-object psobject -Property @{ 'Errors' = $false }
432-
$c.CounterSamples | sort -Property Path |% {
433-
if ($_.path -match '\\\\[^\\]+(\\.*$)') {
434-
$o | Add-Member -NotePropertyName $pc[$matches[1]] -NotePropertyValue $_.CookedValue
435-
if ($_.CookedValue -ne 0) { $o.Errors = $true }
465+
$o = new-object psobject -Property @{ 'Errors' = $false }
466+
$c.CounterSamples | sort -Property Path |% {
467+
if ($_.path -match '\\\\[^\\]+(\\.*$)') {
468+
$o | Add-Member -NotePropertyName $pc[$matches[1]] -NotePropertyValue $_.CookedValue
469+
if ($_.CookedValue -ne 0) { $o.Errors = $true }
470+
}
436471
}
472+
$o
437473
}
438-
$o
439474
}
440475
}
441476

@@ -501,7 +536,7 @@ $f += ,(new-namedblock 'Selected & Non-Failed' { $_.Selected -and -not $_.Failed
501536
$j += start-job -InitializationScript $fns -Name $t.name { do-clustersymmetry $using:t $using:f }
502537

503538
###
504-
$t = new-namedblock 'SMB CSV Multichannel Symmetry Check' { Get-SmbMultichannelConnection -SmbInstance SBL }
539+
$t = new-namedblock 'SMB CSV Multichannel Symmetry Check' { Get-SmbMultichannelConnection -SmbInstance CSV }
505540
$f = @($totalf)
506541
$f += ,(new-namedblock 'RDMA Capable' { $_.ClientRdmaCapable -and $_.ServerRdmaCapable } -nullpass:$(-not $rdma))
507542
$f += ,(new-namedblock 'Selected & Non-Failed' { $_.Selected -and -not $_.Failed } -nullpass:$(-not $rdma))

0 commit comments

Comments
 (0)