Skip to content

Commit 76f92ed

Browse files
authored
More retry (#236)
* Add log when deploying before listing editors * Fix timer not canceling It seems to operate in its own little world and I have no idea how to make it stop when the job running it has stopped. * Retry direct connection This will cover recent connections which connect directly without going through the whole setup flow. Pretty much the same logic as for listing editors but we display the errors in different ways since this all happens in a progress dialog. I tried to combine what I could in the retry. Also the SshException is misleading; it seems to wrap the real error so unwrap it otherwise it is impossible to tell what is really wrong. In particular this is causing us to retry on cancelations. * Provide better error when dd times out
1 parent 6f1a610 commit 76f92ed

File tree

4 files changed

+178
-68
lines changed

4 files changed

+178
-68
lines changed

src/main/kotlin/com/coder/gateway/CoderGatewayConnectionProvider.kt

Lines changed: 56 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,19 +2,29 @@
22

33
package com.coder.gateway
44

5+
import com.coder.gateway.sdk.humanizeDuration
6+
import com.coder.gateway.sdk.isCancellation
7+
import com.coder.gateway.sdk.isWorkerTimeout
8+
import com.coder.gateway.sdk.suspendingRetryWithExponentialBackOff
59
import com.coder.gateway.services.CoderRecentWorkspaceConnectionsService
10+
import com.intellij.openapi.application.ApplicationManager
611
import com.intellij.openapi.components.service
712
import com.intellij.openapi.diagnostic.Logger
813
import com.intellij.openapi.rd.util.launchUnderBackgroundProgress
14+
import com.intellij.openapi.ui.Messages
915
import com.jetbrains.gateway.api.ConnectionRequestor
1016
import com.jetbrains.gateway.api.GatewayConnectionHandle
1117
import com.jetbrains.gateway.api.GatewayConnectionProvider
1218
import com.jetbrains.gateway.api.GatewayUI
1319
import com.jetbrains.gateway.ssh.SshDeployFlowUtil
1420
import com.jetbrains.gateway.ssh.SshMultistagePanelContext
21+
import com.jetbrains.gateway.ssh.deploy.DeployException
1522
import com.jetbrains.rd.util.lifetime.LifetimeDefinition
1623
import kotlinx.coroutines.launch
24+
import net.schmizz.sshj.common.SSHException
25+
import net.schmizz.sshj.connection.ConnectionException
1726
import java.time.Duration
27+
import java.util.concurrent.TimeoutException
1828

1929
class CoderGatewayConnectionProvider : GatewayConnectionProvider {
2030
private val recentConnectionsService = service<CoderRecentWorkspaceConnectionsService>()
@@ -24,12 +34,53 @@ class CoderGatewayConnectionProvider : GatewayConnectionProvider {
2434
// TODO: If this fails determine if it is an auth error and if so prompt
2535
// for a new token, configure the CLI, then try again.
2636
clientLifetime.launchUnderBackgroundProgress(CoderGatewayBundle.message("gateway.connector.coder.connection.provider.title"), canBeCancelled = true, isIndeterminate = true, project = null) {
27-
val context = SshMultistagePanelContext(parameters.toHostDeployInputs())
28-
logger.info("Deploying and starting IDE with $context")
29-
launch {
30-
@Suppress("UnstableApiUsage") SshDeployFlowUtil.fullDeployCycle(
31-
clientLifetime, context, Duration.ofMinutes(10)
37+
try {
38+
indicator.text = CoderGatewayBundle.message("gateway.connector.coder.connecting")
39+
val context = suspendingRetryWithExponentialBackOff(
40+
action = { attempt ->
41+
logger.info("Connecting... (attempt $attempt)")
42+
if (attempt > 1) {
43+
// indicator.text is the text above the progress bar.
44+
indicator.text = CoderGatewayBundle.message("gateway.connector.coder.connecting.retry", attempt)
45+
}
46+
SshMultistagePanelContext(parameters.toHostDeployInputs())
47+
},
48+
retryIf = {
49+
it is ConnectionException || it is TimeoutException
50+
|| it is SSHException || it is DeployException
51+
},
52+
onException = { attempt, nextMs, e ->
53+
logger.error("Failed to connect (attempt $attempt; will retry in $nextMs ms)")
54+
// indicator.text2 is the text below the progress bar.
55+
indicator.text2 =
56+
if (isWorkerTimeout(e)) "Failed to upload worker binary...it may have timed out"
57+
else e.message ?: CoderGatewayBundle.message("gateway.connector.no-details")
58+
},
59+
onCountdown = { remainingMs ->
60+
indicator.text = CoderGatewayBundle.message("gateway.connector.coder.connecting.failed.retry", humanizeDuration(remainingMs))
61+
},
3262
)
63+
launch {
64+
logger.info("Deploying and starting IDE with $context")
65+
// At this point JetBrains takes over with their own UI.
66+
@Suppress("UnstableApiUsage") SshDeployFlowUtil.fullDeployCycle(
67+
clientLifetime, context, Duration.ofMinutes(10)
68+
)
69+
}
70+
} catch (e: Exception) {
71+
if (isCancellation(e)) {
72+
logger.info("Connection canceled due to ${e.javaClass}")
73+
} else {
74+
logger.info("Failed to connect (will not retry)", e)
75+
// The dialog will close once we return so write the error
76+
// out into a new dialog.
77+
ApplicationManager.getApplication().invokeAndWait {
78+
Messages.showMessageDialog(
79+
e.message ?: CoderGatewayBundle.message("gateway.connector.no-details"),
80+
CoderGatewayBundle.message("gateway.connector.coder.connection.failed"),
81+
Messages.getErrorIcon())
82+
}
83+
}
3384
}
3485
}
3586

Lines changed: 77 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,50 @@
11
package com.coder.gateway.sdk
22

3+
import com.intellij.openapi.progress.ProcessCanceledException
4+
import com.intellij.ssh.SshException
5+
import com.jetbrains.gateway.ssh.deploy.DeployException
36
import kotlinx.coroutines.delay
4-
import kotlinx.datetime.Clock
57
import java.util.Random
68
import java.util.concurrent.TimeUnit
7-
import kotlin.concurrent.timer
8-
import kotlin.math.max
9+
import kotlin.coroutines.cancellation.CancellationException
910
import kotlin.math.min
1011

12+
/**
 * Return the root cause of [ex], i.e. the last exception reached by
 * repeatedly following `cause`.
 *
 * If [ex] has no cause, [ex] itself is returned.
 */
fun unwrap(ex: Exception): Throwable {
    // Walk ex -> ex.cause -> ex.cause.cause ... until a null cause ends the
    // chain; the final element is the deepest exception.
    val chain = generateSequence<Throwable>(ex) { current -> current.cause }
    return chain.last()
}
19+
1120
/**
12-
* Similar to Intellij's except it gives you the next delay, does not do its own
13-
* logging, updates periodically (for counting down), and runs forever.
21+
* Similar to Intellij's except it adds two new arguments: onCountdown (for
22+
* displaying the time until the next try) and retryIf (to limit which
23+
* exceptions can be retried).
24+
*
25+
* Exceptions that cannot be retried will be thrown.
26+
*
27+
* onException and onCountdown will be called immediately on retryable failures.
28+
* onCountdown will also be called every second until the next try with the time
29+
* left until that next try (the last interval might be less than one second if
30+
* the total delay is not divisible by one second).
31+
*
32+
* Some other differences:
33+
* - onException gives you the time until the next try (intended to be logged
34+
* with the error).
35+
* - Infinite tries.
36+
* - SshException is unwrapped.
37+
*
38+
* It is otherwise identical.
1439
*/
1540
suspend fun <T> suspendingRetryWithExponentialBackOff(
1641
initialDelayMs: Long = TimeUnit.SECONDS.toMillis(5),
1742
backOffLimitMs: Long = TimeUnit.MINUTES.toMillis(3),
1843
backOffFactor: Int = 2,
1944
backOffJitter: Double = 0.1,
20-
update: (attempt: Int, remainingMs: Long, e: Exception) -> Unit,
45+
retryIf: (e: Throwable) -> Boolean,
46+
onException: (attempt: Int, nextMs: Long, e: Throwable) -> Unit,
47+
onCountdown: (remaining: Long) -> Unit,
2148
action: suspend (attempt: Int) -> T
2249
): T {
2350
val random = Random()
@@ -26,21 +53,53 @@ suspend fun <T> suspendingRetryWithExponentialBackOff(
2653
try {
2754
return action(attempt)
2855
}
29-
catch (e: Exception) {
30-
val end = Clock.System.now().toEpochMilliseconds() + delayMs
31-
val timer = timer(period = TimeUnit.SECONDS.toMillis(1)) {
32-
val now = Clock.System.now().toEpochMilliseconds()
33-
val next = max(end - now, 0)
34-
if (next > 0) {
35-
update(attempt, next, e)
36-
} else {
37-
this.cancel()
38-
}
56+
catch (originalEx: Exception) {
57+
// SshException can happen due to anything from a timeout to being
58+
// canceled so unwrap to find out.
59+
val unwrappedEx = if (originalEx is SshException) unwrap(originalEx) else originalEx
60+
if (!retryIf(unwrappedEx)) {
61+
throw unwrappedEx
62+
}
63+
onException(attempt, delayMs, unwrappedEx)
64+
var remainingMs = delayMs
65+
while (remainingMs > 0) {
66+
onCountdown(remainingMs)
67+
val next = min(remainingMs, TimeUnit.SECONDS.toMillis(1))
68+
remainingMs -= next
69+
delay(next)
3970
}
40-
delay(delayMs)
41-
timer.cancel()
4271
delayMs = min(delayMs * backOffFactor, backOffLimitMs) + (random.nextGaussian() * delayMs * backOffJitter).toLong()
4372
}
4473
}
4574
error("Should never be reached")
4675
}
76+
77+
/**
 * Convert a millisecond duration into a human-readable string suitable for
 * appending to a "retrying ..." message.
 *
 * The duration is truncated to whole seconds before formatting:
 *  - less than one second (including negative values): "now"
 *  - exactly one second: "in 1 second"
 *  - more than one second: "in <n> seconds"
 */
fun humanizeDuration(durationMs: Long): String {
    val wholeSeconds = TimeUnit.MILLISECONDS.toSeconds(durationMs)
    return when {
        wholeSeconds < 1 -> "now"
        wholeSeconds > 1 -> "in $wholeSeconds seconds"
        else -> "in $wholeSeconds second"
    }
}
88+
89+
/**
 * Return true if [e] looks like a failed worker binary upload.
 *
 * When the worker upload times out Gateway only reports that the deploy
 * failed; even the root cause (IllegalStateException) is not helpful and the
 * message carries a very long tmp path. The only signal used here is a
 * [DeployException] whose message contains "Worker binary deploy failed".
 */
fun isWorkerTimeout(e: Throwable): Boolean {
    // Anything other than a DeployException cannot be this timeout.
    if (e !is DeployException) {
        return false
    }
    return e.message.contains("Worker binary deploy failed")
}
97+
98+
/**
 * Return true if [e] is one of the exception types treated as a cancellation:
 * [InterruptedException], [CancellationException], or
 * [ProcessCanceledException].
 */
fun isCancellation(e: Throwable): Boolean = when (e) {
    is InterruptedException,
    is CancellationException,
    is ProcessCanceledException -> true
    else -> false
}

src/main/kotlin/com/coder/gateway/views/steps/CoderLocateRemoteProjectStepView.kt

Lines changed: 36 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,9 @@ import com.coder.gateway.sdk.Arch
88
import com.coder.gateway.sdk.CoderCLIManager
99
import com.coder.gateway.sdk.CoderRestClientService
1010
import com.coder.gateway.sdk.OS
11+
import com.coder.gateway.sdk.humanizeDuration
12+
import com.coder.gateway.sdk.isCancellation
13+
import com.coder.gateway.sdk.isWorkerTimeout
1114
import com.coder.gateway.sdk.suspendingRetryWithExponentialBackOff
1215
import com.coder.gateway.sdk.toURL
1316
import com.coder.gateway.sdk.withPath
@@ -68,7 +71,6 @@ import net.schmizz.sshj.connection.ConnectionException
6871
import java.awt.Component
6972
import java.awt.FlowLayout
7073
import java.util.Locale
71-
import java.util.concurrent.TimeUnit
7274
import java.util.concurrent.TimeoutException
7375
import javax.swing.ComboBoxModel
7476
import javax.swing.DefaultComboBoxModel
@@ -79,7 +81,6 @@ import javax.swing.JPanel
7981
import javax.swing.ListCellRenderer
8082
import javax.swing.SwingConstants
8183
import javax.swing.event.DocumentEvent
82-
import kotlin.coroutines.cancellation.CancellationException
8384

8485
class CoderLocateRemoteProjectStepView(private val setNextButtonEnabled: (Boolean) -> Unit) : CoderWorkspacesWizardStep, Disposable {
8586
private val cs = CoroutineScope(Dispatchers.Main)
@@ -162,6 +163,7 @@ class CoderLocateRemoteProjectStepView(private val setNextButtonEnabled: (Boolea
162163
// Clear contents from the last attempt if any.
163164
cbIDEComment.foreground = UIUtil.getContextHelpForeground()
164165
cbIDEComment.text = CoderGatewayBundle.message("gateway.connector.view.coder.remoteproject.ide.none.comment")
166+
cbIDE.renderer = IDECellRenderer(CoderGatewayBundle.message("gateway.connector.view.coder.retrieve-ides"))
165167
ideComboBoxModel.removeAllElements()
166168
setNextButtonEnabled(false)
167169

@@ -178,54 +180,47 @@ class CoderLocateRemoteProjectStepView(private val setNextButtonEnabled: (Boolea
178180
terminalLink.url = coderClient.coderURL.withPath("/@${coderClient.me.username}/${selectedWorkspace.name}/terminal").toString()
179181

180182
ideResolvingJob = cs.launch {
181-
val ides = suspendingRetryWithExponentialBackOff(
182-
action={ attempt ->
183-
// Reset text in the select dropdown.
184-
withContext(Dispatchers.Main) {
185-
cbIDE.renderer = IDECellRenderer(
186-
if (attempt > 1) CoderGatewayBundle.message("gateway.connector.view.coder.remoteproject.retry.text", attempt)
187-
else CoderGatewayBundle.message("gateway.connector.view.coder.remoteproject.loading.text"))
188-
}
189-
try {
183+
try {
184+
val ides = suspendingRetryWithExponentialBackOff(
185+
action = { attempt ->
186+
logger.info("Retrieving IDEs...(attempt $attempt)")
187+
if (attempt > 1) {
188+
cbIDE.renderer = IDECellRenderer(CoderGatewayBundle.message("gateway.connector.view.coder.retrieve.ides.retry", attempt))
189+
}
190190
val executor = createRemoteExecutor(CoderCLIManager.getHostName(deploymentURL, selectedWorkspace))
191191
if (ComponentValidator.getInstance(tfProject).isEmpty) {
192192
installRemotePathValidator(executor)
193193
}
194194
retrieveIDEs(executor, selectedWorkspace)
195-
} catch (e: Exception) {
196-
when(e) {
197-
is InterruptedException -> Unit
198-
is CancellationException -> Unit
199-
// Throw to retry these. The main one is
200-
// DeployException which fires when dd times out.
201-
is ConnectionException, is TimeoutException,
202-
is SSHException, is DeployException -> throw e
203-
else -> {
204-
withContext(Dispatchers.Main) {
205-
logger.error("Failed to retrieve IDEs (attempt $attempt)", e)
206-
cbIDEComment.foreground = UIUtil.getErrorForeground()
207-
cbIDEComment.text = e.message ?: "The error did not provide any further details"
208-
cbIDE.renderer = IDECellRenderer(CoderGatewayBundle.message("gateway.connector.view.coder.remoteproject.error.text"), UIUtil.getBalloonErrorIcon())
209-
}
210-
}
211-
}
212-
null
213-
}
214-
},
215-
update = { attempt, retryMs, e ->
216-
logger.error("Failed to retrieve IDEs (attempt $attempt; will retry in $retryMs ms)", e)
217-
cbIDEComment.foreground = UIUtil.getErrorForeground()
218-
cbIDEComment.text = e.message ?: "The error did not provide any further details"
219-
val delayS = TimeUnit.MILLISECONDS.toSeconds(retryMs)
220-
val delay = if (delayS < 1) "now" else "in $delayS second${if (delayS > 1) "s" else ""}"
221-
cbIDE.renderer = IDECellRenderer(CoderGatewayBundle.message("gateway.connector.view.coder.remoteproject.retry-error.text", delay))
222-
},
223-
)
224-
if (ides != null) {
195+
},
196+
retryIf = {
197+
it is ConnectionException || it is TimeoutException
198+
|| it is SSHException || it is DeployException
199+
},
200+
onException = { attempt, nextMs, e ->
201+
logger.error("Failed to retrieve IDEs (attempt $attempt; will retry in $nextMs ms)")
202+
cbIDEComment.foreground = UIUtil.getErrorForeground()
203+
cbIDEComment.text =
204+
if (isWorkerTimeout(e)) "Failed to upload worker binary...it may have timed out. Check the command log for more details."
205+
else e.message ?: CoderGatewayBundle.message("gateway.connector.no-details")
206+
},
207+
onCountdown = { remainingMs ->
208+
cbIDE.renderer = IDECellRenderer(CoderGatewayBundle.message("gateway.connector.view.coder.retrieve-ides.failed.retry", humanizeDuration(remainingMs)))
209+
},
210+
)
225211
withContext(Dispatchers.Main) {
226212
ideComboBoxModel.addAll(ides)
227213
cbIDE.selectedIndex = 0
228214
}
215+
} catch (e: Exception) {
216+
if (isCancellation(e)) {
217+
logger.info("Connection canceled due to ${e.javaClass}")
218+
} else {
219+
logger.error("Failed to retrieve IDEs (will not retry)", e)
220+
cbIDEComment.foreground = UIUtil.getErrorForeground()
221+
cbIDEComment.text = e.message ?: CoderGatewayBundle.message("gateway.connector.no-details")
222+
cbIDE.renderer = IDECellRenderer(CoderGatewayBundle.message("gateway.connector.view.coder.retrieve-ides.failed"), UIUtil.getBalloonErrorIcon())
223+
}
229224
}
230225
}
231226
}

src/main/resources/messages/CoderGatewayBundle.properties

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,10 +28,10 @@ gateway.connector.view.workspaces.token.comment=The last used token is shown abo
2828
gateway.connector.view.workspaces.token.rejected=This token was rejected.
2929
gateway.connector.view.workspaces.token.injected=This token was pulled from your CLI config.
3030
gateway.connector.view.workspaces.token.none=No existing token found.
31-
gateway.connector.view.coder.remoteproject.loading.text=Retrieving products...
32-
gateway.connector.view.coder.remoteproject.retry.text=Retrieving products (attempt {0})...
33-
gateway.connector.view.coder.remoteproject.error.text=Failed to retrieve IDEs
34-
gateway.connector.view.coder.remoteproject.retry-error.text=Failed to retrieve IDEs...retrying {0}
31+
gateway.connector.view.coder.retrieve-ides=Retrieving IDEs...
32+
gateway.connector.view.coder.retrieve.ides.retry=Retrieving IDEs (attempt {0})...
33+
gateway.connector.view.coder.retrieve-ides.failed=Failed to retrieve IDEs
34+
gateway.connector.view.coder.retrieve-ides.failed.retry=Failed to retrieve IDEs...retrying {0}
3535
gateway.connector.view.coder.remoteproject.next.text=Start IDE and connect
3636
gateway.connector.view.coder.remoteproject.choose.text=Choose IDE and project for workspace {0}
3737
gateway.connector.view.coder.remoteproject.ide.download.comment=This IDE will be downloaded from jetbrains.com and installed to the default path on the remote host.
@@ -42,6 +42,10 @@ gateway.connector.recentconnections.new.wizard.button.tooltip=Open a new Coder W
4242
gateway.connector.recentconnections.remove.button.tooltip=Remove from Recent Connections
4343
gateway.connector.recentconnections.terminal.button.tooltip=Open SSH Web Terminal
4444
gateway.connector.coder.connection.provider.title=Connecting to Coder workspace...
45+
gateway.connector.coder.connecting=Connecting...
46+
gateway.connector.coder.connecting.retry=Connecting (attempt {0})...
47+
gateway.connector.coder.connection.failed=Failed to connect
48+
gateway.connector.coder.connecting.failed.retry=Failed to connect...retrying {0}
4549
gateway.connector.settings.binary-source.title=CLI source:
4650
gateway.connector.settings.binary-source.comment=Used to download the Coder \
4751
CLI which is necessary to make SSH connections. The If-None-Matched header \
@@ -54,3 +58,4 @@ gateway.connector.settings.binary-destination.comment=Directories are created \
5458
here that store the CLI and credentials for each domain to which the plugin \
5559
connects. \
5660
Defaults to {0}.
61+
gateway.connector.no-details=The error did not provide any further details

0 commit comments

Comments
 (0)