Skip to content
This repository was archived by the owner on Jan 9, 2020. It is now read-only.

Commit 2a26ebd

Browse files
lins05 authored and ash211 committed
Support setting the driver pod launching timeout. (#36)
* Support setting the driver pod launching timeout, and increase the default value from 30s to 60s. The current value of 30s is rather short for pulling the image from a public Docker registry plus the container/JVM start time.
* Use a better name for the default timeout.
1 parent 81875a6 commit 2a26ebd

File tree

1 file changed

+7
-5
lines changed
  • resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/kubernetes

1 file changed

+7
-5
lines changed

resource-managers/kubernetes/core/src/main/scala/org/apache/spark/deploy/kubernetes/Client.scala

Lines changed: 7 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -60,6 +60,8 @@ private[spark] class Client(
6060
private val driverDockerImage = sparkConf.get(
6161
"spark.kubernetes.driver.docker.image", s"spark-driver:$SPARK_VERSION")
6262
private val uploadedJars = sparkConf.getOption("spark.kubernetes.driver.uploads.jars")
63+
private val driverLaunchTimeoutSecs = sparkConf.getTimeAsSeconds(
64+
"spark.kubernetes.driverLaunchTimeout", s"${DEFAULT_LAUNCH_TIMEOUT_SECONDS}s")
6365

6466
private val secretBase64String = {
6567
val secretBytes = new Array[Byte](128)
@@ -218,25 +220,25 @@ private[spark] class Client(
218220
.done()
219221
var submitSucceeded = false
220222
try {
221-
submitCompletedFuture.get(LAUNCH_TIMEOUT_SECONDS, TimeUnit.SECONDS)
223+
submitCompletedFuture.get(driverLaunchTimeoutSecs, TimeUnit.SECONDS)
222224
submitSucceeded = true
223225
} catch {
224226
case e: TimeoutException =>
225227
val driverPod = try {
226228
kubernetesClient.pods().withName(kubernetesAppId).get()
227229
} catch {
228230
case throwable: Throwable =>
229-
logError(s"Timed out while waiting $LAUNCH_TIMEOUT_SECONDS seconds for the" +
231+
logError(s"Timed out while waiting $driverLaunchTimeoutSecs seconds for the" +
230232
" driver pod to start, but an error occurred while fetching the driver" +
231233
" pod's details.", throwable)
232-
throw new SparkException(s"Timed out while waiting $LAUNCH_TIMEOUT_SECONDS" +
234+
throw new SparkException(s"Timed out while waiting $driverLaunchTimeoutSecs" +
233235
" seconds for the driver pod to start. Unfortunately, in attempting to fetch" +
234236
" the latest state of the pod, another error was thrown. Check the logs for" +
235237
" the error that was thrown in looking up the driver pod.", e)
236238
}
237239
val topLevelMessage = s"The driver pod with name ${driverPod.getMetadata.getName}" +
238240
s" in namespace ${driverPod.getMetadata.getNamespace} was not ready in" +
239-
s" $LAUNCH_TIMEOUT_SECONDS seconds."
241+
s" $driverLaunchTimeoutSecs seconds."
240242
val podStatusPhase = if (driverPod.getStatus.getPhase != null) {
241243
s"Latest phase from the pod is: ${driverPod.getStatus.getPhase}"
242244
} else {
@@ -424,7 +426,7 @@ private[spark] object Client extends Logging {
424426
private val DRIVER_LAUNCHER_CONTAINER_NAME = "spark-kubernetes-driver-launcher"
425427
private val SECURE_RANDOM = new SecureRandom()
426428
private val SPARK_SUBMISSION_SECRET_BASE_DIR = "/var/run/secrets/spark-submission"
427-
private val LAUNCH_TIMEOUT_SECONDS = 30
429+
private val DEFAULT_LAUNCH_TIMEOUT_SECONDS = 60
428430
private val SPARK_APP_NAME_LABEL = "spark-app-name"
429431

430432
def main(args: Array[String]): Unit = {

0 commit comments

Comments
 (0)