Skip to content
Open
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 25 additions & 2 deletions build/make-distribution.sh
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ function build_service {
# Store the command as an array because $MVN variable might have spaces in it.
# Normal quoting tricks don't work.
# See: http://mywiki.wooledge.org/BashFAQ/050
BUILD_COMMAND=("$MVN" clean package $MVN_DIST_OPT -pl master,worker,cli -am $@)
BUILD_COMMAND=("$MVN" clean package $MVN_DIST_OPT -pl master,worker,cli,lifecycle-manager -am $@)

# Actually build the jar
echo -e "\nBuilding with..."
Expand All @@ -158,6 +158,7 @@ function build_service {
mkdir -p "$DIST_DIR/master-jars"
mkdir -p "$DIST_DIR/worker-jars"
mkdir -p "$DIST_DIR/cli-jars"
mkdir -p "$DIST_DIR/lifecycle-manager-jars"

## Copy master jars
cp "$PROJECT_DIR"/master/target/celeborn-master_$SCALA_VERSION-$VERSION.jar "$DIST_DIR/master-jars/"
Expand All @@ -177,6 +178,21 @@ function build_service {
for jar in $(ls "$PROJECT_DIR/cli/target/scala-$SCALA_VERSION/jars"); do
(cd $DIST_DIR/cli-jars; ln -snf "../jars/$jar" .)
done
## Copy lifecycle-manager jars
# lifecycle-manager depends on celeborn-client which is not a dependency of master/worker,
# so we copy its project-internal dependency jars that are missing from jars/.
for module_jar in \
"$PROJECT_DIR/lifecycle-manager/target/celeborn-lifecycle-manager_$SCALA_VERSION-$VERSION.jar" \
"$PROJECT_DIR/client/target/celeborn-client_$SCALA_VERSION-$VERSION.jar"; do
jarname=$(basename "$module_jar")
if [ ! -f "$DIST_DIR/jars/$jarname" ]; then
cp "$module_jar" "$DIST_DIR/jars/"
fi
done
cp "$PROJECT_DIR"/lifecycle-manager/target/celeborn-lifecycle-manager_$SCALA_VERSION-$VERSION.jar "$DIST_DIR/lifecycle-manager-jars/"
for jar in $(ls "$DIST_DIR/jars"); do
(cd $DIST_DIR/lifecycle-manager-jars; ln -snf "../jars/$jar" .)
done
}

function build_spark_client {
Expand Down Expand Up @@ -304,12 +320,13 @@ function sbt_build_service {

"${BUILD_COMMAND[@]}"

$SBT "celeborn-master/copyJars;celeborn-worker/copyJars;celeborn-cli/copyJars"
$SBT "celeborn-master/copyJars;celeborn-worker/copyJars;celeborn-cli/copyJars;celeborn-lifecycle-manager/copyJars"

mkdir -p "$DIST_DIR/jars"
mkdir -p "$DIST_DIR/master-jars"
mkdir -p "$DIST_DIR/worker-jars"
mkdir -p "$DIST_DIR/cli-jars"
mkdir -p "$DIST_DIR/lifecycle-manager-jars"

## Copy master jars
cp "$PROJECT_DIR"/master/target/scala-$SCALA_VERSION/celeborn-master_$SCALA_VERSION-$VERSION.jar "$DIST_DIR/master-jars/"
Expand All @@ -329,6 +346,12 @@ function sbt_build_service {
for jar in $(ls "$PROJECT_DIR/cli/target/scala-$SCALA_VERSION/jars"); do
(cd $DIST_DIR/cli-jars; ln -snf "../jars/$jar" .)
done
## Copy lifecycle-manager jars
cp "$PROJECT_DIR"/lifecycle-manager/target/scala-$SCALA_VERSION/celeborn-lifecycle-manager_$SCALA_VERSION-$VERSION.jar "$DIST_DIR/lifecycle-manager-jars/"
cp "$PROJECT_DIR"/lifecycle-manager/target/scala-$SCALA_VERSION/jars/*.jar "$DIST_DIR/jars/"
for jar in $(ls "$PROJECT_DIR/lifecycle-manager/target/scala-$SCALA_VERSION/jars"); do
(cd $DIST_DIR/lifecycle-manager-jars; ln -snf "../jars/$jar" .)
done
}

function sbt_build_client {
Expand Down
68 changes: 68 additions & 0 deletions lifecycle-manager/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
~ Licensed to the Apache Software Foundation (ASF) under one or more
~ contributor license agreements. See the NOTICE file distributed with
~ this work for additional information regarding copyright ownership.
~ The ASF licenses this file to You under the Apache License, Version 2.0
~ (the "License"); you may not use this file except in compliance with
~ the License. You may obtain a copy of the License at
~
~ http://www.apache.org/licenses/LICENSE-2.0
~
~ Unless required by applicable law or agreed to in writing, software
~ distributed under the License is distributed on an "AS IS" BASIS,
~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
~ See the License for the specific language governing permissions and
~ limitations under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>

<parent>
<groupId>org.apache.celeborn</groupId>
<artifactId>celeborn-parent_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<relativePath>../pom.xml</relativePath>
</parent>

<artifactId>celeborn-lifecycle-manager_${scala.binary.version}</artifactId>
<packaging>jar</packaging>
<name>Celeborn Lifecycle Manager</name>

<dependencies>
<!-- Compile-scope Celeborn modules this module is built against. -->
<dependency>
<groupId>org.apache.celeborn</groupId>
<artifactId>celeborn-service_${scala.binary.version}</artifactId>
<version>${project.version}</version>
</dependency>
<!-- The daemon entry point imports org.apache.celeborn.client.LifecycleManager; presumably this artifact supplies it. -->
<dependency>
<groupId>org.apache.celeborn</groupId>
<artifactId>celeborn-client_${scala.binary.version}</artifactId>
<version>${project.version}</version>
</dependency>
<!-- The daemon imports CelebornConf, Logging and Utils from org.apache.celeborn.common; presumably this artifact supplies them. -->
<dependency>
<groupId>org.apache.celeborn</groupId>
<artifactId>celeborn-common_${scala.binary.version}</artifactId>
<version>${project.version}</version>
</dependency>

<!-- Test dependencies -->
<!-- The test-jar variant of celeborn-common, used at test scope only. -->
<dependency>
<groupId>org.apache.celeborn</groupId>
<artifactId>celeborn-common_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<type>test-jar</type>
<scope>test</scope>
</dependency>
<!-- No version element on the two artifacts below: presumably managed by the parent POM; confirm. -->
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-core</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.scalatest</groupId>
<artifactId>scalatest_${scala.binary.version}</artifactId>
<scope>test</scope>
</dependency>
</dependencies>
</project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.celeborn.server.lifecyclemanager

import java.util.concurrent.CountDownLatch
import java.util.concurrent.atomic.AtomicReference

import org.apache.celeborn.client.LifecycleManager
import org.apache.celeborn.common.CelebornConf
import org.apache.celeborn.common.internal.Logging
import org.apache.celeborn.common.util.{SignalUtils, Utils}

/**
 * Command-line entry point that runs a [[LifecycleManager]] in its own JVM.
 *
 * `main` parses the arguments, builds a [[CelebornConf]], creates the
 * LifecycleManager, installs a JVM shutdown hook that stops it, and then blocks
 * until that hook releases `shutdownLatch`.
 */
object LifecycleManagerDaemon extends Logging {

  // Released by the shutdown hook; `main` and `runUntilStopped` block on it.
  private[lifecyclemanager] val shutdownLatch: CountDownLatch = new CountDownLatch(1)

  // The instance the shutdown hook will stop; stays null until one is registered.
  private[lifecyclemanager] val currentInstance: AtomicReference[LifecycleManager] =
    new AtomicReference[LifecycleManager]()

  // Process-exit indirection; package-visible and mutable, presumably so tests
  // can replace System.exit (confirm against the test suite).
  private[lifecyclemanager] var exitFn: Int => Unit =
    (code: Int) => System.exit(code)

  /**
   * Starts the daemon and blocks until the JVM begins shutting down.
   *
   * Calls `exitFn(1)` when auth is enabled on the client or when startup throws an
   * `Exception`, and `exitFn(0)` once the shutdown latch has been released.
   *
   * @param args command-line arguments, see `LifecycleManagerDaemonArguments.usage`
   */
  def main(args: Array[String]): Unit = {
    SignalUtils.registerLogger(log)

    val parsedArgs = LifecycleManagerDaemonArguments.parse(args)
    val conf = new CelebornConf()

    // Load properties file before applying CLI args
    Utils.loadDefaultCelebornProperties(conf, parsedArgs.propertiesFile.orNull)

    applyArgsToConf(parsedArgs, conf)

    // Auth check: standalone LM does not support auth (cpp/Rust client lacks SASL)
    if (conf.authEnabledOnClient) {
      logError(
        "Standalone LifecycleManager does not support auth " +
          "(cpp/Rust client lacks SASL); set celeborn.auth.enabled=false")
      exitFn(1)
      // exitFn is replaceable and may return, so stop explicitly.
      return
    }

    // Propagate --host to Utils so LifecycleManager binds to the requested hostname
    parsedArgs.host.foreach { host =>
      logInfo(s"Setting custom hostname from --host: $host")
      Utils.setCustomHostname(host)
    }

    logInfo(s"Parsed args: appId=${parsedArgs.appId}, port=${parsedArgs.port}, " +
      s"masterEndpoints=${parsedArgs.masterEndpoints}")

    try {
      val lm = new LifecycleManager(parsedArgs.appId, conf)
      currentInstance.set(lm)

      installShutdownHook(conf)

      // Written to stdout rather than the log.
      // scalastyle:off println
      println(s"LifecycleManager bound at ${lm.getHost}:${lm.getPort}")
      // scalastyle:on println

      logInfo("shutdown hook installed; press Ctrl-C to stop.")

      shutdownLatch.await()
      exitFn(0)
    } catch {
      case e: Exception =>
        logError("Failed to start LifecycleManager", e)
        exitFn(1)
    }
  }

  /**
   * Registers `lm` as the instance to stop on shutdown and blocks until the
   * shutdown latch is released.
   *
   * @param lm the LifecycleManager to register
   */
  private[lifecyclemanager] def runUntilStopped(lm: LifecycleManager): Unit = {
    currentInstance.set(lm)
    shutdownLatch.await()
  }

  /**
   * Copies the master endpoints and the listen port from the parsed arguments
   * into `conf`, overwriting any values already set for those keys.
   *
   * @param args parsed command-line arguments
   * @param conf configuration to update in place
   */
  private[lifecyclemanager] def applyArgsToConf(
      args: LifecycleManagerDaemonArguments,
      conf: CelebornConf): Unit = {
    conf.set(CelebornConf.MASTER_ENDPOINTS.key, args.masterEndpoints)
    conf.set(CelebornConf.CLIENT_SHUFFLE_MANAGER_PORT.key, args.port.toString)
  }

  /**
   * Installs a JVM shutdown hook that stops the registered LifecycleManager and
   * then releases `shutdownLatch`.
   *
   * A watchdog thread is started when the hook begins and halts the JVM with exit
   * code 2 if stopping takes longer than half of `conf.appHeartbeatTimeoutMs`.
   * The watchdog is cancelled as soon as the stop attempt finishes, so a clean
   * stop is never turned into a forced halt.
   *
   * @param conf configuration supplying the heartbeat timeout
   */
  private def installShutdownHook(conf: CelebornConf): Unit = {
    val shutdownTimeoutMs = conf.appHeartbeatTimeoutMs / 2

    // Watchdog: force halt if shutdown takes too long
    val watchdog = new Thread("celeborn-lm-shutdown-watchdog") {
      override def run(): Unit = {
        try {
          Thread.sleep(shutdownTimeoutMs)
          logError(s"Shutdown exceeded ${shutdownTimeoutMs}ms, forcing halt")
          Runtime.getRuntime.halt(2)
        } catch {
          case _: InterruptedException => // normal exit, watchdog no longer needed
        }
      }
    }
    watchdog.setDaemon(true)

    Runtime.getRuntime.addShutdownHook(new Thread("celeborn-lm-shutdown") {
      override def run(): Unit = {
        watchdog.start()
        try {
          val lm = currentInstance.get()
          if (lm != null) {
            try {
              lm.stop()
            } catch {
              // Deliberately broad: a failing stop must not prevent the latch release.
              case t: Throwable => logError("lm.stop() failed", t)
            }
          }
        } finally {
          // The stop attempt is over, so cancel the watchdog; previously nothing
          // interrupted it and its InterruptedException branch was unreachable.
          watchdog.interrupt()
          shutdownLatch.countDown()
        }
      }
    })
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.celeborn.server.lifecyclemanager

import scala.annotation.tailrec

import org.apache.celeborn.common.util.IntParam

/**
 * Parsed command-line options for the standalone LifecycleManager daemon.
 *
 * @param appId value given with `--app-id`
 * @param masterEndpoints value given with `--master-endpoints`
 * @param port value given with `--port` / `-p`
 * @param host value given with `--host` / `-h`, if present
 * @param propertiesFile value given with `--properties-file`, if present
 */
private[lifecyclemanager] case class LifecycleManagerDaemonArguments(
appId: String,
masterEndpoints: String,
port: Int,
host: Option[String],
propertiesFile: Option[String])

/**
 * Command-line parser for the standalone LifecycleManager daemon.
 *
 * Every validation failure prints a message followed by [[usage]] to stderr and
 * terminates the JVM with status 1; `--help` prints [[usage]] and exits with 0.
 */
private[lifecyclemanager] object LifecycleManagerDaemonArguments {

  // Lowest accepted port; the usage text documents the same limit.
  private val MIN_USER_PORT = 1024

  // Highest valid TCP port number.
  private val MAX_PORT = 65535

  /**
   * Prints `message` and the usage text to stderr, then exits with status 1.
   * Typed as `Nothing` so it can be used wherever a value is expected.
   */
  private def fail(message: String): Nothing = {
    // scalastyle:off println
    System.err.println(message)
    System.err.println(usage)
    // scalastyle:on println
    sys.exit(1)
  }

  /**
   * Parses `args` into a [[LifecycleManagerDaemonArguments]].
   *
   * `--app-id`, `--master-endpoints` and `--port` are required, and the port must
   * lie within `MIN_USER_PORT` to `MAX_PORT`. When a flag is repeated the last
   * occurrence wins.
   *
   * @param args raw command-line arguments
   * @return the parsed arguments; this method does not return on invalid input
   */
  def parse(args: Array[String]): LifecycleManagerDaemonArguments = {
    var appId: Option[String] = None
    var masterEndpoints: Option[String] = None
    var port: Option[Int] = None
    var host: Option[String] = None
    var propertiesFile: Option[String] = None

    @tailrec
    def doParse(remaining: List[String]): Unit = remaining match {
      case "--app-id" :: value :: tail =>
        appId = Some(value)
        doParse(tail)

      case "--master-endpoints" :: value :: tail =>
        masterEndpoints = Some(value)
        doParse(tail)

      case ("--port" | "-p") :: IntParam(value) :: tail =>
        port = Some(value)
        doParse(tail)

      case ("--host" | "-h") :: value :: tail =>
        host = Some(value)
        doParse(tail)

      case "--properties-file" :: value :: tail =>
        propertiesFile = Some(value)
        doParse(tail)

      case "--help" :: _ =>
        // scalastyle:off println
        System.err.println(usage)
        // scalastyle:on println
        sys.exit(0)

      // Reached only when the IntParam case above rejected the value.
      case ("--port" | "-p") :: value :: _ =>
        fail(s"Error: --port expects an integer, got '$value'.")

      // A known flag in last position has no value; without this case it would
      // be reported as an unknown argument.
      case (flag @ ("--app-id" | "--master-endpoints" | "--port" | "-p" |
          "--host" | "-h" | "--properties-file")) :: Nil =>
        fail(s"Error: $flag requires a value.")

      case Nil => // done

      case unknown :: _ =>
        fail(s"Unknown argument: $unknown")
    }

    doParse(args.toList)

    // Required options are checked in the same order as before.
    val requiredAppId = appId.getOrElse(fail("Error: --app-id is required."))
    val requiredMasterEndpoints =
      masterEndpoints.getOrElse(fail("Error: --master-endpoints is required."))
    val requiredPort = port.getOrElse(fail("Error: --port is required."))

    if (requiredPort < MIN_USER_PORT) {
      fail(s"Error: --port must be >= $MIN_USER_PORT, got $requiredPort.")
    }
    // Values above the TCP range were previously accepted by the parser.
    if (requiredPort > MAX_PORT) {
      fail(s"Error: --port must be <= $MAX_PORT, got $requiredPort.")
    }

    LifecycleManagerDaemonArguments(
      appId = requiredAppId,
      masterEndpoints = requiredMasterEndpoints,
      port = requiredPort,
      host = host,
      propertiesFile = propertiesFile)
  }

  /** Help text printed for `--help` and after every argument error. */
  val usage: String =
    """Usage: LifecycleManagerDaemon [options]
      |
      |Options:
      | --app-id ID Application unique identifier (required)
      | --master-endpoints ENDPOINTS Comma-separated master host:port list (required)
      | -p PORT, --port PORT Port for LifecycleManager to listen on (required, >= 1024)
      | -h HOST, --host HOST Hostname to bind (optional, default: auto-detect)
      | --properties-file FILE Path to a custom Celeborn properties file,
      | default is conf/celeborn-defaults.conf
      | --help Show this help message and exit
      |""".stripMargin
}
Loading
Loading