diff --git a/docs/index.rst b/docs/index.rst index 1c757f8c13..1f78ccdd2a 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -84,6 +84,7 @@ NVIDIA FLARE Deploy Prepare Running FLARE in Docker Running FLARE in Kubernetes + Deploying FLARE on OpenShift Brev Scripted Deployment Quickstart Brev Kubernetes Helm Deployment Preflight Check diff --git a/docs/user_guide/admin_guide/deployment/brev_deployment.rst b/docs/user_guide/admin_guide/deployment/brev_deployment.rst index bf44a6f92b..29afa76153 100644 --- a/docs/user_guide/admin_guide/deployment/brev_deployment.rst +++ b/docs/user_guide/admin_guide/deployment/brev_deployment.rst @@ -302,11 +302,15 @@ Dockerfile, install the dependency in the image: .. code-block:: dockerfile - RUN pip install kubernetes + RUN pip install "kubernetes!=36.0.0" The repository ``docker/Dockerfile.parent`` already installs the NVFlare ``K8S`` extra, which includes this dependency. Keep that install line, or add -the explicit ``pip install kubernetes`` line above before building your image. +the explicit ``pip install "kubernetes!=36.0.0"`` line above before building your image. + +The prepared Brev launcher uses in-cluster Kubernetes config +(``job_launcher.config_file_path: null``), so the parent pod authenticates with +its ServiceAccount token. .. code-block:: shell @@ -576,7 +580,9 @@ in that namespace. Copy the prepared server ``startup/`` and ``local/`` directories into the ``nvflws`` PVC. The chart starts the server with ``-m /var/tmp/nvflare/workspace``, so the PVC root must contain ``startup/`` -and ``local/`` directly. +and ``local/`` directly. The temporary copy pod image must contain ``tar`` +because ``kubectl cp`` requires it in the target container; ``busybox:1.36`` +includes ``tar``. .. code-block:: shell @@ -701,7 +707,9 @@ same launcher settings from ``/tmp/nvflare-k8s.yaml``. Keep the Helm namespace consistent with the ``namespace`` value used by ``nvflare deploy prepare``. 
Copy the prepared ``site-1`` ``startup/`` and ``local/`` directories into the -client ``nvflws`` PVC: +client ``nvflws`` PVC. The temporary copy pod image must contain ``tar`` +because ``kubectl cp`` requires it in the target container; ``busybox:1.36`` +includes ``tar``: .. code-block:: shell diff --git a/docs/user_guide/admin_guide/deployment/brev_scripts/launch_brev_nvflare.sh b/docs/user_guide/admin_guide/deployment/brev_scripts/launch_brev_nvflare.sh index 44098d9c01..7cf2c337b1 100755 --- a/docs/user_guide/admin_guide/deployment/brev_scripts/launch_brev_nvflare.sh +++ b/docs/user_guide/admin_guide/deployment/brev_scripts/launch_brev_nvflare.sh @@ -94,6 +94,11 @@ if prepared_namespace != namespace: f"Prepared launcher namespace is {prepared_namespace!r}, but launch NAMESPACE is {namespace!r}. " "Use the same NAMESPACE for prepare and launch." ) +config_file_path = args.get("config_file_path") +if config_file_path not in (None, ""): + raise SystemExit( + f"Prepared launcher config_file_path is {config_file_path!r}; expected null/empty for in-cluster config." + ) if not args.get("workspace_mount_path"): raise SystemExit("k8s_launcher args missing workspace_mount_path") @@ -197,6 +202,7 @@ spec: restartPolicy: Never containers: - name: copy + # kubectl cp requires tar in the target container; busybox includes it. 
image: busybox:1.36 command: - sh @@ -298,6 +304,14 @@ install_chart() { helm "${helm_args[@]}" } +verify_parent_kubernetes_client() { + kubectl -n "${NAMESPACE}" exec "deploy/${PARTICIPANT}" -- "${PARENT_PYTHON_PATH}" -c ' +import kubernetes + +print(f"kubernetes-python-client={kubernetes.__version__}") +' +} + if [[ "${1:-}" == "-h" || "${1:-}" == "--help" ]]; then usage exit 0 @@ -318,6 +332,7 @@ ARCHIVE="${ARCHIVE:-${HOME}/nvflare-${PARTICIPANT}.tgz}" COPY_POD="${COPY_POD:-nvflare-pvc-copy}" ROLLOUT_TIMEOUT="${ROLLOUT_TIMEOUT:-300s}" LOG_TAIL="${LOG_TAIL:-100}" +PARENT_PYTHON_PATH="${PARENT_PYTHON_PATH:-/usr/local/bin/python3}" require_cmd kubectl helm tar python3 [[ -f "${ARCHIVE}" ]] || fail "Archive not found: ${ARCHIVE}" @@ -361,6 +376,7 @@ fi install_chart kubectl -n "${NAMESPACE}" rollout status "deployment/${PARTICIPANT}" --timeout="${ROLLOUT_TIMEOUT}" +verify_parent_kubernetes_client kubectl -n "${NAMESPACE}" get pods kubectl -n "${NAMESPACE}" logs "deploy/${PARTICIPANT}" --tail="${LOG_TAIL}" || true diff --git a/docs/user_guide/admin_guide/deployment/helm_chart.rst b/docs/user_guide/admin_guide/deployment/helm_chart.rst index d7535794fc..ca5c16e707 100644 --- a/docs/user_guide/admin_guide/deployment/helm_chart.rst +++ b/docs/user_guide/admin_guide/deployment/helm_chart.rst @@ -13,8 +13,8 @@ kits and then preparing each server or client kit for the Kubernetes runtime. The prepared kit contains a participant-specific Helm chart plus the ``startup/`` and ``local/`` folders that must be staged into Kubernetes storage. -For example scripts that automate temporary Kubernetes and managed cloud cluster -testing flows, see +For example scripts that automate temporary Kubernetes, OpenShift, and managed +cloud cluster testing flows, see :github_nvflare_link:`examples/devops <examples/devops>`. These scripts are for development, smoke testing, demos, and learning only; they are not production deployment guidance. 
@@ -28,6 +28,9 @@ Before you start, make sure you have: ``nvflare deploy prepare``. * ``kubectl`` configured for the target cluster. Use a ``kubectl`` version that is compatible with the Kubernetes API server. +* ``tar`` installed locally and in any temporary pod image used with + ``kubectl cp``. The staging examples below use ``busybox:1.36``, which + includes ``tar``. * Helm 3. * A Kubernetes cluster with standard ``apps/v1`` Deployment, ``rbac.authorization.k8s.io/v1`` Role/RoleBinding, Service, Secret, and PVC @@ -94,6 +97,9 @@ The generated Helm chart does not run submitted jobs directly. It installs the parent participant process, its Kubernetes Service, its ServiceAccount, and the Role/RoleBinding that allow the launcher to create job pods. +When ``job_launcher.config_file_path`` is omitted or set to ``null``, the +launcher uses Kubernetes in-cluster config from the parent pod's ServiceAccount. + The parent Service is the stable in-cluster address for dynamically launched job pods. ``nvflare deploy prepare`` patches the prepared kit's internal communication settings to use the generated Service name and ``parent_port``. @@ -315,7 +321,9 @@ mounts ``parent.workspace_pvc`` at ``parent.workspace_mount_path``, but it does not upload files to the PVC. Copy the prepared kit's ``startup/`` and ``local/`` directories into the root of that workspace PVC before installing the chart. For server kits, also create or copy ``transfer/`` at the workspace root -for admin file-transfer storage. +for admin file-transfer storage. If you use ``kubectl cp`` as shown below, the +temporary copy pod image must contain ``tar`` because ``kubectl cp`` requires it +in the target container. 
Example ``workspace-pvc.yaml``: @@ -977,7 +985,8 @@ Check the parent logs for Kubernetes import or authorization failures: --as=system:serviceaccount:"$NAMESPACE":server If the logs show that the ``kubernetes`` Python package is missing, rebuild the -parent image with the NVFlare ``K8S`` extra or ``pip install kubernetes``. +parent image with the NVFlare ``K8S`` extra or +``pip install "kubernetes!=36.0.0"``. If the logs show ``SSLCertVerificationError`` with ``CA cert does not include key usage extension``, the parent Kubernetes client diff --git a/docs/user_guide/admin_guide/deployment/index.rst b/docs/user_guide/admin_guide/deployment/index.rst index a0e31bb8bd..fe657ebd11 100644 --- a/docs/user_guide/admin_guide/deployment/index.rst +++ b/docs/user_guide/admin_guide/deployment/index.rst @@ -13,6 +13,7 @@ Deployment Guide operation containerized_deployment helm_chart + openshift brev_scripted_deployment brev_deployment cloud_deployment diff --git a/docs/user_guide/admin_guide/deployment/openshift.rst b/docs/user_guide/admin_guide/deployment/openshift.rst new file mode 100644 index 0000000000..f9d31089de --- /dev/null +++ b/docs/user_guide/admin_guide/deployment/openshift.rst @@ -0,0 +1,31 @@ +.. _openshift_k8s_deployment: + +############################## +Deploying FLARE on OpenShift +############################## + +The OpenShift deployment guide and helper scripts now live in the DevOps +examples directory: + +``examples/devops/openshift`` + +Where to Start +============== + +Open ``examples/devops/openshift/README.md`` first for a concise folder +overview. It lists the Dockerfiles, helper scripts, and typical quickstart +commands. + +Open ``examples/devops/openshift/index.md`` for the full OpenShift deployment +guide. That source document covers prerequisites, image requirements, the +scripted workflow, manual deployment steps, OpenShift SCC notes, +troubleshooting, and cleanup. + +Run the scripts from the NVFlare repository root. For example: + +.. 
code-block:: bash + + bash examples/devops/openshift/scripts/k8s_e2e.sh + +The OpenShift example builds on the generic Kubernetes deployment runtime. See +:ref:`helm_chart` for the Kubernetes Helm chart workflow and runtime details. diff --git a/docs/user_guide/nvflare_cli/deploy_command.rst b/docs/user_guide/nvflare_cli/deploy_command.rst index 78593beab8..2de9fc7119 100644 --- a/docs/user_guide/nvflare_cli/deploy_command.rst +++ b/docs/user_guide/nvflare_cli/deploy_command.rst @@ -176,7 +176,8 @@ Top-level keys: ``job_launcher`` keys: - ``config_file_path``: kubeconfig path used by ``K8sJobLauncher``. Use - ``null`` for in-cluster config. + ``null`` for in-cluster config, where the Kubernetes Python client uses the + pod's ServiceAccount token. - ``pending_timeout``: seconds to wait for a job pod to leave ``Pending``. - ``default_python_path``: Python executable used in job pods unless a job overrides it with ``launcher_spec[site]["k8s"]["python_path"]``. Defaults to diff --git a/examples/README.md b/examples/README.md index 17aafaad13..02c28286ae 100644 --- a/examples/README.md +++ b/examples/README.md @@ -186,5 +186,6 @@ When you open a notebook, select the kernel `nvflare_example` using the dropdown | Example | Framework | Summary | |-------------------------------------------------------------|-----------|--------------------------------------------------------------------------------------------------------------------------| | [Docker Job Launcher](./docker/README.md) | NA | End-to-end Docker runtime example using `nvflare deploy prepare` and per-job Docker containers. | +| [OpenShift Deployment](./devops/openshift/README.md) | NA | OpenShift-specific deployment guide and helper scripts using the Kubernetes runtime support. | | [DevOps Deployment Examples](./devops/README.md) | NA | Test-only helper scripts for trying NVFlare deployment flows on Kubernetes and managed cloud clusters; not production deployment guidance. 
| | [Monitoring](./advanced/monitoring/README.md) | NA | FLARE Monitoring provides an initial solution for tracking system metrics of your federated learning jobs. | diff --git a/examples/devops/README.md b/examples/devops/README.md index 2e2984776d..c791e1eaa6 100644 --- a/examples/devops/README.md +++ b/examples/devops/README.md @@ -1,8 +1,8 @@ # NVFlare DevOps Examples This directory contains example scripts for quickly testing NVFlare deployment -flows on Kubernetes and managed cloud clusters. They are intended for local -development, smoke testing, demos, and learning. +flows on Kubernetes, OpenShift, and managed cloud clusters. They are intended +for local development, smoke testing, demos, and learning. These scripts are not production quality. They are not a hardened deployment blueprint and do not replace site-specific review for security, networking, @@ -11,10 +11,11 @@ operations. ## Scope -Use these examples to create temporary test clusters, build and push a test -NVFlare image, deploy a small NVFlare system, inspect it, and tear it down. -They assume you already have an NVFlare development environment and the -required cloud CLIs configured for the target accounts or projects. +Use these examples to create or target temporary test clusters, build and push +a test NVFlare image, deploy a small NVFlare system, inspect it, and tear it +down. They assume you already have an NVFlare development environment and the +required Kubernetes, OpenShift, or cloud CLIs configured for the target +clusters, accounts, or projects. Before running a deployment, copy or edit `examples/devops/multicloud/all-clouds.yaml` and replace the placeholder image tag, kubeconfig inputs, namespaces, storage @@ -24,6 +25,8 @@ classes, and participants for the clusters you want to test. - `multicloud/` - YAML-driven NVFlare deployment, status, dashboard, and image build/push helpers. 
+- `openshift/` - OpenShift-specific deployment guide and helper scripts using + the Kubernetes runtime support. - `gcp/gke/`, `aws/eks/`, `azure/aks/` - cloud cluster setup scripts and notes. - `examples/devops/.tmp/` - local generated kubeconfigs and state; not intended for commit. diff --git a/examples/devops/openshift/README.md b/examples/devops/openshift/README.md new file mode 100644 index 0000000000..1fe3fd6b96 --- /dev/null +++ b/examples/devops/openshift/README.md @@ -0,0 +1,77 @@ +# OpenShift Deployment Helpers + +This directory contains the OpenShift-specific NVFlare deployment guide and helper scripts. + +- [index.md](index.md) is the detailed OpenShift deployment guide. +- Repository `docker/Dockerfile.parent` builds the parent image used by server/client and admin pods. +- Repository `docker/Dockerfile.job` builds the workload image used by job pods. +- `scripts/create_openshift_cluster.sh` configures Red Hat OpenShift Local (CRC) and optionally starts it. +- `scripts/start_openshift_cluster.sh` starts CRC, logs in with `oc`, and prepares the target project. +- `scripts/cleanup_openshift_cluster.sh` deletes scripted deployment resources and stops CRC. +- `scripts/k8s_provision.sh` runs `nvflare provision` for the sample server, `site-1`, `site-2`, and admin. +- `scripts/k8s_deploy.sh` prepares K8s startup kits, stages PVC workspaces, installs Helm charts, and verifies parent pods can import the Kubernetes Python client. +- `scripts/k8s_submit_job.sh` submits `hello-numpy` from an in-cluster admin pod and waits for successful completion. +- `scripts/k8s_watch.sh` shows an in-place live Rich pod table for the created pods. +- `scripts/k8s_watch.py` implements the Rich table used by the shell wrapper. +- `scripts/k8s_e2e.sh` runs provision, deploy, and submit in order. + +## Create a Local OpenShift Cluster + +Use the CRC helper scripts only when you need a single-node Red Hat OpenShift +Local cluster for development or testing. 
Production OpenShift clusters are +platform-specific; create those with your organization's approved installer or +cloud service workflow, then use the deployment scripts here against that +cluster. + +Before using the local-cluster scripts, install Red Hat OpenShift Local so the +`crc` command is available, download your Red Hat OpenShift pull secret from +`https://console.redhat.com/openshift/create/local`, enable host hardware +virtualization, and make sure the host has enough CPU, memory, and disk for +OpenShift plus the NVFlare test pods. The create script defaults to 6 vCPUs, +24576 MiB memory, and 120 GiB disk. + +Use `scripts/create_openshift_cluster.sh` for first-time local CRC setup. It +validates that `crc` exists, requires `PULL_SECRET_FILE` when the cluster will +be started, writes CRC settings such as resource sizing and shared-directory +behavior, runs `crc setup`, and starts the cluster by delegating to +`scripts/start_openshift_cluster.sh` unless `START_AFTER_CREATE=false` is set. + +```bash +export PULL_SECRET_FILE="$HOME/Downloads/pull-secret.txt" +export NAMESPACE=nvflare-e2e + +bash examples/devops/openshift/scripts/create_openshift_cluster.sh +``` + +Use `scripts/start_openshift_cluster.sh` after CRC has already been configured, +or when restarting after `crc stop`. It runs `crc start` when needed, adds the +CRC-provided `oc` to `PATH` if needed, waits for OpenShift to report running, +logs in with `oc`, creates or selects `NAMESPACE`, and prints the console URL +and available StorageClasses. + +```bash +PULL_SECRET_FILE="$HOME/Downloads/pull-secret.txt" \ +bash examples/devops/openshift/scripts/start_openshift_cluster.sh +``` + +Run scripts from the repository root. Build the maintained images from `docker/Dockerfile.parent` and `docker/Dockerfile.job`, push them to a registry the cluster can pull from, then set `IMAGE` to the parent image and `JOB_IMAGE` to the workload image. 
`ADMIN_IMAGE` defaults to `IMAGE`, so the parent image can also be used for the temporary admin pod. The parent image needs NVFlare with the `K8S` extra/Kubernetes Python client. A custom `COPY_IMAGE` needs `sh`, `sleep`, and `tar`; `JOB_IMAGE` only needs `tar` when the job workload itself needs it. + +```bash +export IMAGE=registry.example.com/nvflare-parent:dev +export JOB_IMAGE=registry.example.com/nvflare-job:dev +export NAMESPACE=nvflare-e2e + +bash examples/devops/openshift/scripts/k8s_e2e.sh +``` + +The watch tool requires the Python `rich` package: + +```bash +python3 -m pip install rich +``` + +Clean up generated resources and stop OpenShift Local: + +```bash +bash examples/devops/openshift/scripts/cleanup_openshift_cluster.sh +``` diff --git a/examples/devops/openshift/index.md b/examples/devops/openshift/index.md new file mode 100644 index 0000000000..c9189860ff --- /dev/null +++ b/examples/devops/openshift/index.md @@ -0,0 +1,728 @@ +# Deploying FLARE on OpenShift + +This guide shows how to deploy NVIDIA FLARE on an existing OpenShift Kubernetes cluster. It uses the standard Kubernetes runtime support generated by `nvflare deploy prepare` and the built-in `ServerK8sJobLauncher` and `ClientK8sJobLauncher` for submitted jobs. + +OpenShift is Kubernetes, but with additional admission and runtime security controls. The important OpenShift-specific work is to use an image that can run under the default restricted Security Context Constraint (SCC), create or select a usable storage class, and make sure all registry credentials exist in the target namespace before installing FLARE. 
+ +## Directory Layout + +All OpenShift-specific deployment material is kept under this directory: + +``` text +examples/devops/openshift/ + index.md + README.md + scripts/ + create_openshift_cluster.sh + start_openshift_cluster.sh + cleanup_openshift_cluster.sh + k8s_provision.sh + k8s_deploy.sh + k8s_submit_job.sh + k8s_watch.sh + k8s_watch.py + k8s_e2e.sh + k8s_common.sh +``` + +Run the scripts from the NVFlare repository root unless a command explicitly states otherwise. The scripts share defaults through `scripts/k8s_common.sh`; that common file is an implementation detail and is not meant to be run directly. + +## Runtime Model + +FLARE runs in two layers: + + - A **parent pod** runs the long-lived FLARE server or client process. The generated Helm chart starts this pod from the prepared startup kit and mounts a workspace PVC at `/var/tmp/nvflare/workspace` by default. + - A **job pod** is created dynamically by the existing K8s job launcher when an admin submits a job. Do not create these job pods yourself. Specify their image and resources in the job's `launcher_spec`. + +For more detail on the generic Kubernetes runtime model, see `docs/user_guide/admin_guide/deployment/helm_chart.rst`. + +## Prerequisites + +Have these ready before running the scripted workflow. + +Cluster and access: + + - An OpenShift cluster, or Red Hat OpenShift Local for development testing. + - `oc` logged in to the target cluster with permission to create or update the target namespace, PVCs, Services, Deployments, ServiceAccounts, Roles, and RoleBindings. + - `helm` installed locally and able to install charts into the target namespace. + - A working default StorageClass, or a StorageClass name to pass as `STORAGE_CLASS`. + - Enough quota for three long-running parent pods, one temporary admin pod, three short-lived job pods, and one workspace PVC per server/client participant. 
+ +Local tooling: + + - A local NVFlare source checkout with NVFlare installed so the `nvflare` CLI can run `nvflare provision`, `nvflare deploy prepare`, and job export code from this repository. + - The Python `rich` package for the live pod table tool. + - `python3` and `tar` available on the local machine. `oc cp` uses local `tar` when staging prepared startup kits into PVCs. The submit script also uses local `tar` when streaming the admin startup kit and exported job into the in-cluster admin pod. + - For the local CRC helper scripts only: the `crc` command, hardware virtualization, and a Red Hat OpenShift pull secret. + +Images and registry: + + - A parent image that OpenShift nodes can pull. Set this image as `IMAGE`. It must contain this NVFlare version, Python, and NVFlare's Kubernetes dependency support because parent pods create job pods through the Kubernetes Python client. The deploy script verifies this by importing `kubernetes` inside each rolled-out parent pod. + - An admin image for the temporary in-cluster admin pod. `ADMIN_IMAGE` defaults to `IMAGE`, so the parent image can be used for this pod. It must contain NVFlare and the Python executable named by `ADMIN_PYTHON_PATH`. + - A job image for submitted workload pods. Set this image as `JOB_IMAGE` when it is different from `IMAGE`. The `hello-numpy` workflow needs NVFlare, Python, `numpy`, and the runtime tools needed by the job. + - Images must run under OpenShift's restricted SCC. In practice, they should support arbitrary UIDs and root-group writable application/workspace paths. + - If the registry is private, create image pull Secrets in the target namespace before deployment and pass their names with `PARENT_IMAGE_PULL_SECRETS` and `JOB_IMAGE_PULL_SECRETS`. + +## Host Support Matrix + +The helper scripts are Bash scripts. Run them with `bash` as shown in this guide; they are not supported under `sh`, `dash`, `zsh`, PowerShell, or `cmd.exe`. 
The scripts avoid Bash 4-only syntax and GNU-only coreutils so they can run with macOS's system Bash 3.2 and BSD userlands, provided the required external commands are installed. + +| Workflow | Host OS / Platform | Shell | Support and requirements | +| --------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------- | ----------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| Deploy to an existing OpenShift cluster: `k8s_provision.sh`, `k8s_deploy.sh`, `k8s_submit_job.sh`, and `k8s_e2e.sh` | Linux, macOS, or BSD | Bash 3.2 or newer | Supported when `python3`, `tar`, `oc`, `helm`, and `nvflare` are installed and can reach the target cluster. | +| Watch created pods: `k8s_watch.sh` and `k8s_watch.py` | Linux, macOS, or BSD | Bash 3.2 or newer for the wrapper; Python 3 for the watcher | Supported when `python3`, the Python `rich` package, and `oc` (or `KUBE_CMD`) are installed. | +| Create or start a local OpenShift Local / CRC cluster: `create_openshift_cluster.sh` and `start_openshift_cluster.sh` | Linux or macOS hosts where Red Hat OpenShift Local is installed and works | Bash 3.2 or newer | Supported by these scripts when `crc` and `oc` are available. Generic BSD hosts are not supported for this workflow unless the `crc` tool itself supports that host. | +| Native Windows shell | Windows without WSL or another Unix-like Bash environment | PowerShell, `cmd.exe`, Git Bash, or MSYS Bash | Not supported by these scripts. Use a Linux/macOS/BSD host or a Linux environment with the required Kubernetes/OpenShift tools. | +| POSIX shell invocation | Any host | `sh`, `dash`, or another POSIX-only shell | Not supported. 
The scripts intentionally use Bash features such as arrays, `[[ ... ]]`, `(( ... ))`, `pipefail`, and `BASH_SOURCE`. | + +The command examples in this guide avoid Linux-only utilities such as `getent` and GNU-only options such as `find -printf`. Commands implemented by external tools, such as `oc --sort-by`, `oc wait --timeout`, `helm upgrade --install`, and `crc start` flags, depend on those tools rather than the host operating system's core utilities. + +## Create a Local OpenShift Cluster + +If you do not already have an OpenShift cluster, the helper scripts can create and start a single-node Red Hat OpenShift Local cluster for development and testing. Production OpenShift clusters are platform-specific; create those with your organization's approved installer or cloud service workflow, then continue with the deployment steps below. + +Scripts: + + - [create_openshift_cluster.sh](scripts/create_openshift_cluster.sh) + - [start_openshift_cluster.sh](scripts/start_openshift_cluster.sh) + +### Local Cluster Prerequisites + +Install and prepare: + + - Red Hat OpenShift Local, which provides the `crc` command. + - A Red Hat account and the OpenShift pull secret from `https://console.redhat.com/openshift/create/local`. + - Hardware virtualization enabled on the host. The [OpenShift Local documentation](https://crc.dev/docs/getting-started/) lists the platform-specific minimum CPU, memory, disk, and hypervisor requirements. + - Enough host capacity for OpenShift plus FLARE test pods. The script defaults to `6` vCPUs, `24576` MiB memory, and `120` GiB disk; lower values may work for only the cluster but can be tight for the end-to-end FLARE workflow. + - Network access to download the OpenShift Local bundle and pull cluster images, unless you provide a local bundle with `CRC_BUNDLE`. + - Local administrator privileges when `crc setup` needs to configure host networking, DNS, or virtualization support. 
+ +Create the CRC configuration and start the first cluster instance: + +``` bash +export PULL_SECRET_FILE="$HOME/Downloads/pull-secret.txt" +export NAMESPACE=nvflare-e2e + +bash examples/devops/openshift/scripts/create_openshift_cluster.sh +``` + +OpenShift Local creates the VM during the first `crc start`. The create script applies the CRC configuration, runs `crc setup`, then delegates to the start script. + +What `create_openshift_cluster.sh` does: + + - validates that `crc` exists and that `PULL_SECRET_FILE` is set when the cluster will be started; + - writes CRC settings for CPU, memory, disk, preset, proxy, bundle, update checks, cluster monitoring, and shared-directory behavior; + - runs `crc setup` to prepare host networking and virtualization support; + - starts the cluster by calling `start_openshift_cluster.sh` unless `START_AFTER_CREATE=false` is set. + +Expected result: + + - with the default `START_AFTER_CREATE=true`, `crc status` eventually shows `CRC VM: Running` and `OpenShift: Running`; + - `oc whoami` reports the configured OpenShift user, `developer` by default; + - the namespace named by `NAMESPACE` exists or is selected; + - the command prints the OpenShift console URL and available StorageClasses. + +To configure the host without starting the cluster: + +``` bash +START_AFTER_CREATE=false \ +bash examples/devops/openshift/scripts/create_openshift_cluster.sh +``` + +Start the cluster later, or restart it after `crc stop`: + +``` bash +PULL_SECRET_FILE="$HOME/Downloads/pull-secret.txt" \ +bash examples/devops/openshift/scripts/start_openshift_cluster.sh +``` + +The start script runs `crc start` when needed, adds the CRC-provided `oc` to `PATH` if `oc` is not already installed, waits for `crc status` to report `OpenShift: Running`, logs in to `https://api.crc.testing:6443` as `developer/developer` by default, creates or selects `NAMESPACE`, and prints the console URL and StorageClasses. 
+ +What `start_openshift_cluster.sh` does: + + - starts the CRC VM if it is not already running; + - waits until the OpenShift API inside the VM reports `Running`; + - logs in with `oc login` unless `LOGIN_OPENSHIFT=false`; + - creates or selects `NAMESPACE` unless `CREATE_PROJECT=false`; + - prints basic cluster context and storage information for the later scripts. + +Expected result: + + - `crc status` reports `OpenShift: Running`; + - `oc whoami` succeeds; + - `oc project` points at `NAMESPACE` when project creation/selection is enabled. + +### Local Cluster Environment + +Common variables for `create_openshift_cluster.sh`: + +| Variable | Purpose | +| --------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `PULL_SECRET_FILE` | Path to the OpenShift pull secret. Required for the first non-interactive `crc start`. | +| `CRC_CPUS` | vCPU count written to CRC config. Default: `6`. | +| `CRC_MEMORY` | Memory in MiB written to CRC config. Default: `24576`. | +| `CRC_DISK_SIZE` | Disk size in GiB written to CRC config. Default: `120`. | +| `CRC_ENABLE_CLUSTER_MONITORING` | Enable OpenShift cluster monitoring before setup. Default: `false`. | +| `CRC_BUNDLE` | Optional local file path or remote bundle reference. | +| `CRC_HTTP_PROXY`, `CRC_HTTPS_PROXY`, `CRC_NO_PROXY` | Optional proxy settings written to CRC config. | +| `CRC_ENABLE_SHARED_DIRS` | Mount the host home directory into the CRC VM. Default: `false` for this workflow because nonstandard home paths such as `/localhome/` can fail during CRC shared-directory setup. | +| `START_AFTER_CREATE` | Start the cluster after `crc setup`. Default: `true`. 
| + +Common variables for `start_openshift_cluster.sh`: + +| Variable | Purpose | +| ----------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `CRC_DISABLE_UPDATE_CHECK` | Pass `--disable-update-check` to `crc start`. Default: `true`. | +| `CRC_NAMESERVER` | Optional IPv4 DNS server passed to `crc start`. | +| `CRC_ENABLE_SHARED_DIRS` | Mount the host home directory into the CRC VM. Default: `false`. | +| `CRC_OPENSHIFT_READY_TIMEOUT` | Seconds to wait for `crc status` to report `OpenShift: Running` before login. Default: `900`. | +| `CRC_OPENSHIFT_READY_INTERVAL` | Seconds between `crc status` checks. Default: `10`. | +| `LOGIN_OPENSHIFT` | Log in with `oc login` after start. Default: `true`. | +| `OPENSHIFT_API_URL` | API endpoint. Default: `https://api.crc.testing:6443`. | +| `OPENSHIFT_USER` and `OPENSHIFT_PASSWORD` | Login credentials. Default: `developer` / `developer`. Use `crc console --credentials` to retrieve the generated `kubeadmin` password if you prefer that account. | +| `CREATE_PROJECT` | Create or select the project named by `NAMESPACE`. Default: `true`. | +| `NAMESPACE` | Project used by the scripted FLARE deployment. Default: `nvflare-e2e`. | + +## Build an OpenShift-Compatible Image + +The maintained NVFlare Dockerfiles are in the repository `docker/` directory. Use the parent image for long-running server/client parent pods and the temporary admin pod. Use a job image for the submitted `hello-numpy` job pods: + +``` bash +export PARENT_IMAGE=registry.example.com/nvflare-parent:dev +export WORKLOAD_IMAGE=registry.example.com/nvflare-job:dev + +docker build -t "$PARENT_IMAGE" -f docker/Dockerfile.parent . +docker build -t "$WORKLOAD_IMAGE" -f docker/Dockerfile.job . 
+ +docker push "$PARENT_IMAGE" +docker push "$WORKLOAD_IMAGE" +``` + +Set the OpenShift workflow image variables from those pushed images: + +``` bash +export IMAGE="$PARENT_IMAGE" +export JOB_IMAGE="$WORKLOAD_IMAGE" +``` + +`docker/Dockerfile.parent` installs NVFlare with the `K8S` extra because parent processes create job pods with the Kubernetes Python client. Its final stage is a minimal distroless Python image. The scripted submit phase can use this image as `ADMIN_IMAGE` because the admin pod starts with Python and the script stages files through Python instead of requiring shell utilities or `tar` inside the admin container. + +`docker/Dockerfile.job` installs NVFlare into an NGC PyTorch runtime image. For this scripted workflow it is suitable as the `hello-numpy` job image. If your cluster cannot pull the default `COPY_IMAGE` from Docker Hub, set `COPY_IMAGE` to an image that contains `sh`, `sleep`, and `tar`. + +If you use one custom all-purpose image instead of separate parent and workload images, leave `JOB_IMAGE` unset and set `IMAGE` to that image. It must satisfy the parent, admin, and job requirements. + +If you use your own Dockerfile, make sure it supports arbitrary UIDs. OpenShift may run the container with a generated UID from the project range, so the image must not require a fixed user or root-owned writable directories. 
+ +## Create Image Pull Secrets + +If your registry is private, create an image pull Secret before running the deployment: + +``` bash +export NAMESPACE=nvflare-e2e + +oc new-project "$NAMESPACE" +oc -n "$NAMESPACE" create secret docker-registry registry-credentials \ + --docker-server=registry.example.com \ + --docker-username="$REGISTRY_USER" \ + --docker-password="$REGISTRY_PASSWORD" \ + --docker-email="$REGISTRY_EMAIL" +``` + +Pass the Secret name to deployment preparation: + +``` bash +export PARENT_IMAGE_PULL_SECRETS=registry-credentials +export JOB_IMAGE_PULL_SECRETS=registry-credentials +``` + +## Scripted Workflow + +The scripted workflow is split into three phases so provisioning, deployment, and job execution can be verified independently: + + - [k8s_provision.sh](scripts/k8s_provision.sh) + - [k8s_deploy.sh](scripts/k8s_deploy.sh) + - [k8s_submit_job.sh](scripts/k8s_submit_job.sh) + +Two additional helpers are provided: + + - [k8s_watch.sh](scripts/k8s_watch.sh) shows an in-place live concise pod table for pods created by the scripts. + - [k8s_e2e.sh](scripts/k8s_e2e.sh) runs the three phase scripts in order. + +### Shared Script Environment + +The phase scripts share these common variables: + +| Variable | Purpose | +| -------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `KUBE_CMD` | Kubernetes CLI command. Default: `oc`. | +| `NAMESPACE` | OpenShift project/namespace used by the deployment. Default: `nvflare-e2e`. | +| `PROJECT_NAME` | NVFlare provisioned project name. Default: `openshift_nvflare_e2e`. | +| `SERVER_NAME` | Server participant, Helm release, Deployment, and Service base name. Default: `nvflare-server`. | +| `SERVER_HOST` | Host written into the startup kits. 
Default: `nvflare-server`, which resolves as the in-namespace Service name. | +| `CLIENTS` | Space-separated client names. Default: `site-1 site-2`. | +| `ADMIN_USER` | Admin participant name. Default: `admin@nvidia.com`. | +| `WORK_DIR` | Local working directory for generated project files, startup kits, prepared kits, job files, and the last job ID. Default: `/tmp/nvflare/openshift-e2e`. | +| `IMAGE` | Required by the deployment phase. Cluster-pullable parent image with NVFlare, the `K8S` extra, and the Python executable named by `PARENT_PYTHON_PATH`. | +| `COPY_IMAGE` | Image for temporary PVC copy pods. Default: `busybox:1.36`. Set this to an internal image if your cluster cannot pull from Docker Hub. The image must contain `sh`, `sleep`, and `tar` because the scripts use `oc cp` to stage files into PVCs. | +| `STORAGE_CLASS` | Optional StorageClass for generated workspace PVCs. Leave unset to use the cluster default. | +| `WORKSPACE_STORAGE` | Per-participant workspace PVC request. Default: `2Gi`. | +| `PARENT_CPU` and `PARENT_MEMORY` | Optional resource requests for long-running parent pods, for example `500m` and `1Gi`. | +| `PARENT_PYTHON_PATH` | Python command used by parent pods. Default: `python`, matching `docker/Dockerfile.parent`. | +| `ADMIN_PYTHON_PATH` | Python command used by the temporary admin pod. Default: `PARENT_PYTHON_PATH`. | +| `JOB_IMAGE` | Image for dynamically created job pods. Default: `IMAGE`. Set this to the workload image when `IMAGE` is parent-only. | +| `ADMIN_IMAGE` | Image for the temporary in-cluster admin pod. Default: `IMAGE`. The parent image can be used when it contains NVFlare and the Python executable named by `ADMIN_PYTHON_PATH`. | +| `PARENT_IMAGE_PULL_SECRETS` and `JOB_IMAGE_PULL_SECRETS` | Space-separated image pull Secret names that already exist in `NAMESPACE`. 
| + +### Provision Startup Kits + +Run: + +``` bash +bash examples/devops/openshift/scripts/k8s_provision.sh +``` + +Prerequisites: + + - `nvflare` and `python3` are available locally. + - The command is run from a checkout where the local `nvflare` command imports this repository's code. + - `WORK_DIR` is writable. By default this script deletes and recreates `/tmp/nvflare/openshift-e2e`. To reuse the directory, set `CLEAN_WORK_DIR=false`. + +What it does: + + - writes `$WORK_DIR/project.yml` for one server, two clients, and one admin; + - sets the server host to `SERVER_HOST` so the generated kits point at the OpenShift Service by default; + - runs `nvflare provision -p "$WORK_DIR/project.yml" -w "$WORK_DIR/workspace" --force`; + - verifies that the generated production directory exists. + +Expected result: + +``` text +/tmp/nvflare/openshift-e2e/workspace/openshift_nvflare_e2e/prod_00/ + admin@nvidia.com/ + nvflare-server/ + site-1/ + site-2/ +``` + +The script prints the generated participant folder names. It does not contact OpenShift and does not require `IMAGE`. + +### Deploy Parent Pods + +Run: + +``` bash +export IMAGE=registry.example.com/nvflare-parent:dev +export NAMESPACE=nvflare-e2e + +bash examples/devops/openshift/scripts/k8s_deploy.sh +``` + +Prerequisites: + + - `k8s_provision.sh` completed successfully with the same `WORK_DIR`, `PROJECT_NAME`, `SERVER_NAME`, `CLIENTS`, and `ADMIN_USER` settings. + - `oc` is logged in and can create or update resources in `NAMESPACE`. + - `helm` is installed locally. + - `IMAGE` is set to a parent image that the cluster can pull and run under the restricted SCC. + - The namespace has registry pull Secrets if the image is private. + - The cluster has a default StorageClass, or `STORAGE_CLASS` is set. + - `COPY_IMAGE` is pullable by the cluster and contains `sh`, `sleep`, and `tar`. `oc cp` requires `tar` in the target container when staging prepared startup files into workspace PVCs. 
+ +What it does: + + - runs `nvflare deploy prepare` for `nvflare-server`, `site-1`, and `site-2` using `runtime: k8s`; + - verifies that server and client prepared kits contain the Kubernetes job launcher components configured for in-cluster ServiceAccount auth; + - creates `NAMESPACE` if it does not already exist; + - creates one workspace PVC per server/client participant; + - creates temporary copy pods, waits for PVC binding, and copies each prepared kit's `startup/` and `local/` directories into its PVC; + - installs or upgrades the generated Helm chart for each participant; + - restarts existing Deployments after an upgrade so repeated runs pick up new startup-kit content from the PVC; + - waits for all parent Deployments to roll out; + - verifies that each parent pod can import the Kubernetes Python client needed by the launcher. + +Expected result: + +``` bash +oc -n nvflare-e2e get deploy,pods,svc,pvc +``` + +The server and client Deployments should show `1/1` ready. The parent pods should be `Running`, Services should exist for the server and clients, and all workspace PVCs should be `Bound`. + +### Submit and Verify a Job + +Run: + +``` bash +export IMAGE=registry.example.com/nvflare-parent:dev +export JOB_IMAGE=registry.example.com/nvflare-job:dev +export NAMESPACE=nvflare-e2e + +bash examples/devops/openshift/scripts/k8s_submit_job.sh +``` + +Prerequisites: + + - `k8s_deploy.sh` completed successfully and the parent server and client Deployments are ready. + - `ADMIN_IMAGE` or `IMAGE` contains NVFlare and the Python executable named by `ADMIN_PYTHON_PATH`. The default `ADMIN_IMAGE=IMAGE` can use the parent image. + - `JOB_IMAGE` or `IMAGE` contains NVFlare, Python, `numpy`, and the runtime tools needed by the job pods. + - The generated ServiceAccounts and RBAC from `nvflare deploy prepare` are still present in the namespace. 
+ +What it does: + + - verifies the parent Deployments are rolled out; + - exports the repository's `examples/hello-world/hello-numpy` job to `$WORK_DIR/jobs/hello-numpy-k8s`; + - writes `launcher_spec.default.k8s` into the job's `meta.json` with `JOB_IMAGE`, Python path, and optional resource settings; + - starts a temporary `nvflare-admin` pod inside the namespace; + - copies the admin startup kit and exported job into that pod; + - submits the job from inside the cluster; + - waits until the Kubernetes launcher creates at least one server job pod and one job pod per client; + - waits for `nvflare job wait` to report `FINISHED:COMPLETED`; + - stores the submitted job ID in `$WORK_DIR/last_job_id` for the watch script. + +Expected result: + + - the command prints a JSON result whose `data.status` is `FINISHED:COMPLETED`; + - `oc -n "$NAMESPACE" get pods` shows the temporary `nvflare-admin` pod plus completed job pods whose names start with the normalized job ID; + - the long-running parent pods remain `Running`. + +### Watch Resource State + +Run once: + +``` bash +bash examples/devops/openshift/scripts/k8s_watch.sh --once +``` + +Run as a live view: + +``` bash +bash examples/devops/openshift/scripts/k8s_watch.sh --interval 3 +``` + +Prerequisites: + + - `oc` can read resources in `NAMESPACE`. + - `python3` can import the `rich` package. + +What it does: + + - prints the namespace, work directory, and UTC timestamp; + - prints the last submitted job ID when `$WORK_DIR/last_job_id` exists; + - shows a Rich pod table sorted by creation time; + - refreshes the same terminal block in place until interrupted unless `--once` is set. + +Expected result: + +The live table should show long-running parent pods as `Running` and launcher-created job pods as `Completed` after the submit phase finishes. The table uses the standard concise pod columns: name, readiness, status, restarts, and age. 
+ +### Run the Whole Workflow + +Run: + +``` bash +export IMAGE=registry.example.com/nvflare-parent:dev +export JOB_IMAGE=registry.example.com/nvflare-job:dev +export NAMESPACE=nvflare-e2e + +bash examples/devops/openshift/scripts/k8s_e2e.sh +``` + +Prerequisites: + + - all prerequisites for the provision, deploy, and submit scripts are satisfied; + - `IMAGE` is set before the deploy phase starts; + - `JOB_IMAGE` is set when `IMAGE` is parent-only. + +What it does: + + - runs `k8s_provision.sh` with `CLEAN_WORK_DIR=true` by default; + - runs `k8s_deploy.sh` against the generated startup kits; + - runs `k8s_submit_job.sh` against the deployed parent pods. + +Expected result: + +The final phase should report `FINISHED:COMPLETED`. The namespace should contain ready parent pods and completed launcher-created job pods. If the workflow is repeated against an existing namespace, the deploy phase restarts existing Deployments so they read the newly generated startup-kit content. + +If your cluster has no default StorageClass, pass `STORAGE_CLASS`: + +``` bash +STORAGE_CLASS=ocs-storagecluster-ceph-rbd \ +IMAGE="$IMAGE" \ +JOB_IMAGE="$JOB_IMAGE" \ +bash examples/devops/openshift/scripts/k8s_e2e.sh +``` + +The scripted path intentionally submits the job from inside the cluster. This keeps the quickstart independent of external Routes, load balancers, and TCP ingress configuration. + +## Manual Deployment + +Use this path when you want to adapt the generated artifacts to your own organization layout. + +### Provision Participants + +Use either standard centralized provisioning: + +``` bash +nvflare provision -p project.yml -w workspace +``` + +or distributed provisioning with `nvflare cert` and `nvflare package`. The result is a normal production directory such as: + +``` text +workspace/<project_name>/prod_00/ + nvflare-server/ + site-1/ + site-2/ + admin@nvidia.com/ +``` + +### Prepare K8s Startup Kits + +Create a Kubernetes runtime config for each participant.
Use a unique workspace PVC per participant unless your storage class and operational model support safe shared access. + +Example `k8s-server.yaml`: + +``` yaml +runtime: k8s +namespace: nvflare +server_service_name: nvflare-server +parent: + docker_image: registry.example.com/nvflare-parent:dev + image_pull_secrets: + - registry-credentials + parent_port: 8102 + workspace_pvc: nvflare-ws-server + workspace_mount_path: /var/tmp/nvflare/workspace + python_path: python +job_launcher: + config_file_path: + default_python_path: /usr/local/bin/python3 + image_pull_secrets: + - registry-credentials + pending_timeout: 300 +``` + +Prepare each server or client: + +``` bash +nvflare deploy prepare workspace/<project_name>/prod_00/nvflare-server \ + --output /tmp/nvflare-prepared/nvflare-server \ + --config k8s-server.yaml + +nvflare deploy prepare workspace/<project_name>/prod_00/site-1 \ + --output /tmp/nvflare-prepared/site-1 \ + --config k8s-site-1.yaml +``` + +After preparation, verify that `local/resources.json.default` contains `k8s_launcher`. Server kits should use `nvflare.app_opt.job_launcher.k8s_launcher.ServerK8sJobLauncher` and client kits should use `nvflare.app_opt.job_launcher.k8s_launcher.ClientK8sJobLauncher`. For in-cluster deployments, `config_file_path` should be `null` or empty so the launcher uses the parent pod's ServiceAccount token. NVFlare handles Kubernetes Python client 36.x token values that already include the `bearer` scheme prefix. + +### Create PVCs and Stage Kits + +Create the namespace and workspace PVCs: + +``` bash +oc new-project nvflare +oc -n nvflare apply -f workspace-pvc.yaml +oc -n nvflare get pvc +``` + +The Helm chart mounts the workspace PVC, but it does not upload the prepared startup kit. Copy the prepared `startup/` and `local/` directories into the PVC root before installing the chart. One common method is a temporary copy pod.
The copy pod image must contain `tar` because `oc cp` requires it in the target container: + +``` bash +export NAMESPACE=nvflare +export WORKSPACE_PVC=nvflare-ws-server +export PREPARED_KIT=/tmp/nvflare-prepared/nvflare-server + +oc -n "$NAMESPACE" delete pod nvflare-pvc-copy --ignore-not-found=true +oc -n "$NAMESPACE" run nvflare-pvc-copy \ + --image=busybox:1.36 \ + --restart=Never \ + --overrides='{ + "spec": { + "volumes": [ + {"name": "ws", "persistentVolumeClaim": {"claimName": "'"${WORKSPACE_PVC}"'"}} + ], + "containers": [ + { + "name": "copy", + "image": "busybox:1.36", + "command": ["sh", "-c", "sleep 3600"], + "volumeMounts": [{"name": "ws", "mountPath": "/mnt/ws"}] + } + ] + } + }' +oc -n "$NAMESPACE" wait --for=condition=Ready pod/nvflare-pvc-copy --timeout=120s +oc -n "$NAMESPACE" exec nvflare-pvc-copy -- rm -rf /mnt/ws/startup /mnt/ws/local +oc -n "$NAMESPACE" exec nvflare-pvc-copy -- mkdir -p /mnt/ws/startup /mnt/ws/local +oc -n "$NAMESPACE" cp "$PREPARED_KIT/startup/." nvflare-pvc-copy:/mnt/ws/startup +oc -n "$NAMESPACE" cp "$PREPARED_KIT/local/." nvflare-pvc-copy:/mnt/ws/local +oc -n "$NAMESPACE" delete pod nvflare-pvc-copy +``` + +Repeat for every participant, using that participant's prepared kit and workspace PVC. + +### Install Helm Charts + +Install the generated chart for each participant: + +``` bash +helm upgrade --install nvflare-server \ + /tmp/nvflare-prepared/nvflare-server/helm_chart \ + --namespace nvflare + +helm upgrade --install site-1 \ + /tmp/nvflare-prepared/site-1/helm_chart \ + --namespace nvflare +``` + +Wait for rollouts: + +``` bash +oc -n nvflare rollout status deployment/nvflare-server --timeout=300s +oc -n nvflare rollout status deployment/site-1 --timeout=300s +oc -n nvflare get pods,svc,pvc +``` + +## Submit a Job + +The job pod image belongs in the submitted job's `meta.json`. 
The launcher uses `launcher_spec` to create the job pods: + +``` json +{ + "launcher_spec": { + "default": { + "k8s": { + "image": "registry.example.com/nvflare-job:dev", + "python_path": "/usr/local/bin/python3", + "cpu": "1", + "memory": "2Gi", + "ephemeral_storage": "1Gi" + } + } + } +} +``` + +Submit from a machine that can reach the FLARE admin endpoint: + +``` bash +nvflare job submit -j /path/to/job \ + --startup-kit workspace/<project_name>/prod_00/admin@nvidia.com +``` + +For an in-cluster-only deployment, run the admin CLI in a temporary pod using an image that contains NVFlare and Python. The scripted quickstart can use the parent image for this pod. If you manually copy the admin startup kit or job into that pod with `oc cp`, `tar` must be present in the pod image. + +## Expose Admin and FL Traffic + +The scripted quickstart avoids external exposure by running the admin pod inside the same namespace. For production, expose the server only through your organization's approved TCP routing pattern. + +The generated server Service is a normal Kubernetes Service. OpenShift Routes are HTTP/TLS-oriented and are not generally a drop-in replacement for FLARE's raw TCP FL/admin ports. Use an approved TCP ingress/load balancer pattern, or run admins and clients in-cluster. The server host used during provisioning must resolve to the address clients and admins actually use. + +## OpenShift SCC Notes + +OpenShift's default restricted SCC is stricter than many Kubernetes clusters. For FLARE deployments: + + - Parent pods must run with the generated service account and must be able to read/write the mounted workspace PVC. + - Job pods created by the K8s launcher must run under the same namespace SCC restrictions. + - Admin pods used for automation must also run without root-only assumptions. + - Images should support arbitrary UIDs and root-group writable application and workspace paths.
+ - Avoid host networking, host paths, privileged containers, and host ports unless your cluster administrator explicitly grants them. + +If a pod is rejected or fails immediately, inspect the SCC and pod events: + +``` bash +oc -n nvflare get pod <pod-name> -o jsonpath='{.metadata.annotations.openshift\.io/scc}{"\n"}' +oc -n nvflare describe pod <pod-name> +oc -n nvflare get events --sort-by=.lastTimestamp +``` + +Prefer fixing the image to run under the restricted SCC. Binding broader SCCs such as `anyuid` is a cluster security exception and should be reviewed by the cluster administrator. + +## Troubleshooting + +### OpenShift Local reports `Unreachable` + +If `crc status` shows `CRC VM: Running` but `OpenShift: Unreachable`, the VM is up but the OpenShift API inside it is not ready or failed during startup. CRC does not provide a `crc logs` command in current builds. Use the CRC log file or restart with debug logging: + +``` bash +crc status +tail -n 200 "$HOME/.crc/crc.log" + +crc stop +crc start --pull-secret-file "$PULL_SECRET_FILE" --disable-update-check \ + --log-level debug +``` + +Also verify local DNS for the CRC API endpoint: + +``` bash +python3 -c 'import socket; print(socket.gethostbyname("api.crc.testing"))' +curl -k https://api.crc.testing:6443/readyz?verbose +``` + +If `api.crc.testing` does not resolve, rerun `crc setup` and restart CRC. If DNS resolves but the API stays unreachable, inspect `$HOME/.crc/crc.log` for image pull failures, host networking/DNS failures, disk pressure, or OpenShift service startup errors.
+ +If `crc.log` fails during `Configuring shared directories` and your host home directory is outside a standard local home path, disable CRC shared directories and restart the VM: + +``` bash +export PULL_SECRET_FILE="$HOME/Downloads/pull-secret.txt" +export CRC_ENABLE_SHARED_DIRS=false + +crc stop || true +crc config set enable-shared-dirs false +bash examples/devops/openshift/scripts/start_openshift_cluster.sh +``` + +The create and start scripts default `CRC_ENABLE_SHARED_DIRS` to `false`. Do not override `HOME` for `crc` unless you also set up CRC's daemon under that same home directory. + +### Pods remain Pending + +Check PVC binding, image pull failures, resource requests, and scheduler events: + +``` bash +oc -n nvflare get pods,pvc +oc -n nvflare describe pod +oc -n nvflare get events --sort-by=.lastTimestamp +``` + +### Job shows `FINISHED:EXECUTION_EXCEPTION` + +The K8s launcher deletes a job pod if it remains `Pending` or `Unknown` longer than `job_launcher.pending_timeout`. Inspect the job pod events and the parent pod logs: + +``` bash +oc -n nvflare get pods +oc -n nvflare logs deploy/nvflare-server --tail=200 +oc -n nvflare logs deploy/site-1 --tail=200 +``` + +### Admin cannot connect + +Confirm that the admin startup kit was packaged for the endpoint you exposed. If you changed external DNS or ports after provisioning, regenerate the affected signed zips or centralized startup kits. + +### Clients cannot connect + +Verify that the server host in the client startup kit resolves from the client runtime environment. In single-namespace deployments, using the server Service name, for example `nvflare-server`, is the simplest path. 
+ +## Clean Up + +To delete scripted deployment resources and stop OpenShift Local: + +``` bash +bash examples/devops/openshift/scripts/cleanup_openshift_cluster.sh +``` + +By default, the cleanup script removes the generated Helm releases, temporary admin/copy pods, the last submitted job pods recorded under `WORK_DIR`, and workspace PVCs in `NAMESPACE` before running `crc stop`. + +Delete the namespace only if it is dedicated to this test. This removes all resources in the project before stopping CRC: + +``` bash +bash examples/devops/openshift/scripts/cleanup_openshift_cluster.sh --delete-namespace +``` + +Useful cleanup options: + + - `--no-stop`: delete resources but leave CRC running. + - `--delete-work-dir`: also delete `WORK_DIR` when it is under `/tmp/nvflare`. + - `--keep-namespace`: keep the namespace even when `DELETE_NAMESPACE=true` is set in the environment. + +Depending on the storage reclaim policy, PVC-backed volumes may remain after PVCs or the namespace are deleted. diff --git a/examples/devops/openshift/scripts/cleanup_openshift_cluster.sh b/examples/devops/openshift/scripts/cleanup_openshift_cluster.sh new file mode 100755 index 0000000000..db40a498f9 --- /dev/null +++ b/examples/devops/openshift/scripts/cleanup_openshift_cluster.sh @@ -0,0 +1,247 @@ +#!/usr/bin/env bash +set -euo pipefail + +usage() { + cat <<'EOF' +Delete NVFlare OpenShift scripted deployment resources and stop Red Hat +OpenShift Local (CRC). + +By default this script deletes the resources created by the scripted NVFlare +workflow in NAMESPACE, then runs `crc stop`. Use --delete-namespace only when +the project is dedicated to this test; it deletes everything in that namespace. 
+ +Usage: + bash cleanup_openshift_cluster.sh [--delete-namespace] [--keep-namespace] [--no-stop] [--delete-work-dir] + +Common environment: + KUBE_CMD=oc + HELM_BIN=helm + CRC_BIN=crc + NAMESPACE=nvflare-e2e + SERVER_NAME=nvflare-server + CLIENTS="site-1 site-2" + ADMIN_POD=nvflare-admin + WORK_DIR=/tmp/nvflare/openshift-e2e + STOP_CLUSTER=true + DELETE_NAMESPACE=false + DELETE_WORK_DIR=false + +Examples: + bash examples/devops/openshift/scripts/cleanup_openshift_cluster.sh + + DELETE_NAMESPACE=true \ + bash examples/devops/openshift/scripts/cleanup_openshift_cluster.sh +EOF +} + +info() { + echo + echo "==> $*" +} + +warn() { + echo "WARNING: $*" >&2 +} + +is_truthy() { + case "${1:-}" in + 1|[Tt][Rr][Uu][Ee]|[Yy][Ee][Ss]|[Yy]) + return 0 + ;; + *) + return 1 + ;; + esac +} + +require_cmd() { + local cmd + for cmd in "$@"; do + command -v "$cmd" >/dev/null 2>&1 || { + warn "Required command not found: $cmd" + return 1 + } + done +} + +safe_name() { + # Print $1 as a lowercase name made of [a-z0-9-] that starts with a letter and is at most 63 characters long. + local name + name="$(printf '%s' "$1" | tr '[:upper:]_' '[:lower:]-' | sed 's/[^a-z0-9-]/-/g; s/-\{1,\}/-/g; s/^-//; s/-$//')" + if [[ -z "${name}" ]]; then + name="site" + fi + case "${name}" in + [a-z]*) + ;; + *) + name="site-${name}" + ;; + esac + # Truncate last so the "site-" prefix counts toward the 63-character limit, as the Python-based safe_name helper does. + printf '%s\n' "$(printf '%s' "${name}" | cut -c 1-63 | sed 's/-$//')" +} + +normalize_job_id() { + local name + name="$(printf '%s' "$1" | tr '[:upper:]' '[:lower:]' | sed 's/[^a-z0-9-]//g')" + case "${name}" in + [0-9]*) + name="j${name}" + ;; + esac + printf '%s\n' "$(printf '%s' "${name}" | cut -c 1-63 | sed 's/-$//')" +} + +parse_args() { + while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + usage + exit 0 + ;; + --delete-namespace) + DELETE_NAMESPACE=true + ;; + --keep-namespace) + DELETE_NAMESPACE=false + ;; + --no-stop) + STOP_CLUSTER=false + ;; + --delete-work-dir) + DELETE_WORK_DIR=true + ;; + *) + warn "Unknown argument: $1" + usage + exit 1 + ;; + esac + shift + done +} + +namespace_exists() { + "${KUBE_CMD}" get namespace "${NAMESPACE}" >/dev/null 2>&1 +} + +delete_namespace() { + if !
namespace_exists; then + info "Namespace ${NAMESPACE} does not exist" + return + fi + + info "Deleting namespace ${NAMESPACE}" + "${KUBE_CMD}" delete namespace "${NAMESPACE}" --ignore-not-found=true +} + +delete_known_job_pods() { + # Best-effort removal of pods whose names start with the normalized job ID stored in ${WORK_DIR}/last_job_id. + # Always returns 0 so a missing, empty, or unusable job ID cannot abort the remaining cleanup under `set -e`. + local job_id_file="${WORK_DIR}/last_job_id" + local raw_job_id="" + local normalized_job_id="" + + if [[ ! -f "${job_id_file}" ]]; then + return 0 + fi + + raw_job_id="$(sed -n '1p' "${job_id_file}")" + [[ -n "${raw_job_id}" ]] || return 0 + normalized_job_id="$(normalize_job_id "${raw_job_id}")" + # An empty prefix would turn the pattern below into pod/* and match every pod in the namespace. + [[ -n "${normalized_job_id}" ]] || return 0 + # The trailing `|| true` keeps a failed listing or delete from tripping errexit/pipefail, like the other cleanup steps. + "${KUBE_CMD}" -n "${NAMESPACE}" get pods -o name 2>/dev/null \ + | while IFS= read -r pod; do + case "${pod}" in + pod/"${normalized_job_id}"*) + "${KUBE_CMD}" -n "${NAMESPACE}" delete "${pod}" --ignore-not-found=true + ;; + esac + done || true +} + +delete_generated_resources() { + local participant + local safe + + if ! namespace_exists; then + info "Namespace ${NAMESPACE} does not exist" + return + fi + + info "Uninstalling Helm releases" + for participant in ${SERVER_NAME} ${CLIENTS}; do + "${HELM_BIN}" uninstall "${participant}" -n "${NAMESPACE}" >/dev/null 2>&1 || true + done + + info "Deleting temporary pods and last submitted job pods" + "${KUBE_CMD}" -n "${NAMESPACE}" delete pod "${ADMIN_POD}" --ignore-not-found=true >/dev/null 2>&1 || true + for participant in ${SERVER_NAME} ${CLIENTS}; do + safe="$(safe_name "${participant}")" + "${KUBE_CMD}" -n "${NAMESPACE}" delete pod "nvflare-copy-${safe}" --ignore-not-found=true >/dev/null 2>&1 || true + done + delete_known_job_pods + + info "Deleting generated workspace PVCs" + for participant in ${SERVER_NAME} ${CLIENTS}; do + safe="$(safe_name "${participant}")" + "${KUBE_CMD}" -n "${NAMESPACE}" delete pvc "nvflare-ws-${safe}" --ignore-not-found=true >/dev/null 2>&1 || true + done +} + +delete_work_dir() { + if !
is_truthy "${DELETE_WORK_DIR}"; then + return + fi + + case "${WORK_DIR}" in + /tmp/nvflare/*) + info "Deleting work directory ${WORK_DIR}" + rm -rf "${WORK_DIR}" + ;; + *) + warn "Refusing to delete WORK_DIR outside /tmp/nvflare: ${WORK_DIR}" + ;; + esac +} + +stop_cluster() { + if ! is_truthy "${STOP_CLUSTER}"; then + return + fi + + if ! command -v "${CRC_BIN}" >/dev/null 2>&1; then + warn "CRC command not found: ${CRC_BIN}; skipping cluster stop" + return + fi + + info "Stopping OpenShift Local" + "${CRC_BIN}" stop +} + +KUBE_CMD="${KUBE_CMD:-oc}" +HELM_BIN="${HELM_BIN:-helm}" +CRC_BIN="${CRC_BIN:-crc}" +NAMESPACE="${NAMESPACE:-nvflare-e2e}" +SERVER_NAME="${SERVER_NAME:-nvflare-server}" +CLIENTS="${CLIENTS:-site-1 site-2}" +ADMIN_POD="${ADMIN_POD:-nvflare-admin}" +WORK_DIR="${WORK_DIR:-/tmp/nvflare/openshift-e2e}" +STOP_CLUSTER="${STOP_CLUSTER:-true}" +DELETE_NAMESPACE="${DELETE_NAMESPACE:-false}" +DELETE_WORK_DIR="${DELETE_WORK_DIR:-false}" + +parse_args "$@" + +if is_truthy "${DELETE_NAMESPACE}"; then + if require_cmd "${KUBE_CMD}"; then + delete_namespace + else + warn "Skipping namespace cleanup because the Kubernetes CLI is missing" + fi +else + if require_cmd "${KUBE_CMD}" "${HELM_BIN}"; then + delete_generated_resources + else + warn "Skipping resource cleanup because required Kubernetes tools are missing" + fi +fi + +delete_work_dir +stop_cluster diff --git a/examples/devops/openshift/scripts/create_openshift_cluster.sh b/examples/devops/openshift/scripts/create_openshift_cluster.sh new file mode 100755 index 0000000000..7ba7d75075 --- /dev/null +++ b/examples/devops/openshift/scripts/create_openshift_cluster.sh @@ -0,0 +1,180 @@ +#!/usr/bin/env bash +set -euo pipefail + +usage() { + cat <<'EOF' +Configure and create a local single-node OpenShift cluster for NVFlare testing +with Red Hat OpenShift Local (CRC). + +OpenShift Local creates the cluster VM during the first `crc start`. 
This script +sets the CRC configuration, runs `crc setup`, and starts the cluster by default +through start_openshift_cluster.sh. + +Usage: + bash create_openshift_cluster.sh [--setup-only] + +Required for the first non-interactive start: + PULL_SECRET_FILE Path to the OpenShift pull secret downloaded from + https://console.redhat.com/openshift/create/local + +Common environment: + CRC_BIN=crc + CRC_PRESET=openshift + CRC_CPUS=6 + CRC_MEMORY=24576 + CRC_DISK_SIZE=120 + CRC_ENABLE_CLUSTER_MONITORING=false + CRC_BUNDLE= + CRC_HTTP_PROXY= + CRC_HTTPS_PROXY= + CRC_NO_PROXY= + CRC_ENABLE_SHARED_DIRS=false + START_AFTER_CREATE=true + +Environment passed to start_openshift_cluster.sh: + OC_BIN=oc + NAMESPACE=nvflare-e2e + CREATE_PROJECT=true + LOGIN_OPENSHIFT=true + OPENSHIFT_USER=developer + OPENSHIFT_PASSWORD=developer + OPENSHIFT_API_URL=https://api.crc.testing:6443 + +Examples: + PULL_SECRET_FILE=$HOME/Downloads/pull-secret.txt \ + bash examples/devops/openshift/scripts/create_openshift_cluster.sh + + CRC_CPUS=8 CRC_MEMORY=32768 CRC_DISK_SIZE=160 \ + PULL_SECRET_FILE=$HOME/Downloads/pull-secret.txt \ + bash examples/devops/openshift/scripts/create_openshift_cluster.sh + + START_AFTER_CREATE=false \ + bash examples/devops/openshift/scripts/create_openshift_cluster.sh +EOF +} + +fail() { + echo "ERROR: $*" >&2 + exit 1 +} + +info() { + echo + echo "==> $*" +} + +require_cmd() { + local cmd + for cmd in "$@"; do + command -v "$cmd" >/dev/null 2>&1 || fail "Required command not found: $cmd" + done +} + +is_truthy() { + case "${1:-}" in + 1|[Tt][Rr][Uu][Ee]|[Yy][Ee][Ss]|[Yy]) + return 0 + ;; + *) + return 1 + ;; + esac +} + +parse_args() { + while [[ $# -gt 0 ]]; do + case "$1" in + -h|--help) + usage + exit 0 + ;; + --setup-only) + START_AFTER_CREATE=false + ;; + *) + fail "Unknown argument: $1" + ;; + esac + shift + done +} + +crc_config_set() { + local key=$1 + local value=$2 + + [[ -n "${value}" ]] || return 0 + info "Setting crc config ${key}=${value}" + "${CRC_BIN}" 
config set "${key}" "${value}" +} + +validate_pull_secret_for_noninteractive_start() { + if ! is_truthy "${START_AFTER_CREATE}"; then + return + fi + + if [[ -n "${PULL_SECRET_FILE}" ]]; then + [[ -r "${PULL_SECRET_FILE}" ]] || fail "PULL_SECRET_FILE is not readable: ${PULL_SECRET_FILE}" + return + fi + + [[ -t 0 ]] || fail "Set PULL_SECRET_FILE for non-interactive cluster creation, or set START_AFTER_CREATE=false." +} + +if [[ "${1:-}" == "-h" || "${1:-}" == "--help" ]]; then + usage + exit 0 +fi + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +CRC_BIN="${CRC_BIN:-crc}" +CRC_PRESET="${CRC_PRESET:-openshift}" +CRC_CPUS="${CRC_CPUS:-6}" +CRC_MEMORY="${CRC_MEMORY:-24576}" +CRC_DISK_SIZE="${CRC_DISK_SIZE:-120}" +CRC_ENABLE_CLUSTER_MONITORING="${CRC_ENABLE_CLUSTER_MONITORING:-false}" +CRC_BUNDLE="${CRC_BUNDLE:-}" +CRC_HTTP_PROXY="${CRC_HTTP_PROXY:-}" +CRC_HTTPS_PROXY="${CRC_HTTPS_PROXY:-}" +CRC_NO_PROXY="${CRC_NO_PROXY:-}" +CRC_ENABLE_SHARED_DIRS="${CRC_ENABLE_SHARED_DIRS:-false}" +PULL_SECRET_FILE="${PULL_SECRET_FILE:-}" +START_AFTER_CREATE="${START_AFTER_CREATE:-true}" + +parse_args "$@" +require_cmd "${CRC_BIN}" +validate_pull_secret_for_noninteractive_start + +if [[ -n "${CRC_HOME_DIR:-}" ]]; then + info "Ignoring CRC_HOME_DIR=${CRC_HOME_DIR}; use CRC_ENABLE_SHARED_DIRS=false for nonstandard home paths." 
+fi + +info "Configuring OpenShift Local" +crc_config_set preset "${CRC_PRESET}" +crc_config_set cpus "${CRC_CPUS}" +crc_config_set memory "${CRC_MEMORY}" +crc_config_set disk-size "${CRC_DISK_SIZE}" +crc_config_set enable-cluster-monitoring "${CRC_ENABLE_CLUSTER_MONITORING}" +crc_config_set bundle "${CRC_BUNDLE}" +crc_config_set http-proxy "${CRC_HTTP_PROXY}" +crc_config_set https-proxy "${CRC_HTTPS_PROXY}" +crc_config_set no-proxy "${CRC_NO_PROXY}" +crc_config_set enable-shared-dirs "${CRC_ENABLE_SHARED_DIRS}" + +info "Running crc setup" +"${CRC_BIN}" setup + +if is_truthy "${START_AFTER_CREATE}"; then + info "Starting OpenShift Local" + bash "${SCRIPT_DIR}/start_openshift_cluster.sh" +else + cat <&2 + exit 1 +} + +info() { + echo + echo "==> $*" +} + +require_cmd() { + local cmd + for cmd in "$@"; do + command -v "$cmd" >/dev/null 2>&1 || fail "Required command not found: $cmd" + done +} + +is_truthy() { + case "${1:-}" in + 1|[Tt][Rr][Uu][Ee]|[Yy][Ee][Ss]|[Yy]) + return 0 + ;; + *) + return 1 + ;; + esac +} + +safe_name() { + python3 - "$1" <<'PY' +import re +import sys + +name = re.sub(r"[^a-z0-9-]", "-", sys.argv[1].lower()) +name = re.sub(r"-+", "-", name).strip("-") or "site" +if not name[0].isalpha(): + name = f"site-{name}" +print(name[:63].rstrip("-")) +PY +} + +json_data_field() { + local field=$1 + python3 -c ' +import json +import sys + +field = sys.argv[1] +payload = json.load(sys.stdin) +data = payload.get("data") or {} +value = data +for part in field.split("."): + value = value[part] +print(value) +' "$field" +} + +normalize_job_id() { + python3 - "$1" <<'PY' +import re +import sys + +name = sys.argv[1].lower() +name = re.sub(r"[^a-z0-9-]", "", name) +if name and name[0].isdigit(): + name = "j" + name +print(name[:63].rstrip("-")) +PY +} + +append_yaml_secret_list() { + local file=$1 + local indent=$2 + local key=$3 + local values=$4 + local name + + [[ -n "${values}" ]] || return 0 + printf "%*s%s:\n" "${indent}" "" "${key}" >>"${file}" + for 
name in ${values}; do + printf "%*s- %s\n" "$((indent + 2))" "" "${name}" >>"${file}" + done +} + +append_k8s_image_pull_secrets() { + local file=$1 + local indent=$2 + local values=$3 + local name + + [[ -n "${values}" ]] || return 0 + printf "%*simagePullSecrets:\n" "${indent}" "" >>"${file}" + for name in ${values}; do + printf "%*s- name: %s\n" "$((indent + 2))" "" "${name}" >>"${file}" + done +} + +init_k8s_env() { + local require_image=${1:-false} + + SCRIPT_DIR="${SCRIPT_DIR:-$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)}" + REPO_ROOT="${REPO_ROOT:-$(cd "${SCRIPT_DIR}/../../../../../.." && pwd)}" + + KUBE_CMD="${KUBE_CMD:-oc}" + PROJECT_NAME="${PROJECT_NAME:-openshift_nvflare_e2e}" + NAMESPACE="${NAMESPACE:-nvflare-e2e}" + SERVER_NAME="${SERVER_NAME:-nvflare-server}" + SERVER_SERVICE_NAME="${SERVER_SERVICE_NAME:-${SERVER_NAME}}" + SERVER_HOST="${SERVER_HOST:-${SERVER_SERVICE_NAME}}" + CLIENTS="${CLIENTS:-site-1 site-2}" + ADMIN_USER="${ADMIN_USER:-admin@nvidia.com}" + ADMIN_ROLE="${ADMIN_ROLE:-lead}" + ORG="${ORG:-nvidia}" + FED_LEARN_PORT="${FED_LEARN_PORT:-8002}" + ADMIN_PORT="${ADMIN_PORT:-8003}" + PARENT_PORT="${PARENT_PORT:-8102}" + WORKSPACE_MOUNT_PATH="${WORKSPACE_MOUNT_PATH:-/var/tmp/nvflare/workspace}" + PARENT_PYTHON_PATH="${PARENT_PYTHON_PATH:-python}" + PARENT_CPU="${PARENT_CPU:-}" + PARENT_MEMORY="${PARENT_MEMORY:-}" + ADMIN_PYTHON_PATH="${ADMIN_PYTHON_PATH:-${PARENT_PYTHON_PATH}}" + JOB_PYTHON_PATH="${JOB_PYTHON_PATH:-/usr/local/bin/python3}" + JOB_PENDING_TIMEOUT="${JOB_PENDING_TIMEOUT:-300}" + WORKSPACE_STORAGE="${WORKSPACE_STORAGE:-2Gi}" + STORAGE_CLASS="${STORAGE_CLASS:-}" + WORK_DIR="${WORK_DIR:-/tmp/nvflare/openshift-e2e}" + CLEAN_WORK_DIR="${CLEAN_WORK_DIR:-false}" + ALLOW_DELETE_OUTSIDE_TMP="${ALLOW_DELETE_OUTSIDE_TMP:-false}" + DELETE_NAMESPACE_ON_EXIT="${DELETE_NAMESPACE_ON_EXIT:-false}" + DELETE_ADMIN_POD_ON_EXIT="${DELETE_ADMIN_POD_ON_EXIT:-false}" + POD_READY_TIMEOUT="${POD_READY_TIMEOUT:-180s}" + 
ROLLOUT_TIMEOUT="${ROLLOUT_TIMEOUT:-300s}" + JOB_WAIT_TIMEOUT="${JOB_WAIT_TIMEOUT:-900}" + JOB_WAIT_INTERVAL="${JOB_WAIT_INTERVAL:-5}" + JOB_POD_APPEAR_TIMEOUT="${JOB_POD_APPEAR_TIMEOUT:-180}" + NVFLARE_CONNECT_TIMEOUT="${NVFLARE_CONNECT_TIMEOUT:-10}" + NUM_ROUNDS="${NUM_ROUNDS:-1}" + COPY_IMAGE="${COPY_IMAGE:-busybox:1.36}" + PARENT_IMAGE_PULL_SECRETS="${PARENT_IMAGE_PULL_SECRETS:-}" + JOB_IMAGE_PULL_SECRETS="${JOB_IMAGE_PULL_SECRETS:-${PARENT_IMAGE_PULL_SECRETS}}" + JOB_CPU="${JOB_CPU:-}" + JOB_MEMORY="${JOB_MEMORY:-}" + JOB_EPHEMERAL_STORAGE="${JOB_EPHEMERAL_STORAGE:-1Gi}" + SUBMIT_TOKEN="${SUBMIT_TOKEN:-openshift-e2e-$(date +%Y%m%d%H%M%S)}" + + if [[ "${require_image}" == "true" ]]; then + : "${IMAGE:?Set IMAGE to a cluster-pullable NVFlare image before running this script.}" + fi + JOB_IMAGE="${JOB_IMAGE:-${IMAGE:-}}" + ADMIN_IMAGE="${ADMIN_IMAGE:-${IMAGE:-}}" + + read -r -a CLIENT_ARRAY <<<"${CLIENTS}" + CLIENT_COUNT="${#CLIENT_ARRAY[@]}" + ((CLIENT_COUNT > 0)) || fail "CLIENTS must contain at least one client site" + + PARTICIPANTS=("${SERVER_NAME}" "${CLIENT_ARRAY[@]}" "${ADMIN_USER}") + RUNTIME_PARTICIPANTS=("${SERVER_NAME}" "${CLIENT_ARRAY[@]}") + + PROJECT_FILE="${WORK_DIR}/project.yml" + PACKAGE_WORKSPACE="${WORK_DIR}/workspace" + PREPARED_DIR="${WORK_DIR}/prepared" + PREPARE_CONFIG_DIR="${WORK_DIR}/prepare-configs" + JOB_DIR="${WORK_DIR}/jobs/hello-numpy-k8s" + ADMIN_POD="${ADMIN_POD:-nvflare-admin}" + ADMIN_POD_FILE="${WORK_DIR}/admin-pod.yaml" + PROD_DIR="${PACKAGE_WORKSPACE}/${PROJECT_NAME}/prod_00" + LAST_JOB_ID_FILE="${WORK_DIR}/last_job_id" + + [[ -d "${REPO_ROOT}" ]] || fail "REPO_ROOT does not exist: ${REPO_ROOT}" +} + +clean_work_dir_if_requested() { + if [[ "${CLEAN_WORK_DIR}" != "true" ]]; then + return + fi + + case "${WORK_DIR}" in + /tmp/nvflare/*) + rm -rf "${WORK_DIR}" + ;; + *) + [[ "${ALLOW_DELETE_OUTSIDE_TMP}" == "true" ]] || { + fail "Refusing to delete WORK_DIR outside /tmp/nvflare: ${WORK_DIR}. 
Set ALLOW_DELETE_OUTSIDE_TMP=true to allow." + } + rm -rf "${WORK_DIR}" + ;; + esac +} + +ensure_work_dirs() { + mkdir -p "${PACKAGE_WORKSPACE}" "${PREPARED_DIR}" "${PREPARE_CONFIG_DIR}" "$(dirname "${JOB_DIR}")" +} + +require_provisioned_workspace() { + local participant + + [[ -d "${PROD_DIR}" ]] || fail "Provisioned prod dir not found: ${PROD_DIR}. Run k8s_provision.sh first." + for participant in "${PARTICIPANTS[@]}"; do + [[ -d "${PROD_DIR}/${participant}" ]] || fail "Missing provisioned participant folder: ${PROD_DIR}/${participant}" + done +} + +write_project_file() { + local participant + local host + local seen_hosts="" + + cat >"${PROJECT_FILE}" <>"${PROJECT_FILE}" <>"${PROJECT_FILE}" <>"${PROJECT_FILE}" <>"${PROJECT_FILE}" <"${file}" <>"${file}" <>"${file}" <>"${file}" <>"${file}" <>"${file}" <>"${file}" <>"${file}" </dev/null 2>&1; then + return + fi + + if [[ "${KUBE_CMD}" == "oc" ]]; then + "${KUBE_CMD}" new-project "${NAMESPACE}" >/dev/null + else + "${KUBE_CMD}" create namespace "${NAMESPACE}" >/dev/null + fi +} + +write_pvc_manifest() { + local file=$1 + local pvc=$2 + local storage_class_line="" + + if [[ -n "${STORAGE_CLASS}" ]]; then + storage_class_line=" storageClassName: ${STORAGE_CLASS}" + fi + + cat >"${file}" </dev/null || true)" + if [[ "${phase}" == "Bound" ]]; then + return 0 + fi + sleep 1 + done + + "${KUBE_CMD}" -n "${NAMESPACE}" describe pvc "${pvc}" || true + fail "PVC ${pvc} did not become Bound; last phase=${phase:-unknown}" +} + +write_copy_pod_manifest() { + local file=$1 + local pod=$2 + local pvc=$3 + + cat >"${file}" </dev/null + "${KUBE_CMD}" -n "${NAMESPACE}" apply -f "${pod_file}" >/dev/null + wait_for_pvc_bound "${pvc}" 180 + "${KUBE_CMD}" -n "${NAMESPACE}" wait --for=condition=Ready "pod/${pod}" --timeout="${POD_READY_TIMEOUT}" + + "${KUBE_CMD}" -n "${NAMESPACE}" exec "${pod}" -- rm -rf /mnt/nvflare-workspace/startup /mnt/nvflare-workspace/local + "${KUBE_CMD}" -n "${NAMESPACE}" exec "${pod}" -- mkdir -p 
/mnt/nvflare-workspace/startup /mnt/nvflare-workspace/local + "${KUBE_CMD}" -n "${NAMESPACE}" cp "${PREPARED_DIR}/${participant}/startup/." "${pod}:/mnt/nvflare-workspace/startup" + "${KUBE_CMD}" -n "${NAMESPACE}" cp "${PREPARED_DIR}/${participant}/local/." "${pod}:/mnt/nvflare-workspace/local" + "${KUBE_CMD}" -n "${NAMESPACE}" exec "${pod}" -- ls -la /mnt/nvflare-workspace/startup /mnt/nvflare-workspace/local + "${KUBE_CMD}" -n "${NAMESPACE}" delete pod "${pod}" --ignore-not-found=true >/dev/null +} + +install_chart() { + local participant=$1 + local deployment_existed=false + + if "${KUBE_CMD}" -n "${NAMESPACE}" get "deployment/${participant}" >/dev/null 2>&1; then + deployment_existed=true + fi + helm upgrade --install "${participant}" "${PREPARED_DIR}/${participant}/helm_chart" --namespace "${NAMESPACE}" + if [[ "${deployment_existed}" == "true" ]]; then + "${KUBE_CMD}" -n "${NAMESPACE}" rollout restart "deployment/${participant}" + fi + "${KUBE_CMD}" -n "${NAMESPACE}" rollout status "deployment/${participant}" --timeout="${ROLLOUT_TIMEOUT}" +} + +verify_runtime_deployments() { + local participant + + for participant in "${RUNTIME_PARTICIPANTS[@]}"; do + "${KUBE_CMD}" -n "${NAMESPACE}" rollout status "deployment/${participant}" --timeout="${ROLLOUT_TIMEOUT}" + done +} + +verify_parent_kubernetes_client() { + local participant=$1 + + "${KUBE_CMD}" -n "${NAMESPACE}" exec "deploy/${participant}" -- "${PARENT_PYTHON_PATH}" -c ' +import kubernetes + +print(f"kubernetes-python-client={kubernetes.__version__}") +' +} + +export_hello_numpy_job() { + local job_dir=$1 + local job_parent + + job_parent="$(dirname "${job_dir}")" + rm -rf "${job_dir}" + mkdir -p "${job_parent}" + python3 - "${REPO_ROOT}" "${job_parent}" "${CLIENT_COUNT}" "${NUM_ROUNDS}" <<'PY' +import os +import pathlib +import sys + +repo_root, job_parent, client_count, num_rounds = sys.argv[1:5] +example_dir = os.path.join(repo_root, "examples", "hello-world", "hello-numpy") +sys.path.insert(0, repo_root) 
+os.chdir(example_dir) + +from nvflare.app_common.np.recipes.fedavg import NumpyFedAvgRecipe +from nvflare.client.config import TransferType + +recipe = NumpyFedAvgRecipe( + name="hello-numpy-k8s", + min_clients=int(client_count), + num_rounds=int(num_rounds), + model=[[1, 2, 3], [4, 5, 6], [7, 8, 9]], + train_script="client.py", + train_args="--update_type full", + launch_external_process=False, + params_transfer_type=TransferType.FULL, +) +recipe.export(job_parent) +job_dir = pathlib.Path(job_parent) / "hello-numpy-k8s" +if not (job_dir / "meta.json").is_file(): + raise SystemExit(f"expected exported job at {job_dir}") +PY +} + +patch_job_launcher_spec() { + local job_dir=$1 + + python3 - "${job_dir}" "${JOB_IMAGE}" "${JOB_PYTHON_PATH}" "${JOB_CPU}" "${JOB_MEMORY}" "${JOB_EPHEMERAL_STORAGE}" <<'PY' +import json +import pathlib +import sys + +job_dir, image, python_path, cpu, memory, ephemeral_storage = sys.argv[1:7] +meta_path = pathlib.Path(job_dir) / "meta.json" +with meta_path.open("r", encoding="utf-8") as f: + meta = json.load(f) + +k8s_spec = { + "image": image, + "python_path": python_path, + "ephemeral_storage": ephemeral_storage, +} +if cpu: + k8s_spec["cpu"] = cpu +if memory: + k8s_spec["memory"] = memory + +meta.setdefault("launcher_spec", {}) +meta["launcher_spec"].setdefault("default", {}) +meta["launcher_spec"]["default"]["k8s"] = k8s_spec + +with meta_path.open("w", encoding="utf-8") as f: + json.dump(meta, f, indent=2) + f.write("\n") +PY +} + +write_admin_pod_manifest() { + cat >"${ADMIN_POD_FILE}" </dev/null + write_admin_pod_manifest + "${KUBE_CMD}" -n "${NAMESPACE}" apply -f "${ADMIN_POD_FILE}" >/dev/null + "${KUBE_CMD}" -n "${NAMESPACE}" wait --for=condition=Ready "pod/${ADMIN_POD}" --timeout="${POD_READY_TIMEOUT}" + copy_dir_to_admin_pod "${PROD_DIR}/${ADMIN_USER}" /workspace/admin + copy_dir_to_admin_pod "${JOB_DIR}" /workspace/job + admin_pod_file_exists /workspace/admin/startup/fed_admin.json + admin_pod_file_exists 
/workspace/job/meta.json +} + +wait_for_job_pods() { + local normalized_job_id=$1 + local min_count=$2 + local timeout_seconds=$3 + local count + local i + + for ((i = 0; i < timeout_seconds; i++)); do + count="$("${KUBE_CMD}" -n "${NAMESPACE}" get pods -o json | python3 -c ' +import json +import sys + +prefix = sys.argv[1] +items = json.load(sys.stdin).get("items") or [] +matches = [item for item in items if item.get("metadata", {}).get("name", "").startswith(prefix)] +print(len(matches)) +' "${normalized_job_id}")" + if ((count >= min_count)); then + "${KUBE_CMD}" -n "${NAMESPACE}" get pods | grep "^${normalized_job_id}" || true + return 0 + fi + sleep 1 + done + + "${KUBE_CMD}" -n "${NAMESPACE}" get pods + fail "Expected at least ${min_count} K8s launcher job pods with prefix ${normalized_job_id}; found ${count:-0}" +} + +submit_and_wait_for_job() { + local submit_out + local wait_out + local job_id + local normalized_job_id + local min_job_pods + local job_status + + submit_out="$("${KUBE_CMD}" -n "${NAMESPACE}" exec "${ADMIN_POD}" -- \ + "${ADMIN_PYTHON_PATH}" -m nvflare.cli --format json --connect-timeout "${NVFLARE_CONNECT_TIMEOUT}" \ + job submit -j /workspace/job --startup-kit /workspace/admin --submit-token "${SUBMIT_TOKEN}")" + job_id="$(printf '%s' "${submit_out}" | json_data_field job_id)" + [[ -n "${job_id}" ]] || fail "Job submission did not return a job_id: ${submit_out}" + mkdir -p "$(dirname "${LAST_JOB_ID_FILE}")" + printf "%s\n" "${job_id}" >"${LAST_JOB_ID_FILE}" + echo "Submitted job_id=${job_id}" + + normalized_job_id="$(normalize_job_id "${job_id}")" + min_job_pods="${MIN_JOB_PODS:-$((CLIENT_COUNT + 1))}" + wait_for_job_pods "${normalized_job_id}" "${min_job_pods}" "${JOB_POD_APPEAR_TIMEOUT}" + + wait_out="$("${KUBE_CMD}" -n "${NAMESPACE}" exec "${ADMIN_POD}" -- \ + "${ADMIN_PYTHON_PATH}" -m nvflare.cli --format json --connect-timeout "${NVFLARE_CONNECT_TIMEOUT}" \ + job wait "${job_id}" --startup-kit /workspace/admin --timeout 
"${JOB_WAIT_TIMEOUT}" --interval "${JOB_WAIT_INTERVAL}")" + echo "${wait_out}" + job_status="$(printf '%s' "${wait_out}" | json_data_field status)" + [[ "${job_status}" == "FINISHED:COMPLETED" ]] || fail "Job ${job_id} finished with status ${job_status}" +} + +report_missing_pieces() { + cat </dev/null 2>&1 || true + elif is_truthy "${DELETE_ADMIN_POD_ON_EXIT}"; then + "${KUBE_CMD}" -n "${NAMESPACE}" delete pod "${ADMIN_POD}" --ignore-not-found=true >/dev/null 2>&1 || true + fi +} + +run_provision_phase() { + require_cmd nvflare python3 + clean_work_dir_if_requested + ensure_work_dirs + + info "Writing nvflare provision project file" + write_project_file + + info "Running nvflare provision" + nvflare provision -p "${PROJECT_FILE}" -w "${PACKAGE_WORKSPACE}" --force + [[ -d "${PROD_DIR}" ]] || fail "Expected packaged prod dir not found: ${PROD_DIR}" + + info "Provisioned startup kits" + for participant_dir in "${PROD_DIR}"/*; do + [[ -d "${participant_dir}" ]] || continue + basename "${participant_dir}" + done | sort +} + +run_deploy_phase() { + local participant + local pvc + local cfg + local pvc_file + + require_cmd "${KUBE_CMD}" helm nvflare python3 tar + require_provisioned_workspace + ensure_work_dirs + + info "Preparing K8s deployment kits with the built-in K8s launcher" + for participant in "${RUNTIME_PARTICIPANTS[@]}"; do + pvc="nvflare-ws-$(safe_name "${participant}")" + cfg="$(write_prepare_config "${participant}" "${pvc}")" + nvflare deploy prepare "${PROD_DIR}/${participant}" --output "${PREPARED_DIR}/${participant}" --config "${cfg}" + normalize_prepared_resources "${participant}" + done + verify_prepared_launcher "${SERVER_NAME}" server + for participant in "${CLIENT_ARRAY[@]}"; do + verify_prepared_launcher "${participant}" client + done + + info "Creating namespace and staging workspace PVCs" + create_namespace + "${KUBE_CMD}" -n "${NAMESPACE}" get serviceaccount default >/dev/null + for participant in "${RUNTIME_PARTICIPANTS[@]}"; do + 
pvc="nvflare-ws-$(safe_name "${participant}")" + pvc_file="${WORK_DIR}/pvc-${participant}.yaml" + write_pvc_manifest "${pvc_file}" "${pvc}" + "${KUBE_CMD}" -n "${NAMESPACE}" apply -f "${pvc_file}" + stage_workspace_pvc "${participant}" "${pvc}" + done + + info "Installing generated Helm charts" + install_chart "${SERVER_NAME}" + for participant in "${CLIENT_ARRAY[@]}"; do + install_chart "${participant}" + done + verify_runtime_deployments + info "Verifying parent images include the Kubernetes Python client" + for participant in "${RUNTIME_PARTICIPANTS[@]}"; do + verify_parent_kubernetes_client "${participant}" + done + "${KUBE_CMD}" -n "${NAMESPACE}" get pods,svc,pvc +} + +run_submit_job_phase() { + require_cmd "${KUBE_CMD}" nvflare python3 tar + require_provisioned_workspace + [[ -n "${ADMIN_IMAGE}" ]] || fail "Set IMAGE or ADMIN_IMAGE before running this script." + [[ -n "${JOB_IMAGE}" ]] || fail "Set IMAGE or JOB_IMAGE before running this script." + verify_runtime_deployments + + info "Exporting hello-numpy job and configuring launcher_spec.default.k8s" + export_hello_numpy_job "${JOB_DIR}" + patch_job_launcher_spec "${JOB_DIR}" + python3 -m json.tool "${JOB_DIR}/meta.json" >/dev/null + + info "Submitting job from an in-cluster admin pod" + prepare_admin_pod + submit_and_wait_for_job + + info "Job workflow completed successfully" + "${KUBE_CMD}" -n "${NAMESPACE}" get pods + report_missing_pieces +} diff --git a/examples/devops/openshift/scripts/k8s_deploy.sh b/examples/devops/openshift/scripts/k8s_deploy.sh new file mode 100755 index 0000000000..67aabde71e --- /dev/null +++ b/examples/devops/openshift/scripts/k8s_deploy.sh @@ -0,0 +1,48 @@ +#!/usr/bin/env bash +set -euo pipefail + +usage() { + cat <<'EOF' +Prepare NVFlare K8s startup kits, deploy them into OpenShift/Kubernetes, and +verify the parent server/client pods are running. + +Run k8s_provision.sh before this script. + +Required environment: + IMAGE Parent container image pullable by the cluster. 
It must contain this + NVFlare version with the K8S extra/Kubernetes Python client and the + Python executable named by PARENT_PYTHON_PATH. + +Common optional environment: + KUBE_CMD=oc + NAMESPACE=nvflare-e2e + PROJECT_NAME=openshift_nvflare_e2e + SERVER_NAME=nvflare-server + CLIENTS="site-1 site-2" + WORK_DIR=/tmp/nvflare/openshift-e2e + PARENT_PYTHON_PATH=python + STORAGE_CLASS= + WORKSPACE_STORAGE=2Gi + COPY_IMAGE=busybox:1.36 # must contain sh, sleep, and tar for oc cp + PARENT_CPU= + PARENT_MEMORY= + PARENT_IMAGE_PULL_SECRETS="registry-secret another-secret" + JOB_IMAGE_PULL_SECRETS="registry-secret another-secret" + +Example: + IMAGE=registry.example.com/nvflare-parent:dev \ + bash examples/devops/openshift/scripts/k8s_deploy.sh +EOF +} + +if [[ "${1:-}" == "-h" || "${1:-}" == "--help" ]]; then + usage + exit 0 +fi + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=k8s_common.sh +source "${SCRIPT_DIR}/k8s_common.sh" + +init_k8s_env true +run_deploy_phase diff --git a/examples/devops/openshift/scripts/k8s_e2e.sh b/examples/devops/openshift/scripts/k8s_e2e.sh new file mode 100755 index 0000000000..fd224c0c39 --- /dev/null +++ b/examples/devops/openshift/scripts/k8s_e2e.sh @@ -0,0 +1,66 @@ +#!/usr/bin/env bash +set -euo pipefail + +usage() { + cat <<'EOF' +Run the complete OpenShift/Kubernetes NVFlare e2e workflow by invoking the +three phase scripts in order: + + 1. k8s_provision.sh + 2. k8s_deploy.sh + 3. k8s_submit_job.sh + +Required environment: + IMAGE Parent container image pullable by the cluster. It must contain this + NVFlare version with the K8S extra/Kubernetes Python client and the + Python executable named by PARENT_PYTHON_PATH. + + JOB_IMAGE is required when IMAGE is parent-only, such as an image built from + docker/Dockerfile.parent. ADMIN_IMAGE defaults to IMAGE and can use the + parent image. JOB_IMAGE must contain NVFlare, Python, numpy, and the + runtime tools needed by the job. 
+ +Common optional environment: + KUBE_CMD=oc + NAMESPACE=nvflare-e2e + PROJECT_NAME=openshift_nvflare_e2e + SERVER_NAME=nvflare-server + SERVER_HOST=nvflare-server + CLIENTS="site-1 site-2" + ADMIN_USER=admin@nvidia.com + WORK_DIR=/tmp/nvflare/openshift-e2e + PARENT_PYTHON_PATH=python + ADMIN_PYTHON_PATH=python + STORAGE_CLASS= + WORKSPACE_STORAGE=2Gi + COPY_IMAGE=busybox:1.36 # must contain sh, sleep, and tar for oc cp + PARENT_CPU= + PARENT_MEMORY= + ADMIN_IMAGE=$IMAGE + JOB_IMAGE=$IMAGE + JOB_WAIT_TIMEOUT=900 + CLEAN_WORK_DIR=true + +Examples: + IMAGE=registry.example.com/nvflare-parent:dev \ + JOB_IMAGE=registry.example.com/nvflare-job:dev \ + bash examples/devops/openshift/scripts/k8s_e2e.sh + + IMAGE=registry.example.com/nvflare-parent:dev \ + JOB_IMAGE=registry.example.com/nvflare-job:dev \ + PARENT_CPU=500m \ + PARENT_MEMORY=1Gi \ + bash examples/devops/openshift/scripts/k8s_e2e.sh +EOF +} + +if [[ "${1:-}" == "-h" || "${1:-}" == "--help" ]]; then + usage + exit 0 +fi + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +CLEAN_WORK_DIR="${CLEAN_WORK_DIR:-true}" bash "${SCRIPT_DIR}/k8s_provision.sh" +CLEAN_WORK_DIR=false bash "${SCRIPT_DIR}/k8s_deploy.sh" +CLEAN_WORK_DIR=false bash "${SCRIPT_DIR}/k8s_submit_job.sh" diff --git a/examples/devops/openshift/scripts/k8s_provision.sh b/examples/devops/openshift/scripts/k8s_provision.sh new file mode 100755 index 0000000000..742b288900 --- /dev/null +++ b/examples/devops/openshift/scripts/k8s_provision.sh @@ -0,0 +1,44 @@ +#!/usr/bin/env bash +set -euo pipefail + +usage() { + cat <<'EOF' +Generate NVFlare OpenShift e2e startup kits for: + - server: nvflare-server + - clients: site-1 site-2 + - admin: admin@nvidia.com + +This script writes project.yml and runs: + nvflare provision -p project.yml -w --force + +Output defaults to: + /tmp/nvflare/openshift-e2e/workspace//prod_00 + +Common optional environment: + PROJECT_NAME=openshift_nvflare_e2e + SERVER_NAME=nvflare-server + SERVER_HOST=nvflare-server + 
CLIENTS="site-1 site-2" + ADMIN_USER=admin@nvidia.com + ADMIN_ROLE=lead + ORG=nvidia + WORK_DIR=/tmp/nvflare/openshift-e2e + CLEAN_WORK_DIR=true + +Example: + bash examples/devops/openshift/scripts/k8s_provision.sh +EOF +} + +if [[ "${1:-}" == "-h" || "${1:-}" == "--help" ]]; then + usage + exit 0 +fi + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=k8s_common.sh +source "${SCRIPT_DIR}/k8s_common.sh" + +CLEAN_WORK_DIR="${CLEAN_WORK_DIR:-true}" +init_k8s_env false +run_provision_phase diff --git a/examples/devops/openshift/scripts/k8s_submit_job.sh b/examples/devops/openshift/scripts/k8s_submit_job.sh new file mode 100755 index 0000000000..7662b21f34 --- /dev/null +++ b/examples/devops/openshift/scripts/k8s_submit_job.sh @@ -0,0 +1,53 @@ +#!/usr/bin/env bash +set -euo pipefail + +usage() { + cat <<'EOF' +Export hello-numpy, submit it from an in-cluster admin pod, verify K8s launcher +job pods are created, and wait for the job to finish successfully. + +Run these first: + 1. k8s_provision.sh + 2. k8s_deploy.sh + +Required environment: + ADMIN_IMAGE or IMAGE Image for the temporary admin pod. It must contain the + NVFlare package and the Python executable named by ADMIN_PYTHON_PATH. + The parent IMAGE can be used as ADMIN_IMAGE. + JOB_IMAGE or IMAGE Image for dynamically created job pods. It must contain + NVFlare, Python, numpy, and the runtime tools needed by the job. 
+ +Common optional environment: + KUBE_CMD=oc + NAMESPACE=nvflare-e2e + PROJECT_NAME=openshift_nvflare_e2e + CLIENTS="site-1 site-2" + ADMIN_USER=admin@nvidia.com + WORK_DIR=/tmp/nvflare/openshift-e2e + ADMIN_IMAGE=$IMAGE + ADMIN_PYTHON_PATH=python + JOB_IMAGE=$IMAGE + JOB_WAIT_TIMEOUT=900 + JOB_POD_APPEAR_TIMEOUT=180 + DELETE_ADMIN_POD_ON_EXIT=false + DELETE_NAMESPACE_ON_EXIT=false + +Example: + IMAGE=registry.example.com/nvflare-parent:dev \ + JOB_IMAGE=registry.example.com/nvflare-job:dev \ + bash examples/devops/openshift/scripts/k8s_submit_job.sh +EOF +} + +if [[ "${1:-}" == "-h" || "${1:-}" == "--help" ]]; then + usage + exit 0 +fi + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=k8s_common.sh +source "${SCRIPT_DIR}/k8s_common.sh" + +init_k8s_env false +trap cleanup_on_exit EXIT +run_submit_job_phase diff --git a/examples/devops/openshift/scripts/k8s_watch.py b/examples/devops/openshift/scripts/k8s_watch.py new file mode 100755 index 0000000000..0a768c26de --- /dev/null +++ b/examples/devops/openshift/scripts/k8s_watch.py @@ -0,0 +1,260 @@ +#!/usr/bin/env python3 +# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Rich live pod table for the NVFlare OpenShift deployment scripts.""" + +from __future__ import annotations + +import argparse +import json +import os +import subprocess +import sys +import time +from dataclasses import dataclass +from datetime import datetime, timezone +from pathlib import Path +from typing import Any + +try: + from rich.console import Console, Group + from rich.live import Live + from rich.table import Table + from rich.text import Text +except ImportError: + sys.exit("k8s_watch requires the Python package 'rich'. Install it with: python3 -m pip install rich") + + +STATUS_STYLES = { + "Running": "green", + "Succeeded": "dim", + "Completed": "dim", + "Pending": "yellow", + "ContainerCreating": "yellow", + "PodInitializing": "yellow", + "NotReady": "yellow", + "Terminating": "dim yellow", + "Failed": "red", + "Error": "red", + "CrashLoopBackOff": "red", + "ImagePullBackOff": "red", + "ErrImagePull": "red", + "CreateContainerConfigError": "red", + "InvalidImageName": "red", + "Unknown": "red", +} + + +@dataclass +class WatchConfig: + kube_cmd: str + namespace: str + work_dir: Path + last_job_id_file: Path + interval: float + once: bool + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Show a Rich live pod table for NVFlare OpenShift pods.") + parser.add_argument("--once", action="store_true", help="Render one snapshot and exit.") + parser.add_argument("--interval", type=float, default=3.0, help="Refresh interval in seconds. 
Default: 3.") + return parser.parse_args() + + +def load_config(args: argparse.Namespace) -> WatchConfig: + work_dir = Path(os.environ.get("WORK_DIR", "/tmp/nvflare/openshift-e2e")) + return WatchConfig( + kube_cmd=os.environ.get("KUBE_CMD", "oc"), + namespace=os.environ.get("NAMESPACE", "nvflare-e2e"), + work_dir=work_dir, + last_job_id_file=Path(os.environ.get("LAST_JOB_ID_FILE", str(work_dir / "last_job_id"))), + interval=args.interval, + once=args.once, + ) + + +def run_cmd(cmd: list[str]) -> subprocess.CompletedProcess[str]: + return subprocess.run(cmd, check=False, capture_output=True, text=True) + + +def last_job_id(path: Path) -> str: + if not path.is_file(): + return "" + return path.read_text(encoding="utf-8").strip() + + +def parse_timestamp(value: str | None) -> datetime | None: + if not value: + return None + try: + return datetime.fromisoformat(value.replace("Z", "+00:00")).astimezone(timezone.utc) + except ValueError: + return None + + +def age_str(created_at: str | None) -> str: + ts = parse_timestamp(created_at) + if ts is None: + return "-" + seconds = max(0, int((datetime.now(timezone.utc) - ts).total_seconds())) + if seconds < 60: + return f"{seconds}s" + if seconds < 3600: + return f"{seconds // 60}m" + if seconds < 86400: + hours, rem = divmod(seconds, 3600) + minutes = rem // 60 + return f"{hours}h{minutes}m" if minutes else f"{hours}h" + days, rem = divmod(seconds, 86400) + hours = rem // 3600 + return f"{days}d{hours}h" if hours else f"{days}d" + + +def pod_status(pod: dict[str, Any]) -> str: + metadata = pod.get("metadata") or {} + status = pod.get("status") or {} + if metadata.get("deletionTimestamp"): + return "Terminating" + + statuses = (status.get("initContainerStatuses") or []) + (status.get("containerStatuses") or []) + for container_status in statuses: + waiting = (container_status.get("state") or {}).get("waiting") or {} + if waiting.get("reason"): + return waiting["reason"] + + for container_status in 
status.get("containerStatuses") or []: + terminated = (container_status.get("state") or {}).get("terminated") or {} + reason = terminated.get("reason") + if reason and reason != "Completed": + return reason + + phase = status.get("phase") or "Unknown" + container_statuses = status.get("containerStatuses") or [] + if phase == "Running" and container_statuses and not all(c.get("ready") for c in container_statuses): + return "NotReady" + if phase == "Succeeded": + return "Completed" + return phase + + +def ready_fraction(pod: dict[str, Any]) -> str: + statuses = (pod.get("status") or {}).get("containerStatuses") or [] + if not statuses: + return "0/0" + ready = sum(1 for container_status in statuses if container_status.get("ready")) + return f"{ready}/{len(statuses)}" + + +def restart_count(pod: dict[str, Any]) -> str: + statuses = (pod.get("status") or {}).get("containerStatuses") or [] + return str(sum(int(container_status.get("restartCount") or 0) for container_status in statuses)) + + +def status_text(status: str) -> Text: + return Text(status, style=STATUS_STYLES.get(status, "")) + + +def get_pods(config: WatchConfig) -> tuple[list[dict[str, Any]], str | None]: + ns_result = run_cmd([config.kube_cmd, "get", "namespace", config.namespace, "-o", "json"]) + if ns_result.returncode != 0: + msg = (ns_result.stderr or ns_result.stdout).strip() or f"namespace {config.namespace} not found" + return [], msg + + pod_result = run_cmd( + [ + config.kube_cmd, + "-n", + config.namespace, + "get", + "pods", + "--sort-by=.metadata.creationTimestamp", + "-o", + "json", + ] + ) + if pod_result.returncode != 0: + msg = (pod_result.stderr or pod_result.stdout).strip() or "failed to list pods" + return [], msg + + try: + payload = json.loads(pod_result.stdout) + except json.JSONDecodeError as e: + return [], f"failed to parse pod JSON: {e}" + return payload.get("items") or [], None + + +def build_table(pods: list[dict[str, Any]], error: str | None) -> Table: + table = 
Table(title="Pods", caption=f"{len(pods)} pods", caption_style="dim", expand=True) + table.add_column("NAME", no_wrap=True) + table.add_column("READY", justify="right", no_wrap=True) + table.add_column("STATUS", no_wrap=True) + table.add_column("RESTARTS", justify="right", no_wrap=True) + table.add_column("AGE", justify="right", no_wrap=True) + + if error: + table.add_row("-", "-", Text(error, style="red"), "-", "-") + return table + + if not pods: + table.add_row(Text("(no pods)", style="dim"), "", "", "", "") + return table + + for pod in pods: + metadata = pod.get("metadata") or {} + status = pod_status(pod) + table.add_row( + metadata.get("name") or "-", + ready_fraction(pod), + status_text(status), + restart_count(pod), + age_str(metadata.get("creationTimestamp")), + ) + return table + + +def build_view(config: WatchConfig) -> Group: + pods, error = get_pods(config) + job_id = last_job_id(config.last_job_id_file) + header = Text("NVFlare OpenShift pods", style="bold") + details = [ + f"Namespace: {config.namespace}", + f"Work dir: {config.work_dir}", + ] + if job_id: + details.append(f"Last job: {job_id}") + details.append(datetime.now(timezone.utc).strftime("UTC: %Y-%m-%dT%H:%M:%SZ")) + return Group(header, Text("\n".join(details), style="dim"), build_table(pods, error)) + + +def main() -> int: + args = parse_args() + if args.interval <= 0: + sys.exit("--interval must be greater than 0") + config = load_config(args) + console = Console() + + if config.once: + console.print(build_view(config)) + return 0 + + with Live(build_view(config), console=console, refresh_per_second=4, transient=False) as live: + while True: + time.sleep(config.interval) + live.update(build_view(config)) + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/examples/devops/openshift/scripts/k8s_watch.sh b/examples/devops/openshift/scripts/k8s_watch.sh new file mode 100755 index 0000000000..c0b603e48d --- /dev/null +++ b/examples/devops/openshift/scripts/k8s_watch.sh @@ 
-0,0 +1,44 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Print the command-line help text to stdout.
+usage() {
+  cat <<'EOF'
+Show an in-place live Rich pod table for OpenShift/Kubernetes pods created by
+the NVFlare OpenShift scripts.
+
+Usage:
+  bash k8s_watch.sh [--once] [--interval SECONDS]
+
+Required local Python package:
+  rich
+
+Common optional environment:
+  KUBE_CMD=oc
+  NAMESPACE=nvflare-e2e
+  WORK_DIR=/tmp/nvflare/openshift-e2e
+
+Examples:
+  bash examples/devops/openshift/scripts/k8s_watch.sh
+  bash examples/devops/openshift/scripts/k8s_watch.sh --once
+EOF
+}
+
+# Only the first argument is inspected for a help flag, before any helper is sourced.
+if [[ "${1:-}" == "-h" || "${1:-}" == "--help" ]]; then
+  usage
+  exit 0
+fi
+
+# Resolve this script's directory so sibling files load regardless of the caller's cwd.
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck source=k8s_common.sh
+source "${SCRIPT_DIR}/k8s_common.sh"
+
+# init_k8s_env comes from k8s_common.sh; presumably it sets the variables exported below (confirm).
+init_k8s_env false
+require_cmd python3 "${KUBE_CMD}"
+
+# Hand the settings to the Python watcher and replace this shell with it, forwarding all args.
+export KUBE_CMD NAMESPACE WORK_DIR LAST_JOB_ID_FILE
+exec python3 "${SCRIPT_DIR}/k8s_watch.py" "$@"
diff --git a/examples/devops/openshift/scripts/start_openshift_cluster.sh b/examples/devops/openshift/scripts/start_openshift_cluster.sh
new file mode 100755
index 0000000000..62245243dc
--- /dev/null
+++ b/examples/devops/openshift/scripts/start_openshift_cluster.sh
@@ -0,0 +1,294 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Print the command-line help text, including the environment variables the script reads.
+usage() {
+  cat <<'EOF'
+Start a Red Hat OpenShift Local (CRC) cluster and prepare a namespace for the
+NVFlare OpenShift deployment examples.
+
+Usage:
+  bash start_openshift_cluster.sh [--no-login] [--no-project]
+
+Common environment:
+  CRC_BIN=crc
+  OC_BIN=oc
+  PULL_SECRET_FILE=
+  CRC_DISABLE_UPDATE_CHECK=true
+  CRC_BUNDLE=
+  CRC_NAMESERVER=
+  CRC_ENABLE_SHARED_DIRS=false
+  CRC_OPENSHIFT_READY_TIMEOUT=900
+  CRC_OPENSHIFT_READY_INTERVAL=10
+
+Optional crc start sizing overrides:
+  CRC_CPUS=
+  CRC_MEMORY=
+  CRC_DISK_SIZE=
+
+OpenShift login and project environment:
+  LOGIN_OPENSHIFT=true
+  OPENSHIFT_API_URL=https://api.crc.testing:6443
+  OPENSHIFT_USER=developer
+  OPENSHIFT_PASSWORD=developer
+  OPENSHIFT_INSECURE_TLS=true
+  OPENSHIFT_LOGIN_RETRIES=24
+  OPENSHIFT_LOGIN_INTERVAL=5
+  CREATE_PROJECT=true
+  NAMESPACE=nvflare-e2e
+
+Examples:
+  PULL_SECRET_FILE=$HOME/Downloads/pull-secret.txt \
+    bash examples/devops/openshift/scripts/start_openshift_cluster.sh
+
+  OPENSHIFT_USER=kubeadmin OPENSHIFT_PASSWORD= \
+    bash examples/devops/openshift/scripts/start_openshift_cluster.sh
+
+  LOGIN_OPENSHIFT=false \
+    bash examples/devops/openshift/scripts/start_openshift_cluster.sh
+EOF
+}
+
+# Print an error message to stderr and abort the script with exit status 1.
+fail() {
+  echo "ERROR: $*" >&2
+  exit 1
+}
+
+# Print a section heading preceded by a blank line.
+info() {
+  echo
+  echo "==> $*"
+}
+
+# Abort via fail unless every named command can be resolved by `command -v`.
+require_cmd() {
+  local cmd
+  for cmd in "$@"; do
+    command -v "$cmd" >/dev/null 2>&1 || fail "Required command not found: $cmd"
+  done
+}
+
+# Print the CRC log file path under the current user's HOME (no trailing newline).
+crc_log_path() {
+  printf "%s/.crc/crc.log" "${HOME}"
+}
+
+# Succeed when $1 is 1, true, yes or y (letters in any case); fail otherwise, including empty.
+is_truthy() {
+  case "${1:-}" in
+    1|[Tt][Rr][Uu][Ee]|[Yy][Ee][Ss]|[Yy])
+      return 0
+      ;;
+    *)
+      return 1
+      ;;
+  esac
+}
+
+# Parse flags: --no-login and --no-project clear the matching setting; unknown flags abort.
+parse_args() {
+  while [[ $# -gt 0 ]]; do
+    case "$1" in
+      -h|--help)
+        usage
+        exit 0
+        ;;
+      --no-login)
+        LOGIN_OPENSHIFT=false
+        ;;
+      --no-project)
+        CREATE_PROJECT=false
+        ;;
+      *)
+        fail "Unknown argument: $1"
+        ;;
+    esac
+    shift
+  done
+}
+
+# Succeed when `crc status` output matches "OpenShift:.*Running"; a failing status call is ignored.
+crc_status_is_running() {
+  local status
+
+  status="$("${CRC_BIN}" status 2>/dev/null || true)"
+  grep -q "OpenShift:.*Running" <<<"${status}"
+}
+
+# Poll `crc status` every CRC_OPENSHIFT_READY_INTERVAL seconds until OpenShift reports Running;
+# abort once the accumulated sleep time exceeds CRC_OPENSHIFT_READY_TIMEOUT seconds.
+wait_for_openshift_running() {
+  local elapsed=0
+  local status=""
+
+  info "Waiting for OpenShift to report Running"
+  while ((elapsed <= CRC_OPENSHIFT_READY_TIMEOUT)); do
+    status="$("${CRC_BIN}" status 2>/dev/null || true)"
+    if grep -q "OpenShift:.*Running" <<<"${status}"; then
+      echo "${status}"
+      return
+    fi
+
+    echo "${status}"
+    echo "OpenShift is not ready; checking again in ${CRC_OPENSHIFT_READY_INTERVAL}s."
+    sleep "${CRC_OPENSHIFT_READY_INTERVAL}"
+    elapsed=$((elapsed + CRC_OPENSHIFT_READY_INTERVAL))
+  done
+
+  fail "OpenShift did not report Running within ${CRC_OPENSHIFT_READY_TIMEOUT}s. Check $(crc_log_path) or rerun '${CRC_BIN} start --log-level debug' for CRC diagnostics."
+}
+
+# Build the `crc start` command line from the optional environment settings and run it.
+start_crc() {
+  local start_cmd=("${CRC_BIN}" start)
+
+  if [[ -n "${PULL_SECRET_FILE}" ]]; then
+    [[ -r "${PULL_SECRET_FILE}" ]] || fail "PULL_SECRET_FILE is not readable: ${PULL_SECRET_FILE}"
+    start_cmd+=("--pull-secret-file" "${PULL_SECRET_FILE}")
+  fi
+  if is_truthy "${CRC_DISABLE_UPDATE_CHECK}"; then
+    start_cmd+=("--disable-update-check")
+  fi
+  [[ -n "${CRC_BUNDLE}" ]] && start_cmd+=("--bundle" "${CRC_BUNDLE}")
+  [[ -n "${CRC_NAMESERVER}" ]] && start_cmd+=("--nameserver" "${CRC_NAMESERVER}")
+  [[ -n "${CRC_CPUS}" ]] && start_cmd+=("--cpus" "${CRC_CPUS}")
+  [[ -n "${CRC_MEMORY}" ]] && start_cmd+=("--memory" "${CRC_MEMORY}")
+  [[ -n "${CRC_DISK_SIZE}" ]] && start_cmd+=("--disk-size" "${CRC_DISK_SIZE}")
+
+  info "Running ${start_cmd[*]}"
+  "${start_cmd[@]}"
+}
+
+# Ensure OC_BIN resolves via `command -v`; otherwise eval the `crc oc-env` output and check again.
+configure_oc() {
+  if command -v "${OC_BIN}" >/dev/null 2>&1; then
+    return
+  fi
+
+  info "Adding CRC oc binary to PATH"
+  eval "$("${CRC_BIN}" oc-env)"
+  command -v "${OC_BIN}" >/dev/null 2>&1 || fail "Required command not found after crc oc-env: ${OC_BIN}"
+}
+
+# Log in with `oc login`, making up to OPENSHIFT_LOGIN_RETRIES attempts; abort if all of them fail.
+# NOTE: the password is passed as a command-line argument, so it is visible in the process list.
+login_openshift() {
+  local login_cmd=("${OC_BIN}" login "${OPENSHIFT_API_URL}" "-u" "${OPENSHIFT_USER}" "-p" "${OPENSHIFT_PASSWORD}")
+  local attempt
+  local output
+
+  if is_truthy "${OPENSHIFT_INSECURE_TLS}"; then
+    login_cmd+=("--insecure-skip-tls-verify=true")
+  fi
+
+  info "Logging in to ${OPENSHIFT_API_URL} as ${OPENSHIFT_USER}"
+  for ((attempt = 1; attempt <= OPENSHIFT_LOGIN_RETRIES; attempt++)); do
+    if output="$("${login_cmd[@]}" 2>&1)"; then
+      echo "${output}"
+      return
+    fi
+    echo "Login attempt ${attempt}/${OPENSHIFT_LOGIN_RETRIES} failed:"
+    echo "${output}"
+    # Announce a retry and wait only when another attempt will actually follow.
+    if ((attempt < OPENSHIFT_LOGIN_RETRIES)); then
+      echo "Retrying in ${OPENSHIFT_LOGIN_INTERVAL}s."
+      sleep "${OPENSHIFT_LOGIN_INTERVAL}"
+    fi
+  done
+
+  fail "Unable to log in to OpenShift. Run '${CRC_BIN} console --credentials' if you need the kubeadmin password."
+}
+
+# When CREATE_PROJECT is truthy, switch to the NAMESPACE project, creating it if it does not exist.
+ensure_project() {
+  if ! is_truthy "${CREATE_PROJECT}"; then
+    return
+  fi
+
+  info "Ensuring project ${NAMESPACE}"
+  if "${OC_BIN}" get project "${NAMESPACE}" >/dev/null 2>&1; then
+    "${OC_BIN}" project "${NAMESPACE}" >/dev/null
+  else
+    "${OC_BIN}" new-project "${NAMESPACE}" >/dev/null
+  fi
+}
+
+print_summary() {
+  info "Cluster status"
+  "${CRC_BIN}" status || true
+
+  if is_truthy "${LOGIN_OPENSHIFT}"; then
+    info "OpenShift context"
+    "${OC_BIN}" whoami || true
+    "${OC_BIN}" project -q || true
+
+    info "Storage classes"
+    "${OC_BIN}" get storageclass || true
+  fi
+
+  info "Console"
+  "${CRC_BIN}" console --url || true
+
+  cat <