From 3dbcee94c0febeece078e18b50206abe321436eb Mon Sep 17 00:00:00 2001
From: Sudharshan Muralidharan <157102595+sudharshanibm@users.noreply.github.com>
Date: Tue, 3 Feb 2026 15:55:49 +0530
Subject: [PATCH] Add k8s-admin user

Create a non-root k8s-admin account on instances and update
Terraform/Ansible to use it.

Signed-off-by: Sudharshan Muralidharan
---
 .../roles/install-k8s/tasks/main.yml |  15 ++-
 kubetest2-tf/data/vpc/main.tf        | 105 ++++++++++++++++--
 kubetest2-tf/deployer/dumplogs.go    |   8 +-
 3 files changed, 119 insertions(+), 9 deletions(-)

diff --git a/kubetest2-tf/data/k8s-ansible/roles/install-k8s/tasks/main.yml b/kubetest2-tf/data/k8s-ansible/roles/install-k8s/tasks/main.yml
index 92f9e671..c2d4eae0 100644
--- a/kubetest2-tf/data/k8s-ansible/roles/install-k8s/tasks/main.yml
+++ b/kubetest2-tf/data/k8s-ansible/roles/install-k8s/tasks/main.yml
@@ -134,8 +134,21 @@
   delegate_facts: true
   with_items: "{{ groups['all'] }}"
 
+- name: Fail clearly when worker join command is unavailable
+  fail:
+    msg: >-
+      kubernetes_join_command is unavailable for worker join. Inspect earlier
+      master-side failures, especially kubeadm init or kubeadm token creation
+      on {{ groups['masters'][0] }}.
+  when:
+    - node_type == "worker"
+    - kubernetes_join_command is not defined or (kubernetes_join_command | trim) == ""
+
 - name: kubeadm join worker nodes
   command: >
     {{ kubernetes_join_command }}
     {% if ignore_preflight_errors != '' %} --ignore-preflight-errors={{ ignore_preflight_errors }}{% endif %}
-  when: node_type == "worker"
+  when:
+    - node_type == "worker"
+    - kubernetes_join_command is defined
+    - (kubernetes_join_command | trim) != ""
diff --git a/kubetest2-tf/data/vpc/main.tf b/kubetest2-tf/data/vpc/main.tf
index b957430e..da563577 100644
--- a/kubetest2-tf/data/vpc/main.tf
+++ b/kubetest2-tf/data/vpc/main.tf
@@ -37,6 +37,25 @@ resource "ibm_is_instance_template" "node_template" {
     subnet          = local.subnet_id
     security_groups = [local.security_group_id]
   }
+
+  user_data = <<-EOT
+#cloud-config
+users:
+  - default
+  - name: k8s-admin
+    shell: /bin/bash
+    sudo: ALL=(ALL) NOPASSWD:ALL
+    groups: [sudo]
+    ssh_authorized_keys:
+      - ${data.ibm_is_ssh_key.ssh_key.public_key}
+runcmd:
+  - |
+    # Ensure k8s-admin SSH dir has correct permissions
+    mkdir -p /home/k8s-admin/.ssh
+    chown -R k8s-admin:k8s-admin /home/k8s-admin/.ssh
+    chmod 700 /home/k8s-admin/.ssh
+    chmod 600 /home/k8s-admin/.ssh/authorized_keys
+EOT
 }
 
 module "master" {
@@ -59,32 +78,104 @@ module "workers" {
 }
 resource "null_resource" "wait-for-master-completes" {
+  depends_on = [module.master]
+
+  # First wait for cloud-init to complete using root user (still available during boot)
+  provisioner "local-exec" {
+    command = <<-EOT
+      max_attempts=60
+      attempt=0
+      while [ $attempt -lt $max_attempts ]; do
+        # Try k8s-admin first (root SSH is disabled on new IBM Cloud VPC-VSIs)
+        if ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ConnectTimeout=5 \
+          -i ${var.ssh_private_key} k8s-admin@${module.master.public_ip} \
+          "sudo cloud-init status --wait" 2>/dev/null; then
+          echo "Cloud-init completed on master (via k8s-admin)"
+          break
+        fi
+        # Fallback to root for older images that still have root SSH enabled
+        if ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ConnectTimeout=5 \
+          -i ${var.ssh_private_key} root@${module.master.public_ip} \
+          "cloud-init status --wait" 2>/dev/null; then
+          echo "Cloud-init completed on master (via root)"
+          break
+        fi
+        attempt=$((attempt + 1))
+        echo "Waiting for cloud-init on master (attempt $attempt/$max_attempts)..."
+        sleep 10
+      done
+      if [ $attempt -eq $max_attempts ]; then
+        echo "ERROR: Timed out waiting for cloud-init on master"
+        exit 1
+      fi
+    EOT
+  }
+
+  # Then verify k8s-admin user is accessible
   connection {
     type        = "ssh"
-    user        = "root"
+    user        = "k8s-admin"
    host        = module.master.public_ip
     private_key = file(var.ssh_private_key)
-    timeout     = "20m"
+    timeout     = "5m"
   }
 
   provisioner "remote-exec" {
     inline = [
-      "cloud-init status -w"
+      "echo 'k8s-admin user is ready on master'"
     ]
   }
 }
 resource "null_resource" "wait-for-workers-completes" {
-  count = var.workers_count
+  count      = var.workers_count
+  depends_on = [module.workers]
+
+  # First wait for cloud-init to complete using root user (still available during boot)
+  provisioner "local-exec" {
+    command = <<-EOT
+      max_attempts=60
+      attempt=0
+      worker_ip="${module.workers[count.index].public_ip}"
+      worker_index="${count.index}"
+      ssh_key="${var.ssh_private_key}"
+
+      while [ $attempt -lt $max_attempts ]; do
+        # Try k8s-admin first (root SSH is disabled on new IBM Cloud VPC-VSIs)
+        if ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ConnectTimeout=5 \
+          -i "$ssh_key" k8s-admin@"$worker_ip" \
+          "sudo cloud-init status --wait" 2>/dev/null; then
+          echo "Cloud-init completed on worker $worker_index (via k8s-admin)"
+          break
+        fi
+        # Fallback to root for older images that still have root SSH enabled
+        if ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ConnectTimeout=5 \
+          -i "$ssh_key" root@"$worker_ip" \
+          "cloud-init status --wait" 2>/dev/null; then
+          echo "Cloud-init completed on worker $worker_index (via root)"
+          break
+        fi
+        attempt=$((attempt + 1))
+        echo "Waiting for cloud-init on worker $worker_index (attempt $attempt/$max_attempts)..."
+        sleep 10
+      done
+      if [ $attempt -eq $max_attempts ]; then
+        echo "ERROR: Timed out waiting for cloud-init on worker $worker_index"
+        exit 1
+      fi
+    EOT
+  }
+
+  # Then verify k8s-admin user is accessible
   connection {
     type        = "ssh"
-    user        = "root"
+    user        = "k8s-admin"
     host        = module.workers[count.index].public_ip
     private_key = file(var.ssh_private_key)
-    timeout     = "15m"
+    timeout     = "5m"
   }
 
   provisioner "remote-exec" {
     inline = [
-      "cloud-init status -w"
+      "echo 'k8s-admin user is ready on worker ${count.index}'"
     ]
   }
 }
diff --git a/kubetest2-tf/deployer/dumplogs.go b/kubetest2-tf/deployer/dumplogs.go
index 6cf834e6..ba0b0b76 100644
--- a/kubetest2-tf/deployer/dumplogs.go
+++ b/kubetest2-tf/deployer/dumplogs.go
@@ -20,9 +20,15 @@ var commandFilename = map[string]string{
 func (d *deployer) DumpClusterLogs() error {
 	var errors []error
 	var stdErr, stdOut bytes.Buffer
+	sshUser := "root"
 	// Set exclusively as maps are declared during compile-time and may be set with defaults.
 	commandFilename[common.CommonProvider.Runtime] = fmt.Sprintf("journalctl -xeu %s --no-pager", common.CommonProvider.Runtime)
+	if d.TargetProvider == "vpc" {
+		sshUser = "k8s-admin"
+		commandFilename["dmesg"] = "sudo dmesg"
+		commandFilename[common.CommonProvider.Runtime] = fmt.Sprintf("sudo journalctl -xeu %s --no-pager", common.CommonProvider.Runtime)
+	}
 
 	klog.Infof("Collecting cluster logs under %s", d.logsDir)
 
 	// create a directory based on the generated path: _rundir/dump-cluster-logs
@@ -69,7 +75,7 @@ func (d *deployer) DumpClusterLogs() error {
 		"ssh",
 		"-i",
 		common.CommonProvider.SSHPrivateKey,
-		fmt.Sprintf("root@%s", machineIP),
+		fmt.Sprintf("%s@%s", sshUser, machineIP),
 		command,
 	}
 	klog.V(1).Infof("Remotely executing command: %s", commandArgs)