Skip to content

Commit 8b9d609

Browse files
committed
Add k8s-admin user
Create a non-root k8s-admin account on instances and update Terraform/Ansible to use it. Signed-off-by: Sudharshan Muralidharan <sudharshan.muralidharan1@ibm.com>
1 parent 168fd50 commit 8b9d609

3 files changed

Lines changed: 119 additions & 9 deletions

File tree

kubetest2-tf/data/k8s-ansible/roles/install-k8s/tasks/main.yml

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -134,8 +134,21 @@
134134
delegate_facts: true
135135
with_items: "{{ groups['all'] }}"
136136

137+
- name: Fail clearly when worker join command is unavailable
138+
fail:
139+
msg: >-
140+
kubernetes_join_command is unavailable for worker join. Inspect earlier
141+
master-side failures, especially kubeadm init or kubeadm token creation
142+
on {{ groups['masters'][0] }}.
143+
when:
144+
- node_type == "worker"
145+
- kubernetes_join_command is not defined or (kubernetes_join_command | trim) == ""
146+
137147
- name: kubeadm join worker nodes
138148
command: >
139149
{{ kubernetes_join_command }}
140150
{% if ignore_preflight_errors != '' %} --ignore-preflight-errors={{ ignore_preflight_errors }}{% endif %}
141-
when: node_type == "worker"
151+
when:
152+
- node_type == "worker"
153+
- kubernetes_join_command is defined
154+
- (kubernetes_join_command | trim) != ""

kubetest2-tf/data/vpc/main.tf

Lines changed: 98 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,25 @@ resource "ibm_is_instance_template" "node_template" {
3737
subnet = local.subnet_id
3838
security_groups = [local.security_group_id]
3939
}
40+
41+
user_data = <<-EOT
42+
#cloud-config
43+
users:
44+
- default
45+
- name: k8s-admin
46+
shell: /bin/bash
47+
sudo: ALL=(ALL) NOPASSWD:ALL
48+
groups: [sudo]
49+
ssh_authorized_keys:
50+
- ${data.ibm_is_ssh_key.ssh_key.public_key}
51+
runcmd:
52+
- |
53+
# Ensure k8s-admin SSH dir has correct permissions
54+
mkdir -p /home/k8s-admin/.ssh
55+
chown -R k8s-admin:k8s-admin /home/k8s-admin/.ssh
56+
chmod 700 /home/k8s-admin/.ssh
57+
chmod 600 /home/k8s-admin/.ssh/authorized_keys
58+
EOT
4059
}
4160

4261
module "master" {
@@ -59,32 +78,104 @@ module "workers" {
5978
}
6079

6180
resource "null_resource" "wait-for-master-completes" {
81+
depends_on = [module.master]
82+
83+
# First wait for cloud-init to complete, trying k8s-admin first and falling back to root
84+
provisioner "local-exec" {
85+
command = <<-EOT
86+
max_attempts=60
87+
attempt=0
88+
while [ $attempt -lt $max_attempts ]; do
89+
# Try k8s-admin first (root SSH is disabled on new IBM Cloud VPC VSIs)
90+
if ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ConnectTimeout=5 \
91+
-i ${var.ssh_private_key} k8s-admin@${module.master.public_ip} \
92+
"sudo cloud-init status --wait" 2>/dev/null; then
93+
echo "Cloud-init completed on master (via k8s-admin)"
94+
break
95+
fi
96+
# Fallback to root for older images that still have root SSH enabled
97+
if ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ConnectTimeout=5 \
98+
-i ${var.ssh_private_key} root@${module.master.public_ip} \
99+
"cloud-init status --wait" 2>/dev/null; then
100+
echo "Cloud-init completed on master (via root)"
101+
break
102+
fi
103+
attempt=$((attempt + 1))
104+
echo "Waiting for cloud-init on master (attempt $attempt/$max_attempts)..."
105+
sleep 10
106+
done
107+
if [ $attempt -eq $max_attempts ]; then
108+
echo "ERROR: Timed out waiting for cloud-init on master"
109+
exit 1
110+
fi
111+
EOT
112+
}
113+
114+
# Then verify k8s-admin user is accessible
62115
connection {
63116
type = "ssh"
64-
user = "root"
117+
user = "k8s-admin"
65118
host = module.master.public_ip
66119
private_key = file(var.ssh_private_key)
67-
timeout = "20m"
120+
timeout = "5m"
68121
}
69122
provisioner "remote-exec" {
70123
inline = [
71-
"cloud-init status -w"
124+
"echo 'k8s-admin user is ready on master'"
72125
]
73126
}
74127
}
75128

76129
resource "null_resource" "wait-for-workers-completes" {
77-
count = var.workers_count
130+
count = var.workers_count
131+
depends_on = [module.workers]
132+
133+
# First wait for cloud-init to complete, trying k8s-admin first and falling back to root
134+
provisioner "local-exec" {
135+
command = <<-EOT
136+
max_attempts=60
137+
attempt=0
138+
worker_ip="${module.workers[count.index].public_ip}"
139+
worker_index="${count.index}"
140+
ssh_key="${var.ssh_private_key}"
141+
142+
while [ $attempt -lt $max_attempts ]; do
143+
# Try k8s-admin first (root SSH is disabled on new IBM Cloud VPC VSIs)
144+
if ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ConnectTimeout=5 \
145+
-i "$ssh_key" k8s-admin@"$worker_ip" \
146+
"sudo cloud-init status --wait" 2>/dev/null; then
147+
echo "Cloud-init completed on worker $worker_index (via k8s-admin)"
148+
break
149+
fi
150+
# Fallback to root for older images that still have root SSH enabled
151+
if ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ConnectTimeout=5 \
152+
-i "$ssh_key" root@"$worker_ip" \
153+
"cloud-init status --wait" 2>/dev/null; then
154+
echo "Cloud-init completed on worker $worker_index (via root)"
155+
break
156+
fi
157+
attempt=$((attempt + 1))
158+
echo "Waiting for cloud-init on worker $worker_index (attempt $attempt/$max_attempts)..."
159+
sleep 10
160+
done
161+
if [ $attempt -eq $max_attempts ]; then
162+
echo "ERROR: Timed out waiting for cloud-init on worker $worker_index"
163+
exit 1
164+
fi
165+
EOT
166+
}
167+
168+
# Then verify k8s-admin user is accessible
78169
connection {
79170
type = "ssh"
80-
user = "root"
171+
user = "k8s-admin"
81172
host = module.workers[count.index].public_ip
82173
private_key = file(var.ssh_private_key)
83-
timeout = "15m"
174+
timeout = "5m"
84175
}
85176
provisioner "remote-exec" {
86177
inline = [
87-
"cloud-init status -w"
178+
"echo 'k8s-admin user is ready on worker ${count.index}'"
88179
]
89180
}
90181
}

kubetest2-tf/deployer/dumplogs.go

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,15 @@ var commandFilename = map[string]string{
2020
func (d *deployer) DumpClusterLogs() error {
2121
var errors []error
2222
var stdErr, stdOut bytes.Buffer
23+
sshUser := "root"
2324

2425
// Set exclusively as maps are declared during compile-time and may be set with defaults.
2526
commandFilename[common.CommonProvider.Runtime] = fmt.Sprintf("journalctl -xeu %s --no-pager", common.CommonProvider.Runtime)
27+
if d.TargetProvider == "vpc" {
28+
sshUser = "k8s-admin"
29+
commandFilename["dmesg"] = "sudo dmesg"
30+
commandFilename[common.CommonProvider.Runtime] = fmt.Sprintf("sudo journalctl -xeu %s --no-pager", common.CommonProvider.Runtime)
31+
}
2632

2733
klog.Infof("Collecting cluster logs under %s", d.logsDir)
2834
// create a directory based on the generated path: _rundir/dump-cluster-logs
@@ -69,7 +75,7 @@ func (d *deployer) DumpClusterLogs() error {
6975
"ssh",
7076
"-i",
7177
common.CommonProvider.SSHPrivateKey,
72-
fmt.Sprintf("root@%s", machineIP),
78+
fmt.Sprintf("%s@%s", sshUser, machineIP),
7379
command,
7480
}
7581
klog.V(1).Infof("Remotely executing command: %s", commandArgs)

0 commit comments

Comments
 (0)