Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 14 additions & 1 deletion kubetest2-tf/data/k8s-ansible/roles/install-k8s/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -134,8 +134,21 @@
delegate_facts: true
with_items: "{{ groups['all'] }}"

# Guard task: turn a missing/empty join command into an explicit, actionable
# failure instead of letting the later "kubeadm join" task error on an
# undefined template variable. The `or` short-circuits, so the trim filter
# is only evaluated when the variable is defined.
- name: Fail clearly when worker join command is unavailable
  fail:
    msg: >-
      kubernetes_join_command is unavailable for worker join. Inspect earlier
      master-side failures, especially kubeadm init or kubeadm token creation
      on {{ groups['masters'][0] }}.
  when:
    - node_type == "worker"
    - kubernetes_join_command is not defined or (kubernetes_join_command | trim) == ""

# Join worker nodes to the cluster. Runs only when the join command was
# actually produced by the master (the guard task above fails earlier with
# a clearer message when it was not).
#
# NOTE(review): the pasted diff retained BOTH the old single-line
# `when: node_type == "worker"` and the new `when:` list, which is a
# duplicate mapping key and invalid YAML. Only the new list is kept here.
- name: kubeadm join worker nodes
  command: >
    {{ kubernetes_join_command }}
    {% if ignore_preflight_errors != '' %} --ignore-preflight-errors={{ ignore_preflight_errors }}{% endif %}
  when:
    - node_type == "worker"
    - kubernetes_join_command is defined
    - (kubernetes_join_command | trim) != ""
105 changes: 98 additions & 7 deletions kubetest2-tf/data/vpc/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,25 @@ resource "ibm_is_instance_template" "node_template" {
subnet = local.subnet_id
security_groups = [local.security_group_id]
}

user_data = <<-EOT
#cloud-config
users:
- default
- name: k8s-admin
shell: /bin/bash
sudo: ALL=(ALL) NOPASSWD:ALL
groups: [sudo]
ssh_authorized_keys:
- ${data.ibm_is_ssh_key.ssh_key.public_key}
runcmd:
- |
# Ensure k8s-admin SSH dir has correct permissions
mkdir -p /home/k8s-admin/.ssh
chown -R k8s-admin:k8s-admin /home/k8s-admin/.ssh
chmod 700 /home/k8s-admin/.ssh
chmod 600 /home/k8s-admin/.ssh/authorized_keys
EOT
}

module "master" {
Expand All @@ -59,32 +78,104 @@ module "workers" {
}

# Wait until the master node has finished cloud-init, then confirm that the
# k8s-admin user (created via instance-template user_data) is reachable.
#
# NOTE(review): the pasted diff retained both the removed and the added
# version of the `user`, `timeout`, and `inline` lines; only the added
# (k8s-admin) versions are kept here so the resource is valid HCL.
resource "null_resource" "wait-for-master-completes" {
  depends_on = [module.master]

  # First wait for cloud-init to complete. Polls over SSH because the node
  # may not accept connections at all until late in boot.
  provisioner "local-exec" {
    command = <<-EOT
      max_attempts=60
      attempt=0
      while [ $attempt -lt $max_attempts ]; do
        # Try k8s-admin first (root SSH is disabled on new IBM Cloud VPC-VSIs)
        if ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ConnectTimeout=5 \
            -i ${var.ssh_private_key} k8s-admin@${module.master.public_ip} \
            "sudo cloud-init status --wait" 2>/dev/null; then
          echo "Cloud-init completed on master (via k8s-admin)"
          break
        fi
        # Fallback to root for older images that still have root SSH enabled
        if ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ConnectTimeout=5 \
            -i ${var.ssh_private_key} root@${module.master.public_ip} \
            "cloud-init status --wait" 2>/dev/null; then
          echo "Cloud-init completed on master (via root)"
          break
        fi
        attempt=$((attempt + 1))
        echo "Waiting for cloud-init on master (attempt $attempt/$max_attempts)..."
        sleep 10
      done
      if [ $attempt -eq $max_attempts ]; then
        echo "ERROR: Timed out waiting for cloud-init on master"
        exit 1
      fi
    EOT
  }

  # Then verify k8s-admin user is accessible
  connection {
    type        = "ssh"
    user        = "k8s-admin"
    host        = module.master.public_ip
    private_key = file(var.ssh_private_key)
    timeout     = "5m"
  }
  provisioner "remote-exec" {
    inline = [
      "echo 'k8s-admin user is ready on master'"
    ]
  }
}

# Per-worker analogue of the master wait: poll cloud-init completion over
# SSH, then confirm the k8s-admin user is reachable on each worker.
#
# NOTE(review): the pasted diff retained both the removed and the added
# version of the `count`, `user`, `timeout`, and `inline` lines; only the
# added (k8s-admin) versions are kept here so the resource is valid HCL.
resource "null_resource" "wait-for-workers-completes" {
  count      = var.workers_count
  depends_on = [module.workers]

  # First wait for cloud-init to complete on this worker.
  provisioner "local-exec" {
    command = <<-EOT
      max_attempts=60
      attempt=0
      worker_ip="${module.workers[count.index].public_ip}"
      worker_index="${count.index}"
      ssh_key="${var.ssh_private_key}"

      while [ $attempt -lt $max_attempts ]; do
        # Try k8s-admin first (root SSH is disabled on new IBM Cloud VPC-VSIs)
        if ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ConnectTimeout=5 \
            -i "$ssh_key" k8s-admin@"$worker_ip" \
            "sudo cloud-init status --wait" 2>/dev/null; then
          echo "Cloud-init completed on worker $worker_index (via k8s-admin)"
          break
        fi
        # Fallback to root for older images that still have root SSH enabled
        if ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ConnectTimeout=5 \
            -i "$ssh_key" root@"$worker_ip" \
            "cloud-init status --wait" 2>/dev/null; then
          echo "Cloud-init completed on worker $worker_index (via root)"
          break
        fi
        attempt=$((attempt + 1))
        echo "Waiting for cloud-init on worker $worker_index (attempt $attempt/$max_attempts)..."
        sleep 10
      done
      if [ $attempt -eq $max_attempts ]; then
        echo "ERROR: Timed out waiting for cloud-init on worker $worker_index"
        exit 1
      fi
    EOT
  }

  # Then verify k8s-admin user is accessible
  connection {
    type        = "ssh"
    user        = "k8s-admin"
    host        = module.workers[count.index].public_ip
    private_key = file(var.ssh_private_key)
    timeout     = "5m"
  }
  provisioner "remote-exec" {
    inline = [
      "echo 'k8s-admin user is ready on worker ${count.index}'"
    ]
  }
}
8 changes: 7 additions & 1 deletion kubetest2-tf/deployer/dumplogs.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,15 @@ var commandFilename = map[string]string{
func (d *deployer) DumpClusterLogs() error {
var errors []error
var stdErr, stdOut bytes.Buffer
sshUser := "root"

// Set exclusively as maps are declared during compile-time and may be set with defaults.
commandFilename[common.CommonProvider.Runtime] = fmt.Sprintf("journalctl -xeu %s --no-pager", common.CommonProvider.Runtime)
if d.TargetProvider == "vpc" {
sshUser = "k8s-admin"
commandFilename["dmesg"] = "sudo dmesg"
commandFilename[common.CommonProvider.Runtime] = fmt.Sprintf("sudo journalctl -xeu %s --no-pager", common.CommonProvider.Runtime)
}

klog.Infof("Collecting cluster logs under %s", d.logsDir)
// create a directory based on the generated path: _rundir/dump-cluster-logs
Expand Down Expand Up @@ -69,7 +75,7 @@ func (d *deployer) DumpClusterLogs() error {
"ssh",
"-i",
common.CommonProvider.SSHPrivateKey,
fmt.Sprintf("root@%s", machineIP),
fmt.Sprintf("%s@%s", sshUser, machineIP),
command,
}
klog.V(1).Infof("Remotely executing command: %s", commandArgs)
Expand Down