-
Notifications
You must be signed in to change notification settings - Fork 73
Expand file tree
/
Copy pathvalidation_test.go
More file actions
910 lines (841 loc) · 35 KB
/
validation_test.go
File metadata and controls
910 lines (841 loc) · 35 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
package e2e
import (
"context"
"errors"
"fmt"
"log"
"os/exec"
"path/filepath"
"regexp"
"strconv"
"strings"
"testing"
config "github.com/openshift/api/config/v1"
operators "github.com/operator-framework/api/pkg/operators/v2"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
batch "k8s.io/api/batch/v1"
certificates "k8s.io/api/certificates/v1"
core "k8s.io/api/core/v1"
rbac "k8s.io/api/rbac/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
meta "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/wait"
cloudproviderapi "k8s.io/cloud-provider/api"
"github.com/openshift/windows-machine-config-operator/controllers"
"github.com/openshift/windows-machine-config-operator/pkg/condition"
"github.com/openshift/windows-machine-config-operator/pkg/crypto"
"github.com/openshift/windows-machine-config-operator/pkg/csr/validation"
"github.com/openshift/windows-machine-config-operator/pkg/metadata"
nc "github.com/openshift/windows-machine-config-operator/pkg/nodeconfig"
"github.com/openshift/windows-machine-config-operator/pkg/retry"
"github.com/openshift/windows-machine-config-operator/pkg/secrets"
"github.com/openshift/windows-machine-config-operator/pkg/servicescm"
"github.com/openshift/windows-machine-config-operator/pkg/windows"
)
const (
	// wmcoContainerName is the name of the container in the deployment spec of the operator
	wmcoContainerName = "manager"
)

// versionRegex captures the version from the output of the WMCO version command
// example: captures `5.0.0-1b759bf1-dirty` from the string
// `windows-machine-config-operator version: "5.0.0-1b759bf1-dirty", go version: "go1.17.5 linux/amd64"`
var versionRegex = regexp.MustCompile(`version: "([^"]*)"`)

// winService contains information regarding a Windows service's current state
type winService struct {
	// state is the reported service state, e.g. "Running" or "Stopped"
	state string
	// description is the service description, which carries the WMCO managed tag for operator-owned services
	description string
}
// testNodesBecomeReadyAndSchedulable tests that all Windows nodes become ready and schedulable
func (tc *testContext) testNodesBecomeReadyAndSchedulable(t *testing.T) {
	for _, node := range gc.allNodes() {
		nodeName := node.GetName()
		t.Run(nodeName, func(t *testing.T) {
			// Poll the latest state of the node until it reports Ready and is not cordoned
			pollErr := wait.PollImmediate(retry.Interval, retry.ResourceChangeTimeout, func() (bool, error) {
				latest, getErr := tc.client.K8s.CoreV1().Nodes().Get(context.TODO(), nodeName, meta.GetOptions{})
				require.NoError(t, getErr)
				return tc.nodeReadyAndSchedulable(*latest), nil
			})
			assert.NoError(t, pollErr)
		})
	}
}
// nodeReadyAndSchedulable returns true if the node is both ready and is not marked as unschedulable
func (tc *testContext) nodeReadyAndSchedulable(node core.Node) bool {
	// Find the Ready condition and require its status to be True. Only the Ready condition's status is
	// inspected: other condition types (e.g. MemoryPressure) are normally False and must not fail this
	// check. The previous logic checked the status of every condition following Ready in the list, which
	// only worked because Ready is conventionally the last condition reported.
	readyCondition := false
	for _, condition := range node.Status.Conditions {
		if condition.Type != core.NodeReady {
			continue
		}
		readyCondition = true
		if condition.Status != core.ConditionTrue {
			log.Printf("node %v is expected to be in Ready state", node.Name)
			return false
		}
	}
	if !readyCondition {
		log.Printf("expected node Status to have condition type Ready for node %v", node.Name)
		return false
	}
	// this taint is applied by WMCO at some point after WICD configures the node
	for _, taint := range node.Spec.Taints {
		if taint.Key == cloudproviderapi.TaintExternalCloudProvider && taint.Effect == core.TaintEffectNoSchedule {
			log.Printf("expected node %s to not have the external cloud provider taint", node.GetName())
			return false
		}
	}
	// WMCO will uncordon the node at some point after WICD configures it
	if node.Spec.Unschedulable {
		log.Printf("expected node %s to be schedulable", node.Name)
		return false
	}
	return true
}
// testKubeletPriorityClass tests if kubelet priority class is set to "AboveNormal"
func (tc *testContext) testKubeletPriorityClass(t *testing.T) {
	const requiredPriorityClass = "AboveNormal"
	nodes := gc.allNodes()
	require.Greater(t, len(nodes), 0, "test requires at least one Windows node to run")
	for i := range nodes {
		node := nodes[i]
		t.Run(node.Name, func(t *testing.T) {
			// Query the kubelet process on the instance and verify its Windows priority class
			out, err := tc.getKubeletPriorityClass(&node)
			require.NoError(t, err, "error getting kubelet priority class")
			assert.Containsf(t, out, requiredPriorityClass, "node %s missing required kubelet priority class",
				node.GetName())
		})
	}
}
// getKubeletPriorityClass returns the priority class of the kubelet service
func (tc *testContext) getKubeletPriorityClass(node *core.Node) (string, error) {
	addr, err := controllers.GetAddress(node.Status.Addresses)
	if err != nil {
		return "", fmt.Errorf("error getting node address: %w", err)
	}
	// PowerShell query run on the instance over SSH
	const command = "Get-Process kubelet | Select-Object PriorityClass"
	output, err := tc.runPowerShellSSHJob("kubelet-priority-class-query", command, addr)
	if err != nil {
		return "", fmt.Errorf("error querying kubelet service for priority class: %w", err)
	}
	return output, nil
}
// testNodeMetadata tests if all nodes have a worker label and are annotated with the version of
// the currently deployed WMCO
func (tc *testContext) testNodeMetadata(t *testing.T) {
	// The version annotation on each node must match the version the running operator reports
	operatorVersion, err := getWMCOVersion()
	require.NoError(t, err, "could not get WMCO version")
	// The public key hash annotation must match the hash of the key pair the operator is using
	_, pubKey, err := tc.getExpectedKeyPair()
	require.NoError(t, err, "error getting the expected public/private key pair")
	pubKeyAnnotation := nc.CreatePubKeyHashAnnotation(pubKey)
	for _, node := range gc.allNodes() {
		t.Run(node.GetName()+" Validation Tests", func(t *testing.T) {
			// The worker label is not actually added by WMCO however we would like to validate if the Machine Api is
			// properly adding the worker label, if it was specified in the MachineSet. The MachineSet created in the
			// test suite has the worker label
			t.Run("Worker Label", func(t *testing.T) {
				assert.Contains(t, node.Labels, nc.WorkerLabel, "expected node label %s was not present on %s",
					nc.WorkerLabel, node.GetName())
			})
			t.Run("Version Annotation", func(t *testing.T) {
				require.Containsf(t, node.Annotations, metadata.VersionAnnotation, "node %s missing version annotation",
					node.GetName())
				assert.Equal(t, operatorVersion, node.Annotations[metadata.VersionAnnotation],
					"WMCO version annotation mismatch")
			})
			t.Run("Public Key Annotation", func(t *testing.T) {
				require.Containsf(t, node.Annotations, nc.PubKeyHashAnnotation, "node %s missing public key annotation",
					node.GetName())
				assert.Equal(t, pubKeyAnnotation, node.Annotations[nc.PubKeyHashAnnotation],
					"public key annotation mismatch")
			})
		})
	}
	// Windows-specific metadata must never leak onto Linux nodes
	t.Run("Windows node metadata not applied to Linux nodes", func(t *testing.T) {
		nodes, err := tc.client.K8s.CoreV1().Nodes().List(context.TODO(), meta.ListOptions{
			LabelSelector: core.LabelOSStable + "=linux"})
		require.NoError(t, err, "error listing Linux nodes")
		for _, node := range nodes.Items {
			assert.NotContainsf(t, node.Annotations, metadata.VersionAnnotation,
				"version annotation applied to Linux node %s", node.GetName())
			assert.NotContainsf(t, node.Annotations, nc.PubKeyHashAnnotation,
				"public key annotation applied to Linux node %s", node.GetName())
		}
	})
}
// getWMCOVersion returns the version that the operator reports, by exec'ing the version subcommand
// inside the running operator deployment and parsing its output.
func getWMCOVersion() (string, error) {
	cmd := exec.Command("oc", "exec", "deploy/windows-machine-config-operator", "-n", wmcoNamespace, "--",
		"/usr/local/bin/windows-machine-config-operator", "version")
	out, err := cmd.Output()
	if err != nil {
		// cmd.Output only populates Stderr inside the returned *exec.ExitError
		var exitError *exec.ExitError
		stderr := ""
		if errors.As(err, &exitError) {
			stderr = string(exitError.Stderr)
		}
		// The previous message labelled the whole error value an "exit code"; wrap the error
		// instead so callers can inspect it with errors.As/Is.
		return "", fmt.Errorf("oc exec failed with output: %s: stderr: %s: %w", string(out), stderr, err)
	}
	// out is formatted like:
	// windows-machine-config-operator version: "5.0.0-1b759bf1-dirty", go version: "go1.17.5 linux/amd64"
	matches := versionRegex.FindStringSubmatch(string(out))
	if len(matches) < 2 {
		return "", fmt.Errorf("could not parse version from '%s'", string(out))
	}
	return matches[1], nil
}
// testNodeTaint tests if the Windows node has the Windows taint
func (tc *testContext) testNodeTaint(t *testing.T) {
	// windowsTaint is the taint that needs to be applied to the Windows node
	windowsTaint := core.Taint{
		Key:    "os",
		Value:  "Windows",
		Effect: core.TaintEffectNoSchedule,
	}
	for _, node := range gc.allNodes() {
		hasTaint := false
		for _, taint := range node.Spec.Taints {
			if taint.Key == windowsTaint.Key && taint.Value == windowsTaint.Value &&
				taint.Effect == windowsTaint.Effect {
				hasTaint = true
				break
			}
		}
		assert.Equal(t, hasTaint, true, "expected Windows Taint to be present on the Node: %s", node.GetName())
	}
}
// ensureTestRunnerSA ensures the proper ServiceAccount exists, a requirement for SSHing into a Windows node
// noop if the ServiceAccount already exists.
func (tc *testContext) ensureTestRunnerSA() error {
	serviceAccount := core.ServiceAccount{ObjectMeta: meta.ObjectMeta{Name: tc.workloadNamespace}}
	_, err := tc.client.K8s.CoreV1().ServiceAccounts(tc.workloadNamespace).Create(context.TODO(),
		&serviceAccount, meta.CreateOptions{})
	// An AlreadyExists error means a previous run created it, which is fine
	if err == nil || apierrors.IsAlreadyExists(err) {
		return nil
	}
	return fmt.Errorf("unable to create SA: %w", err)
}
// ensureTestRunnerRole ensures the proper Role exists, a requirement for SSHing into a Windows node
// noop if the Role already exists.
func (tc *testContext) ensureTestRunnerRole(ctx context.Context) error {
	// Grants use of the hostnetwork SCC, needed by the SSH job pods
	role := rbac.Role{
		TypeMeta:   meta.TypeMeta{},
		ObjectMeta: meta.ObjectMeta{Name: tc.workloadNamespace},
		Rules: []rbac.PolicyRule{
			{
				Verbs:         []string{"use"},
				APIGroups:     []string{"security.openshift.io"},
				Resources:     []string{"securitycontextconstraints"},
				ResourceNames: []string{"hostnetwork"},
			},
		},
	}
	_, err := tc.client.K8s.RbacV1().Roles(tc.workloadNamespace).Create(ctx, &role, meta.CreateOptions{})
	// An AlreadyExists error means a previous run created it, which is fine
	if err == nil || apierrors.IsAlreadyExists(err) {
		return nil
	}
	return fmt.Errorf("unable to create role: %w", err)
}
// ensureTestRunnerClusterRole ensures the proper ClusterRole exists, a requirement for listing Windows node
// noop if the ClusterRole already exists. Nodes are cluster-scoped resources, and only ClusterRoles
// can grant permissions to cluster-scoped resources.
func (tc *testContext) ensureTestRunnerClusterRole(ctx context.Context) error {
	clusterRole := rbac.ClusterRole{
		TypeMeta: meta.TypeMeta{},
		ObjectMeta: meta.ObjectMeta{
			Name: tc.workloadNamespace,
		},
		Rules: []rbac.PolicyRule{
			{
				Resources: []string{"nodes"},
				APIGroups: []string{""},
				Verbs:     []string{"get", "list"},
			},
		},
	}
	_, err := tc.client.K8s.RbacV1().ClusterRoles().Create(ctx, &clusterRole, meta.CreateOptions{})
	if err != nil && !apierrors.IsAlreadyExists(err) {
		// The error previously said "role", which made failures here indistinguishable
		// from failures creating the namespaced Role
		return fmt.Errorf("unable to create cluster role: %w", err)
	}
	return nil
}
// ensureTestRunnerRoleBinding ensures the proper RoleBinding exists, a requirement for SSHing into a Windows node
// noop if the RoleBinding already exists.
func (tc *testContext) ensureTestRunnerRoleBinding(ctx context.Context) error {
	// Binds the test runner ServiceAccount to the Role of the same name in the workload namespace
	rb := rbac.RoleBinding{
		TypeMeta:   meta.TypeMeta{},
		ObjectMeta: meta.ObjectMeta{Name: tc.workloadNamespace},
		Subjects: []rbac.Subject{{
			Kind:      "ServiceAccount",
			APIGroup:  "",
			Name:      tc.workloadNamespace,
			Namespace: tc.workloadNamespace,
		}},
		RoleRef: rbac.RoleRef{
			APIGroup: "rbac.authorization.k8s.io",
			Kind:     "Role",
			Name:     tc.workloadNamespace,
		},
	}
	_, err := tc.client.K8s.RbacV1().RoleBindings(tc.workloadNamespace).Create(ctx, &rb, meta.CreateOptions{})
	if err != nil && !apierrors.IsAlreadyExists(err) {
		// The error previously said "role", misidentifying which resource failed
		return fmt.Errorf("unable to create role binding: %w", err)
	}
	return nil
}
// ensureTestRunnerClusterRoleBinding ensures the proper ClusterRoleBinding exists, a requirement for listing
// Windows nodes, noop if the ClusterRoleBinding already exists.
func (tc *testContext) ensureTestRunnerClusterRoleBinding(ctx context.Context) error {
	// Binds the test runner ServiceAccount to the ClusterRole of the same name
	crb := rbac.ClusterRoleBinding{
		TypeMeta: meta.TypeMeta{},
		ObjectMeta: meta.ObjectMeta{
			Name: tc.workloadNamespace,
		},
		Subjects: []rbac.Subject{{
			Kind:      rbac.ServiceAccountKind,
			Name:      tc.workloadNamespace,
			Namespace: tc.workloadNamespace,
		}},
		RoleRef: rbac.RoleRef{
			APIGroup: rbac.GroupName,
			Kind:     "ClusterRole",
			Name:     tc.workloadNamespace,
		},
	}
	_, err := tc.client.K8s.RbacV1().ClusterRoleBindings().Create(ctx, &crb, meta.CreateOptions{})
	if err != nil && !apierrors.IsAlreadyExists(err) {
		// The error previously said "cluster role", misidentifying which resource failed
		return fmt.Errorf("unable to create cluster role binding: %w", err)
	}
	return nil
}
// ensureTestRunnerRBAC creates the RBAC resources required for the test runner service account
func (tc *testContext) ensureTestRunnerRBAC() error {
	ctx := context.TODO()
	if err := tc.ensureTestRunnerSA(); err != nil {
		return fmt.Errorf("error ensuring SA created: %w", err)
	}
	// The cluster-scoped messages previously said Role/RoleBinding, making failures here
	// indistinguishable from the namespaced resources below
	if err := tc.ensureTestRunnerClusterRole(ctx); err != nil {
		return fmt.Errorf("error ensuring ClusterRole created: %w", err)
	}
	if err := tc.ensureTestRunnerClusterRoleBinding(ctx); err != nil {
		return fmt.Errorf("error ensuring ClusterRoleBinding created: %w", err)
	}
	if err := tc.ensureTestRunnerRole(ctx); err != nil {
		return fmt.Errorf("error ensuring Role created: %w", err)
	}
	if err := tc.ensureTestRunnerRoleBinding(ctx); err != nil {
		return fmt.Errorf("error ensuring RoleBinding created: %w", err)
	}
	return nil
}
// runPowerShellSSHJob creates and waits for a Kubernetes job to run. The command provided will be executed through
// PowerShell, on the host specified by the provided IP.
func (tc *testContext) runPowerShellSSHJob(name, command, ip string) (string, error) {
	// Modify command to work when default shell is the newer Powershell version present on Windows Server 2022 and
	// later
	powershellDefaultCommand := strings.ReplaceAll(command, "\\\"", "\"")
	// keyMountDir must match the mount path of the private-key volume created by runJob
	keyMountDir := "/private-key"
	sshOptions := "-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null"
	sshCommand := []string{"bash", "-c",
		fmt.Sprintf(
			// first determine if the host has PowerShell or cmd as the default shell by running a simple PowerShell
			// command. If it succeeds, then the host's default shell is PowerShell
			"if ssh "+sshOptions+" -i %s %s@%s 'Get-Help';"+
				"then CMD_PREFIX=\"\";CMD_SUFFIX=\"\";"+
				// to respect quoting within the given command, wrap the command as a script block
				"COMMAND='{"+powershellDefaultCommand+"}';"+
				// if PowerShell is not the default shell, explicitly run the unmodified command through PowerShell
				"else CMD_PREFIX=\""+remotePowerShellCmdPrefix+" \\\"\";CMD_SUFFIX=\"\\\"\";"+
				"COMMAND='{"+command+"}';"+
				"fi;"+
				// execute the command as a script block via the PowerShell call operator `&`
				"ssh "+sshOptions+" -i %s %s@%s ${CMD_PREFIX}\" & $COMMAND \"${CMD_SUFFIX}",
			// the same key path, user and IP are substituted twice: once for the probe ssh, once for the real one
			filepath.Join(keyMountDir, secrets.PrivateKeySecretKey), tc.vmUsername(), ip,
			filepath.Join(keyMountDir, secrets.PrivateKeySecretKey), tc.vmUsername(), ip)}
	return tc.runJob(name, sshCommand)
}
// runJob creates and waits for a Kubernetes job to run. The command provided will be executed on a Linux worker,
// using the tools image.
func (tc *testContext) runJob(name string, command []string) (string, error) {
	// Create a job which runs the provided command via SSH
	// keyMountDir is where the private key secret is mounted; must agree with runPowerShellSSHJob
	keyMountDir := "/private-key"
	// 0600: SSH refuses keys readable by anyone but the owner
	keyMode := int32(0600)
	job := &batch.Job{
		ObjectMeta: meta.ObjectMeta{
			// GenerateName avoids collisions when the same test runs a job more than once
			GenerateName: name + "-job-",
		},
		Spec: batch.JobSpec{
			Template: core.PodTemplateSpec{
				Spec: core.PodSpec{
					OS:            &core.PodOS{Name: core.Linux},
					HostNetwork:   true,
					RestartPolicy: core.RestartPolicyNever,
					// Runs as the test runner SA so the hostnetwork SCC applies
					ServiceAccountName: tc.workloadNamespace,
					Containers: []core.Container{
						{
							Name:            name,
							Image:           tc.toolsImage,
							ImagePullPolicy: core.PullIfNotPresent,
							Command:         command,
							VolumeMounts: []core.VolumeMount{{
								Name:      "private-key",
								MountPath: keyMountDir,
							}},
						},
					},
					// Mount the cluster's private key secret so the container can SSH to instances
					Volumes: []core.Volume{{Name: "private-key", VolumeSource: core.VolumeSource{
						Secret: &core.SecretVolumeSource{
							SecretName:  secrets.PrivateKeySecret,
							DefaultMode: &keyMode,
						},
					}}},
				},
			},
		},
	}
	jobsClient := tc.client.K8s.BatchV1().Jobs(tc.workloadNamespace)
	job, err := jobsClient.Create(context.TODO(), job, meta.CreateOptions{})
	if err != nil {
		return "", fmt.Errorf("error creating job: %w", err)
	}
	// Wait for the job to complete then gather and return the pod output
	logs, err := tc.waitUntilJobSucceeds(job.GetName())
	if err != nil {
		return "", fmt.Errorf("error waiting for job to succeed: %w", err)
	}
	return logs, nil
}
// getWinServices returns a map of Windows services from the instance with the given address, the map key being the
// service's name
func (tc *testContext) getWinServices(addr string) (map[string]winService, error) {
	// This command returns CR+newline separated quoted CSV entries consisting of service name, state and description.
	// For example: "kubelet","Running","OpenShift managed kubelet"\r\n"VaultSvc","Stopped",
	command := "Get-CimInstance -ClassName Win32_Service | Select-Object -Property Name,State,Description | " +
		"ConvertTo-Csv -NoTypeInformation"
	out, err := tc.runPowerShellSSHJob("get-windows-svc-list", command, addr)
	if err != nil {
		return nil, fmt.Errorf("error running SSH job: %w", err)
	}
	// Remove the header and trailing whitespace from the command output.
	// SplitAfterN(..., 2) also discards any SSH/job noise printed before the CSV header.
	outSplit := strings.SplitAfterN(out, "\"Name\",\"State\",\"Description\"\r\n", 2)
	if len(outSplit) != 2 {
		return nil, fmt.Errorf("unexpected command output: %s", out)
	}
	trimmedList := strings.TrimSpace(outSplit[1])
	// Make a map from the services, removing the quotes around each entry
	services := make(map[string]winService)
	lines := strings.Split(trimmedList, "\r\n")
	for _, line := range lines {
		// Split into 3 substrings, Name, State, Description. The description can contain a comma, so SplitN is required
		fields := strings.SplitN(line, ",", 3)
		if len(fields) != 3 {
			return nil, fmt.Errorf("expected comma separated values, found: %s", line)
		}
		name := strings.Trim(fields[0], "\"")
		state := strings.Trim(fields[1], "\"")
		description := strings.Trim(fields[2], "\"")
		services[name] = winService{state: state, description: description}
	}
	return services, nil
}
// testExpectedServicesRunning tests that for each node all the expected services are running
func (tc *testContext) testExpectedServicesRunning(t *testing.T) {
	expectedSvcs, err := tc.expectedWindowsServices(windows.RequiredServices)
	require.NoError(t, err)
	for _, node := range gc.allNodes() {
		t.Run(node.GetName(), func(t *testing.T) {
			addr, addrErr := controllers.GetAddress(node.Status.Addresses)
			require.NoError(t, addrErr, "unable to get node address")
			actualSvcs, svcErr := tc.getWinServices(addr)
			require.NoError(t, svcErr, "error getting service map")
			for svcName, shouldBeRunning := range expectedSvcs {
				t.Run(svcName, func(t *testing.T) {
					if !shouldBeRunning {
						// service must not be installed on this platform
						require.NotContains(t, actualSvcs, svcName, "service exists when it shouldn't")
						return
					}
					require.Contains(t, actualSvcs, svcName, "service not found")
					assert.Equal(t, "Running", actualSvcs[svcName].state)
					assert.Contains(t, actualSvcs[svcName].description, windows.ManagedTag)
				})
			}
		})
	}
}
// expectedWindowsServices returns a map of the names of the WMCO owned Windows services, with a value indicating if it
// should or should not be running on the instance.
func (tc *testContext) expectedWindowsServices(alwaysRequiredSvcs []string) (map[string]bool, error) {
	serviceMap := make(map[string]bool, len(alwaysRequiredSvcs)+1)
	for _, svc := range alwaysRequiredSvcs {
		serviceMap[svc] = true
	}
	// The Azure cloud node manager service should exist only on Azure platforms
	serviceMap[windows.AzureCloudNodeManagerServiceName] = tc.CloudProvider.GetType() == config.AzurePlatformType
	return serviceMap, nil
}
// testServicesConfigMap tests multiple aspects of expected functionality for the services ConfigMap
// 1. It exists on operator startup 2. It is re-created when deleted 3. It is recreated if invalid contents are detected
func (tc *testContext) testServicesConfigMap(t *testing.T) {
	operatorVersion, err := getWMCOVersion()
	require.NoError(t, err)
	servicesConfigMapName := servicescm.NamePrefix + operatorVersion
	// Ensure the windows-services ConfigMap exists in the cluster
	var cmData *servicescm.Data
	t.Run("Services ConfigMap contents", func(t *testing.T) {
		// Get CM and parse data
		cm, err := tc.client.K8s.CoreV1().ConfigMaps(wmcoNamespace).Get(context.TODO(), servicesConfigMapName,
			meta.GetOptions{})
		require.NoErrorf(t, err, "error ensuring ConfigMap %s exists", servicesConfigMapName)
		cmData, err = servicescm.Parse(cm.Data)
		require.NoError(t, err, "unable to parse ConfigMap data")
		// Check that only the expected services are defined within the CM data. WICD itself should not be defined in it
		expectedSvcs, err := tc.expectedWindowsServices(windows.RequiredServices)
		// The error must be checked before using the map: on error the map may be nil,
		// and writing to a nil map panics
		require.NoError(t, err)
		expectedSvcs[windows.WicdServiceName] = false
		for svcName, shouldBeInConfigMap := range expectedSvcs {
			t.Run(svcName, func(t *testing.T) {
				assert.Equalf(t, shouldBeInConfigMap, containsService(svcName, cmData.Services),
					"service existence should be %t", shouldBeInConfigMap)
			})
		}
	})
	t.Run("Services ConfigMap re-creation", func(t *testing.T) {
		err = tc.testServicesCMRegeneration(servicesConfigMapName, cmData)
		assert.NoErrorf(t, err, "error ensuring ConfigMap %s is re-created when deleted", servicesConfigMapName)
	})
	t.Run("Invalid services ConfigMap deletion", func(t *testing.T) {
		err = tc.testInvalidServicesCM(servicesConfigMapName, cmData)
		assert.NoError(t, err, "error testing handling of invalid ConfigMap")
	})
}
// containsService returns true if the given service exists within the services list
func containsService(name string, services []servicescm.Service) bool {
	for i := range services {
		if services[i].Name == name {
			return true
		}
	}
	return false
}
// testServicesCMRegeneration tests that if the services ConfigMap is deleted, a valid one is re-created in its place
func (tc *testContext) testServicesCMRegeneration(cmName string, expected *servicescm.Data) error {
	// Delete the ConfigMap and expect the operator to recreate it with valid contents
	if err := tc.client.K8s.CoreV1().ConfigMaps(wmcoNamespace).Delete(context.TODO(), cmName,
		meta.DeleteOptions{}); err != nil {
		return err
	}
	_, err := tc.waitForValidWindowsServicesConfigMap(cmName, expected)
	return err
}
// testInvalidServicesCM tests that an invalid services ConfigMap is deleted and a valid one is re-created in its place
func (tc *testContext) testInvalidServicesCM(cmName string, expected *servicescm.Data) error {
	// Scale down the WMCO deployment to 0 so the operator cannot interfere while the invalid CM is planted
	if err := tc.scaleWMCODeployment(0); err != nil {
		return err
	}
	// Delete existing services CM
	err := tc.client.K8s.CoreV1().ConfigMaps(wmcoNamespace).Delete(context.TODO(), cmName, meta.DeleteOptions{})
	if err != nil {
		return err
	}
	// Generate and create a service CM with incorrect data
	invalidServicesCM, err := servicescm.Generate(cmName, wmcoNamespace,
		&servicescm.Data{Services: []servicescm.Service{{Name: "fakeservice", Bootstrap: true}},
			Files: []servicescm.FileInfo{}})
	if err != nil {
		return err
	}
	if _, err := tc.client.K8s.CoreV1().ConfigMaps(wmcoNamespace).Create(context.TODO(), invalidServicesCM,
		meta.CreateOptions{}); err != nil {
		return err
	}
	// Restart the operator pod; on startup it is expected to detect and replace the invalid CM
	if err := tc.scaleWMCODeployment(1); err != nil {
		return err
	}
	// Try to retrieve newly created ConfigMap and validate its contents
	_, err = tc.waitForValidWindowsServicesConfigMap(cmName, expected)
	if err != nil {
		return fmt.Errorf("error waiting for valid ConfigMap %s: %w", cmName, err)
	}
	return nil
}
// waitForValidWindowsServicesConfigMap returns a reference to the ConfigMap that matches the given name.
// If a ConfigMap with valid contents is not found within the time limit, an error is returned.
func (tc *testContext) waitForValidWindowsServicesConfigMap(cmName string,
	expected *servicescm.Data) (*core.ConfigMap, error) {
	configMap := &core.ConfigMap{}
	err := wait.PollImmediate(retry.Interval, retry.ResourceChangeTimeout, func() (bool, error) {
		var err error
		configMap, err = tc.client.K8s.CoreV1().ConfigMaps(wmcoNamespace).Get(context.TODO(), cmName, meta.GetOptions{})
		if err != nil {
			if apierrors.IsNotFound(err) {
				// Retry if the Get() results in a IsNotFound error
				return false, nil
			}
			// any other API error is terminal for the poll
			return false, fmt.Errorf("error retrieving ConfigMap: %s: %w", cmName, err)
		}
		// Here, we've retrieved a ConfigMap but still need to ensure it is valid.
		// If it's not valid, retry in hopes that WMCO will replace it with a valid one as expected.
		data, err := servicescm.Parse(configMap.Data)
		if err != nil {
			log.Printf("error parsing %s data: %v", cmName, err)
			return false, nil
		}
		if err = data.ValidateExpectedContent(expected); err != nil {
			log.Printf("error validating %s data: %v", cmName, err)
			return false, nil
		}
		return true, nil
	})
	if err != nil {
		return nil, fmt.Errorf("error waiting for ConfigMap %s/%s: %w", wmcoNamespace, cmName, err)
	}
	return configMap, nil
}
// testCSRApproval tests if the BYOH CSR's have been approved by WMCO CSR approver
func (tc *testContext) testCSRApproval(t *testing.T) {
	if gc.numberOfBYOHNodes == 0 {
		t.Skip("BYOH CSR approval testing disabled")
	}
	for _, node := range gc.byohNodes {
		csrs, err := tc.findNodeCSRs(node.GetName())
		// The error must be checked before asserting on csrs; the previous ordering
		// inspected the result of a call that may have failed
		require.NoError(t, err, "could not find BYOH node CSR's")
		require.NotEqual(t, 0, len(csrs), "could not find BYOH node CSR's")
		for _, csr := range csrs {
			// A CSR counts as WMCO-approved when its Approved condition carries the WMCOApprove reason
			isWMCOApproved := func() bool {
				for _, c := range csr.Status.Conditions {
					if c.Type == certificates.CertificateApproved && c.Reason == "WMCOApprove" {
						return true
					}
				}
				return false
			}()
			assert.Equal(t, isWMCOApproved, true, "expected BYOH node %s CSR %s to be approved by WMCO CSR approver",
				node.GetName(), csr.GetName())
		}
	}
	// Revert changes to the cluster machine approver
	err := tc.scaleMachineApprover(1)
	require.NoError(t, err, "failed to scale up Cluster Machine Approver pods")
}
// findNodeCSRs returns the list of CSRs for the given node
func (tc *testContext) findNodeCSRs(nodeName string) ([]certificates.CertificateSigningRequest, error) {
	var nodeCSRs []certificates.CertificateSigningRequest
	csrs, err := tc.client.K8s.CertificatesV1().CertificateSigningRequests().List(context.TODO(),
		meta.ListOptions{})
	if err != nil {
		return nil, fmt.Errorf("unable to get CSR list: %w", err)
	}
	for _, c := range csrs.Items {
		// In some cases, a CSR is left in pending state when a new CSR is created
		// for a node too quickly before updating the status of the existing one.
		// Such a CSR cannot be approved but it does not affect node configuration
		// and is safe to be ignored.
		if len(c.Status.Conditions) == 0 {
			continue
		}
		parsedCSR, err := validation.ParseCSR(c.Spec.Request)
		if err != nil {
			return nil, err
		}
		// The node name is the common name with the kubelet user prefix stripped
		dnsAddr := strings.TrimPrefix(parsedCSR.Subject.CommonName, validation.NodeUserNamePrefix)
		if dnsAddr == "" {
			// Previously this returned a nil error, silently aborting the search
			return nil, fmt.Errorf("CSR %s subject common name %q yields an empty node name",
				c.GetName(), parsedCSR.Subject.CommonName)
		}
		if dnsAddr == nodeName {
			nodeCSRs = append(nodeCSRs, c)
		}
	}
	return nodeCSRs, nil
}
// validateUpgradeableCondition ensures that the operator's Upgradeable condition is correctly communicated to OLM
func (tc *testContext) validateUpgradeableCondition(expected meta.ConditionStatus) error {
	// The OperatorCondition name comes from an env var in the operator deployment
	ocName, err := tc.getOperatorConditionName()
	if err != nil {
		return err
	}
	// Poll until the OperatorCondition reports the expected Upgradeable value in both spec and status
	err = wait.Poll(retry.Interval, retry.Timeout, func() (bool, error) {
		oc, err := tc.client.Olm.OperatorsV2().OperatorConditions(wmcoNamespace).Get(context.TODO(), ocName, meta.GetOptions{})
		if err != nil {
			// retry on any error fetching the OperatorCondition
			log.Printf("unable to get OperatorCondition %s from namespace %s", ocName, wmcoNamespace)
			return false, nil
		}
		// both the spec conditions and the status conditions must show the expected value
		specCheck := condition.Validate(oc.Spec.Conditions, operators.Upgradeable, expected)
		statusCheck := condition.Validate(oc.Status.Conditions, operators.Upgradeable, expected)
		return specCheck && statusCheck, nil
	})
	if err != nil {
		return fmt.Errorf("failed to verify condition type %s has state %s: %w", operators.Upgradeable, expected, err)
	}
	return nil
}
// getOperatorConditionName returns the operator condition name using the env var present in the deployment
func (tc *testContext) getOperatorConditionName() (string, error) {
	deployment, err := tc.client.K8s.AppsV1().Deployments(wmcoNamespace).Get(context.TODO(), resourceName,
		meta.GetOptions{})
	if err != nil {
		return "", fmt.Errorf("error getting operator deployment: %w", err)
	}
	// Scan the manager container's env vars for the operator condition name
	for _, c := range deployment.Spec.Template.Spec.Containers {
		if c.Name == wmcoContainerName {
			for _, env := range c.Env {
				if env.Name == condition.OperatorConditionName {
					return env.Value, nil
				}
			}
		}
	}
	return "", fmt.Errorf("unable to get operatorCondition name from namespace %s", wmcoNamespace)
}
// testNodeAnnotations tests that all required annotations are on each Windows node
func (tc *testContext) testNodeAnnotations(t *testing.T) {
	requiredAnnotations := []string{nc.HybridOverlaySubnet, nc.HybridOverlayMac, metadata.VersionAnnotation,
		nc.PubKeyHashAnnotation}
	for _, node := range gc.allNodes() {
		t.Run(node.GetName(), func(t *testing.T) {
			for _, annotation := range requiredAnnotations {
				assert.Contains(t, node.Annotations, annotation, "node missing expected annotation: %s", annotation)
			}
			// the username and public key annotations also need valid, decipherable values
			usernameOK, err := tc.checkUsernameAnnotation(&node)
			require.NoError(t, err)
			assert.True(t, usernameOK)
			pubKeyOK, err := tc.checkPubKeyAnnotation(&node)
			require.NoError(t, err)
			assert.True(t, pubKeyOK)
		})
	}
}
// testNodeCloudManagerFields tests that all expected labels and fields are on each Windows node configured by
// the cloud node manager
func (tc *testContext) testNodeCloudManagerFields(t *testing.T) {
	if tc.CloudProvider.GetType() == config.NonePlatformType {
		t.Skip("skipping test as cloud provider is not set")
	}
	// the expected label set depends only on the platform, so compute it once for all nodes
	expectedLabels := tc.getExpectedCloudProviderLabels()
	for _, node := range gc.allNodes() {
		t.Run(node.GetName(), func(t *testing.T) {
			// every cloud provider label must be present on the node
			for _, label := range expectedLabels {
				assert.Contains(t, node.Labels, label, "node missing expected label from cloud provider: %s", label)
			}
			// the providerID field must have been populated
			assert.Greater(t, len(node.Spec.ProviderID), 0, "node missing expected providerID field")
		})
	}
}
// getExpectedCloudProviderLabels returns a slice of strings containing common and platform-specific cloud provider labels
func (tc *testContext) getExpectedCloudProviderLabels() []string {
	// instance-type is expected on every platform
	labels := []string{core.LabelInstanceTypeStable}
	// vSphere is a special case where the topology labels are not set; every other
	// supported platform must have them
	if tc.CloudProvider.GetType() != config.VSpherePlatformType {
		labels = append(labels, core.LabelTopologyRegion, core.LabelTopologyZone)
	}
	return labels
}
// checkUsernameAnnotation checks that the username annotation value is decipherable and correct.
// Returns (false, nil) when the annotation is missing or decrypts to an unexpected username.
func (tc *testContext) checkUsernameAnnotation(node *core.Node) (bool, error) {
	privKey, _, err := tc.getExpectedKeyPair()
	if err != nil {
		return false, err
	}
	encryptedUsername, ok := node.Annotations[controllers.UsernameAnnotation]
	if !ok {
		return false, nil
	}
	// the annotation holds the username encrypted with the cluster key pair's public key
	decrypted, err := crypto.DecryptFromJSONString(encryptedUsername, privKey)
	if err != nil {
		return false, err
	}
	return decrypted == tc.vmUsername(), nil
}
// checkPubKeyAnnotation that node is annotated with the public key which matches the private key used to configure it
func (tc *testContext) checkPubKeyAnnotation(node *core.Node) (bool, error) {
	_, pubKey, err := tc.getExpectedKeyPair()
	if err != nil {
		return false, err
	}
	// recompute the hash annotation from the expected public key and compare against the node's value
	expected := nc.CreatePubKeyHashAnnotation(pubKey)
	return expected == node.Annotations[nc.PubKeyHashAnnotation], nil
}
// testDependentServiceChanges tests that a Windows service which a running service is dependent on can be reconfigured.
// It alters the hybrid-overlay-node service's binPath on each node and then waits for the original
// configuration to be restored.
func testDependentServiceChanges(t *testing.T) {
	tc, err := NewTestContext()
	require.NoError(t, err)
	// wait for both Machine-backed and BYOH nodes so every configured Windows node is exercised
	err = tc.waitForConfiguredWindowsNodes(gc.numberOfMachineNodes, false, false)
	require.NoError(t, err, "timed out waiting for Windows Machine nodes")
	err = tc.waitForConfiguredWindowsNodes(gc.numberOfBYOHNodes, false, true)
	require.NoError(t, err, "timed out waiting for BYOH Windows nodes")
	nodes := append(gc.machineNodes, gc.byohNodes...)
	// PowerShell command printing the binPath (PathName) of the hybrid-overlay-node Windows service;
	// backslash-escaped quotes survive being passed through the SSH job's command line
	queryCommand := "Get-WmiObject win32_service | Where-Object {$_.Name -eq \\\"hybrid-overlay-node\\\"} " +
		"|select -ExpandProperty PathName"
	for _, node := range nodes {
		t.Run(node.GetName(), func(t *testing.T) {
			// Get initial configuration of hybrid-overlay-node, this service is used as kube-proxy is dependent on it
			addr, err := controllers.GetAddress(node.Status.Addresses)
			require.NoError(t, err, "error getting node address")
			out, err := tc.runPowerShellSSHJob("hybrid-overlay-query", queryCommand, addr)
			require.NoError(t, err, "error querying hybrid-overlay service")
			// The binPath/pathName will be the final line of the pod logs
			originalPath := finalLine(out)
			// Change hybrid-overlay-node configuration by altering the loglevel argument in its command
			newPath, err := changeHybridOverlayCommandVerbosity(originalPath)
			require.NoError(t, err, "error constructing new hybrid-overlay command")
			changeCommand := fmt.Sprintf("sc.exe config hybrid-overlay-node binPath=\\\"%s\\\"", newPath)
			out, err = tc.runPowerShellSSHJob("hybrid-overlay-change", changeCommand, addr)
			require.NoError(t, err, "error changing hybrid-overlay command")
			// Wait until hybrid-overlay-node is returned to correct config
			err = wait.Poll(retry.Interval, retry.Timeout, func() (bool, error) {
				out, err = tc.runPowerShellSSHJob("hybrid-overlay-query2", queryCommand, addr)
				if err != nil {
					// transient SSH/job failures are retried rather than failing the poll
					return false, nil
				}
				currentPath := finalLine(out)
				if currentPath != originalPath {
					log.Printf("waiting for hybrid-overlay service config to be reconciled, current value: %s",
						currentPath)
					return false, nil
				}
				return true, nil
			})
			require.NoError(t, err)
		})
	}
}
// logLevelRegex finds the loglevel argument and captures the log level itself
var logLevelRegex = regexp.MustCompile(`loglevel(?:=|\s)(\d+\.?\d*)`)

// changeHybridOverlayCommandVerbosity will change the loglevel argument in the hybrid-overlay-node command to a
// different level. It takes the full command path of the hybrid-overlay-node service, and returns the command with
// altered arguments. An error is returned when no loglevel argument is found or its value is not an integer.
func changeHybridOverlayCommandVerbosity(in string) (string, error) {
	submatches := logLevelRegex.FindStringSubmatch(in)
	if len(submatches) != 2 {
		return "", fmt.Errorf("'%s' did not match expected argument format", in)
	}
	currentLevel, err := strconv.Atoi(submatches[1])
	if err != nil {
		return "", fmt.Errorf("could not convert %s into int: %w", submatches[1], err)
	}
	// decrement the level, except bump 0 up to 1; either way the value is guaranteed to change
	targetLevel := currentLevel - 1
	if currentLevel == 0 {
		targetLevel = 1
	}
	return logLevelRegex.ReplaceAllString(in, fmt.Sprintf("loglevel %d", targetLevel)), nil
}
// finalLine returns the contents of the final line of a given string,
// with surrounding whitespace removed.
func finalLine(s string) string {
	trimmed := strings.TrimSpace(s)
	// keep only the text after the last newline, if any
	if i := strings.LastIndex(trimmed, "\n"); i != -1 {
		trimmed = trimmed[i+1:]
	}
	return strings.TrimSpace(trimmed)
}