-
Notifications
You must be signed in to change notification settings - Fork 23
WIP: OCPBUGS-61756: ctrl: kubeletConfig: avoid updating ConfigMap on Paused MCP #2261
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 2 commits
1520766
72a8cac
38d6a91
b08d423
22533cb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -48,6 +48,7 @@ import ( | |
| "github.com/k8stopologyawareschedwg/deployer/pkg/deployer/platform" | ||
|
|
||
| nropv1 "github.com/openshift-kni/numaresources-operator/api/v1" | ||
| intkubeletconfig "github.com/openshift-kni/numaresources-operator/internal/kubeletconfig" | ||
| "github.com/openshift-kni/numaresources-operator/internal/machineconfigpools" | ||
| "github.com/openshift-kni/numaresources-operator/pkg/apply" | ||
| "github.com/openshift-kni/numaresources-operator/pkg/kubeletconfig" | ||
|
|
@@ -57,7 +58,8 @@ import ( | |
| ) | ||
|
|
||
| const ( | ||
| kubeletConfigRetryPeriod = 30 * time.Second | ||
| kubeletConfigRetryPeriod = 30 * time.Second | ||
| MachineConfigPoolPausedRetryPeriod = 2 * time.Minute | ||
| ) | ||
|
|
||
| const ( | ||
|
|
@@ -87,6 +89,12 @@ type kubeletConfigHandler struct { | |
| setCtrlRef func(owner, controlled metav1.Object, scheme *runtime.Scheme, opts ...controllerutil.OwnerReferenceOption) error | ||
| } | ||
|
|
||
| type reconcileErrorHandler struct { | ||
| err error | ||
| tolerateError bool | ||
| result ctrl.Result | ||
| } | ||
|
|
||
| // Namespace Scoped | ||
|
|
||
| // Cluster Scoped | ||
|
|
@@ -116,22 +124,27 @@ func (r *KubeletConfigReconciler) Reconcile(ctx context.Context, req ctrl.Reques | |
|
|
||
| // KubeletConfig changes are expected to be sporadic, yet are important enough | ||
| // to be made visible at kubernetes level. So we generate events to handle them | ||
| cm, err := r.reconcileConfigMap(ctx, instance, req.NamespacedName) | ||
| if err != nil { | ||
| cm, errHandler := r.reconcileConfigMap(ctx, instance, req.NamespacedName) | ||
| if errHandler.err != nil { | ||
| var klErr *InvalidKubeletConfig | ||
| if errors.As(err, &klErr) { | ||
| if errors.As(errHandler.err, &klErr) { | ||
| r.Recorder.Event(instance, "Normal", "ProcessSkip", "ignored kubelet config "+klErr.ObjectName) | ||
| return ctrl.Result{}, nil | ||
| return errHandler.result, nil | ||
| } | ||
| if errHandler.tolerateError { | ||
| r.Recorder.Event(instance, "Normal", "ProcessSkip", errHandler.err.Error()) | ||
| return errHandler.result, nil | ||
| } | ||
|
|
||
| klog.ErrorS(err, "failed to reconcile configmap", "controller", "kubeletconfig") | ||
| klog.ErrorS(errHandler.err, "failed to reconcile configmap", "controller", "kubeletconfig") | ||
|
|
||
| r.Recorder.Event(instance, "Warning", "ProcessFailed", "Failed to update RTE config from kubelet config "+req.NamespacedName.String()) | ||
| return ctrl.Result{}, err | ||
| return errHandler.result, errHandler.err | ||
| } | ||
|
|
||
| r.Recorder.Event(instance, "Normal", "ProcessOK", fmt.Sprintf("Updated RTE config %s/%s from kubelet config %s", cm.Namespace, cm.Name, req.NamespacedName.String())) | ||
| return ctrl.Result{}, nil | ||
| //return ctrl.Result{}, nil | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. leftovers I presume |
||
| return errHandler.result, nil | ||
| } | ||
|
|
||
| func (r *KubeletConfigReconciler) SetupWithManager(mgr ctrl.Manager) error { | ||
|
|
@@ -197,25 +210,29 @@ func (e *InvalidKubeletConfig) Unwrap() error { | |
| return e.Err | ||
| } | ||
|
|
||
| func (r *KubeletConfigReconciler) reconcileConfigMap(ctx context.Context, instance *nropv1.NUMAResourcesOperator, kcKey client.ObjectKey) (*corev1.ConfigMap, error) { | ||
| func (r *KubeletConfigReconciler) reconcileConfigMap(ctx context.Context, instance *nropv1.NUMAResourcesOperator, kcKey client.ObjectKey) (*corev1.ConfigMap, reconcileErrorHandler) { | ||
| // first check if the ConfigMap should be deleted | ||
| // to save all the additional work related for create/update | ||
| cm, deleted, err := r.deleteConfigMap(ctx, instance, kcKey) | ||
| if deleted { | ||
| return cm, err | ||
| return cm, reconcileErrorHandler{err: err} | ||
| } | ||
|
|
||
| kcHandler, err := r.makeKCHandlerForPlatform(ctx, instance, kcKey) | ||
| if err != nil { | ||
| return nil, err | ||
| kcHandler, errHandler := r.makeKCHandlerForPlatform(ctx, instance, kcKey) | ||
| if errHandler.err != nil { | ||
| return nil, errHandler | ||
| } | ||
| kubeletConfig, err := kubeletconfig.MCOKubeletConfToKubeletConf(kcHandler.mcoKc) | ||
| if err != nil { | ||
| klog.ErrorS(err, "cannot extract KubeletConfiguration from MCO KubeletConfig", "name", kcKey.Name) | ||
| return nil, err | ||
| return nil, reconcileErrorHandler{err: err} | ||
| } | ||
|
|
||
| return r.syncConfigMap(ctx, kubeletConfig, instance, kcHandler) | ||
| cm, err = r.syncConfigMap(ctx, kubeletConfig, instance, kcHandler) | ||
| if err != nil { | ||
| return nil, reconcileErrorHandler{err: err} | ||
| } | ||
| return cm, errHandler // FIXME use predicate | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is there a reason why not to implement predicate in this PR already? it's not suppose to be lot of code, so I guess we can squeeze it here as another small commit. |
||
| } | ||
|
|
||
| func (r *KubeletConfigReconciler) syncConfigMap(ctx context.Context, kubeletConfig *kubeletconfigv1beta1.KubeletConfiguration, instance *nropv1.NUMAResourcesOperator, kcHandler *kubeletConfigHandler) (*corev1.ConfigMap, error) { | ||
|
|
@@ -244,63 +261,128 @@ func (r *KubeletConfigReconciler) syncConfigMap(ctx context.Context, kubeletConf | |
| return rendered, nil | ||
| } | ||
|
|
||
| func (r *KubeletConfigReconciler) makeKCHandlerForPlatform(ctx context.Context, instance *nropv1.NUMAResourcesOperator, kcKey client.ObjectKey) (*kubeletConfigHandler, error) { | ||
| func (r *KubeletConfigReconciler) makeKCHandlerForPlatform(ctx context.Context, instance *nropv1.NUMAResourcesOperator, kcKey client.ObjectKey) (*kubeletConfigHandler, reconcileErrorHandler) { | ||
| switch r.Platform { | ||
| case platform.OpenShift: | ||
| mcoKc := &mcov1.KubeletConfig{} | ||
| if err := r.Client.Get(ctx, kcKey, mcoKc); err != nil { | ||
| return nil, err | ||
| return nil, reconcileErrorHandler{err: err} | ||
| } | ||
|
|
||
| mcps, err := machineconfigpools.GetListByNodeGroupsV1(ctx, r.Client, instance.Spec.NodeGroups) | ||
| if err != nil { | ||
| return nil, err | ||
| return nil, reconcileErrorHandler{err: err} | ||
| } | ||
|
|
||
| mcp, err := machineconfigpools.FindBySelector(mcps, mcoKc.Spec.MachineConfigPoolSelector) | ||
| if err != nil { | ||
| klog.ErrorS(err, "cannot find a matching mcp for MCO KubeletConfig", "name", kcKey.Name) | ||
| var notFound *machineconfigpools.NotFound | ||
| if errors.As(err, ¬Found) { | ||
| return nil, &InvalidKubeletConfig{ | ||
| return nil, reconcileErrorHandler{err: &InvalidKubeletConfig{ | ||
| ObjectName: kcKey.Name, | ||
| Err: notFound, | ||
| } | ||
| }} | ||
| } | ||
| return nil, err | ||
| return nil, reconcileErrorHandler{err: err} | ||
| } | ||
|
|
||
| klog.V(3).InfoS("matched MCP to MCO KubeletConfig", "kubeletconfig name", kcKey.Name, "MCP name", mcp.Name) | ||
|
|
||
| // nothing we care about, and we can't do much anyway | ||
| if mcoKc.Spec.KubeletConfig == nil { | ||
| klog.InfoS("detected KubeletConfig with empty payload, ignoring", "name", kcKey.Name) | ||
| return nil, &InvalidKubeletConfig{ObjectName: kcKey.Name} | ||
| return nil, reconcileErrorHandler{err: &InvalidKubeletConfig{ObjectName: kcKey.Name}} | ||
| } | ||
|
|
||
| if mcp.Spec.Paused { | ||
| klog.InfoS("detected paused MCP", "name", mcp.Name) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this will be printed at every reconciliation step it seems. Let's make it V(4) |
||
| //if the CM exists -> just skip; | ||
| //if the CM does not exist -> create it based on the current active machineConfig | ||
|
shajmakh marked this conversation as resolved.
|
||
|
|
||
| expectedCMName := objectnames.GetComponentName(instance.Name, mcp.Name) | ||
| existingCM := &corev1.ConfigMap{} | ||
| if err := r.Client.Get(ctx, client.ObjectKey{Namespace: r.Namespace, Name: expectedCMName}, existingCM); err != nil { | ||
| if !apierrors.IsNotFound(err) { | ||
| return nil, reconcileErrorHandler{ | ||
| err: err, | ||
| tolerateError: true, | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If we need to support the operation even when MCP are paused, why these errors are tolerate? |
||
| } | ||
| } | ||
|
|
||
| currentConfigName := mcp.Status.Configuration.Name | ||
| currentConfigObj := &mcov1.MachineConfig{} | ||
| if err := r.Client.Get(ctx, client.ObjectKey{Name: currentConfigName}, currentConfigObj); err != nil { | ||
| klog.ErrorS(err, "cannot find the machineConfig", "name", currentConfigName) | ||
| return nil, reconcileErrorHandler{ | ||
| err: fmt.Errorf("failed to find the current machineConfig %s: %v", currentConfigName, err), | ||
| tolerateError: true, | ||
| result: ctrl.Result{Requeue: true, RequeueAfter: MachineConfigPoolPausedRetryPeriod}, | ||
| } | ||
| } | ||
|
|
||
| // use local version of github.com/openshift/machine-config-operator/pkg/controller/common.ParseAndConvertConfig | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is just narrating in english what the code does. Either give context about why or remove the comment |
||
| _, dataInBytes, err := intkubeletconfig.ParseKubeletConfigRawData(currentConfigObj.Spec.Config.Raw) | ||
| if err != nil { | ||
| klog.ErrorS(err, "cannot parse the machineConfig", "name", currentConfigName) | ||
| return nil, reconcileErrorHandler{ | ||
| err: fmt.Errorf("failed to parse the machineConfig %s: %v", currentConfigName, err), | ||
| tolerateError: true, | ||
| result: ctrl.Result{Requeue: true, RequeueAfter: MachineConfigPoolPausedRetryPeriod}, | ||
| } | ||
| } | ||
|
|
||
| decodeKc, err := intkubeletconfig.DecodeKubeletConfigurationFromData(dataInBytes) | ||
| if err != nil { | ||
| klog.ErrorS(err, "cannot decode the KubeletConfig data from MachineConfig", "name", currentConfigName) | ||
| return nil, reconcileErrorHandler{ | ||
| err: fmt.Errorf("failed to decode the KubeletConfig data from MachineConfig %s: %v", currentConfigName, err), | ||
| tolerateError: true, | ||
| result: ctrl.Result{Requeue: true, RequeueAfter: MachineConfigPoolPausedRetryPeriod}, | ||
| } | ||
| } | ||
|
|
||
| return &kubeletConfigHandler{ | ||
| ownerObject: decodeKc, | ||
| mcoKc: decodeKc, | ||
| poolName: mcp.Name, | ||
| setCtrlRef: controllerutil.SetControllerReference, | ||
| }, reconcileErrorHandler{result: ctrl.Result{Requeue: true, RequeueAfter: MachineConfigPoolPausedRetryPeriod}} | ||
| } | ||
|
|
||
| klog.InfoS("MachineConfigPool of KubeletConfig %s is paused and configMap %s exists", kcKey.Name, existingCM.Name) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. please check verbosity to avoid log spam |
||
| return nil, reconcileErrorHandler{ | ||
| // the KubeletConfig has been already handled and we can skip the rest of reconciliation logic due to paused MCP | ||
| err: fmt.Errorf("MachineConfigPool of KubeletConfig %s is paused and configMap %s already exists", kcKey.Name, existingCM.Name), | ||
| tolerateError: true, | ||
| result: ctrl.Result{Requeue: true, RequeueAfter: MachineConfigPoolPausedRetryPeriod}, | ||
| } | ||
| } | ||
|
|
||
| return &kubeletConfigHandler{ | ||
| ownerObject: mcoKc, | ||
| mcoKc: mcoKc, | ||
| poolName: mcp.Name, | ||
| setCtrlRef: controllerutil.SetControllerReference, | ||
| }, nil | ||
| }, reconcileErrorHandler{} | ||
|
|
||
| case platform.HyperShift: | ||
| cmKc := &corev1.ConfigMap{} | ||
| if err := r.Client.Get(ctx, kcKey, cmKc); err != nil { | ||
| return nil, err | ||
| return nil, reconcileErrorHandler{err: err} | ||
| } | ||
|
|
||
| nodePoolName := cmKc.Labels[HyperShiftNodePoolLabel] | ||
| kcData := cmKc.Data[HyperShiftConfigMapConfigKey] | ||
| mcoKc, err := kubeletconfig.DecodeFromData([]byte(kcData), r.Scheme) | ||
| if err != nil { | ||
| return nil, err | ||
| return nil, reconcileErrorHandler{err: err} | ||
| } | ||
|
|
||
| // nothing we care about, and we can't do much anyway | ||
| if mcoKc.Spec.KubeletConfig == nil { | ||
| klog.InfoS("detected KubeletConfig with empty payload, ignoring", "name", kcKey.Name) | ||
| return nil, &InvalidKubeletConfig{ObjectName: kcKey.Name} | ||
| return nil, reconcileErrorHandler{err: &InvalidKubeletConfig{ObjectName: kcKey.Name}} | ||
| } | ||
| return &kubeletConfigHandler{ | ||
| ownerObject: cmKc, | ||
|
|
@@ -312,9 +394,9 @@ func (r *KubeletConfigReconciler) makeKCHandlerForPlatform(ctx context.Context, | |
| setCtrlRef: func(owner, controlled metav1.Object, scheme *runtime.Scheme, opts ...controllerutil.OwnerReferenceOption) error { | ||
| return nil | ||
| }, | ||
| }, nil | ||
| }, reconcileErrorHandler{} | ||
| } | ||
| return nil, fmt.Errorf("unsupported platform: %s", r.Platform) | ||
| return nil, reconcileErrorHandler{err: fmt.Errorf("unsupported platform: %s", r.Platform)} | ||
| } | ||
|
|
||
| func (r *KubeletConfigReconciler) deleteConfigMap(ctx context.Context, instance *nropv1.NUMAResourcesOperator, kcKey client.ObjectKey) (*corev1.ConfigMap, bool, error) { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this is not a handler. a Handler manages something. This seems more a
reconcileResultThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can we reuse a
reconcile.Step, extend it, or even create a variant of?