From 57bcf468bda7ceab71ff3fc2de10bf995a07c58e Mon Sep 17 00:00:00 2001 From: dkeven Date: Thu, 21 May 2026 13:42:53 +0800 Subject: [PATCH] refactor(scheduler): get rid of app-level GPU management --- .../crds/gpu.bytetrade.io_gpubindings.yaml | 4 + cmd/scheduler/main.go | 9 +- pkg/api/gpu/v1alpha1/gpubinding_types.go | 2 + pkg/scheduler/routes/gpu_manage.go | 867 ------------------ pkg/scheduler/scheduler.go | 400 +------- pkg/util/types.go | 3 +- 6 files changed, 56 insertions(+), 1229 deletions(-) delete mode 100644 pkg/scheduler/routes/gpu_manage.go diff --git a/charts/hami/crds/gpu.bytetrade.io_gpubindings.yaml b/charts/hami/crds/gpu.bytetrade.io_gpubindings.yaml index 41f9b12e6..223aca385 100644 --- a/charts/hami/crds/gpu.bytetrade.io_gpubindings.yaml +++ b/charts/hami/crds/gpu.bytetrade.io_gpubindings.yaml @@ -45,6 +45,10 @@ spec: - type: string pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ x-kubernetes-int-or-string: true + namespace: + type: string + owner: + type: string podSelector: description: |- A label selector is a label query over a set of resources. The result of matchLabels and diff --git a/cmd/scheduler/main.go b/cmd/scheduler/main.go index 1af9f4c5d..66585fedf 100644 --- a/cmd/scheduler/main.go +++ b/cmd/scheduler/main.go @@ -75,7 +75,7 @@ func init() { rootCmd.Flags().IntVar(&config.Timeout, "kube-timeout", client.DefaultTimeout, "Timeout to use while talking with kube-apiserver.") rootCmd.Flags().BoolVar(&enableProfiling, "profiling", false, "Enable pprof profiling via HTTP server") rootCmd.Flags().DurationVar(&config.NodeLockTimeout, "node-lock-timeout", time.Minute*5, "timeout for node locks") - rootCmd.Flags().DurationVar(&config.CleanupStartupDelay, "cleanup-startup-delay", 90*time.Second, "delay before starting cleanup loops (CleanupGPUBindingsLoop/CleanupPodsWithMissingDevicesLoop)") + rootCmd.Flags().DurationVar(&config.CleanupStartupDelay, "cleanup-startup-delay", 90*time.Second, "delay before starting cleanup loops (CleanupPodsWithMissingDevicesLoop)") rootCmd.Flags().BoolVar(&config.ForceOverwriteDefaultScheduler, "force-overwrite-default-scheduler", true, "Overwrite schedulerName in Pod Spec when set to the const DefaultSchedulerName in https://k8s.io/api/core/v1 package") rootCmd.PersistentFlags().AddGoFlagSet(device.GlobalFlagSet()) @@ -121,7 +121,6 @@ func start() error { // start monitor metrics go sher.RegisterFromNodeAnnotations() - go sher.CleanupGPUBindingsLoop() go sher.CleanupPodsWithMissingDevicesLoop() go initMetrics(config.MetricsBindAddress) @@ -132,12 +131,6 @@ func start() error { router.POST("/webhook", routes.WebHookRoute()) router.GET("/healthz", routes.HealthzRoute()) - router.GET("/gpus", routes.ListGPUDetails(sher)) - router.PUT("/gpus/assignments/bulk", routes.BulkManageAssignments(sher)) - router.POST("/gpus/:id/mode", routes.SwitchGPUMode(sher)) - router.POST("/gpus/:id/assign", routes.AssignGPUToApp(sher)) - router.POST("/gpus/:id/unassign", routes.UnassignGPUFromApp(sher)) - klog.Info("listen on ", config.HTTPBind) if enableProfiling { diff --git a/pkg/api/gpu/v1alpha1/gpubinding_types.go b/pkg/api/gpu/v1alpha1/gpubinding_types.go index 46379e952..bda53646d 100644 --- a/pkg/api/gpu/v1alpha1/gpubinding_types.go +++ b/pkg/api/gpu/v1alpha1/gpubinding_types.go @@ -24,6 +24,8 @@ import ( type GPUBindingSpec struct { UUID string `json:"uuid"` AppName string `json:"appName"` + Owner string `json:"owner,omitempty"` + Namespace string `json:"namespace,omitempty"` PodSelector *metav1.LabelSelector `json:"podSelector,omitempty"` Memory *resource.Quantity `json:"memory,omitempty"` } diff --git a/pkg/scheduler/routes/gpu_manage.go b/pkg/scheduler/routes/gpu_manage.go deleted file mode 100644 index e5fa2ceea..000000000 --- a/pkg/scheduler/routes/gpu_manage.go +++ /dev/null @@ -1,867 +0,0 @@ -package routes - -import ( - "encoding/json" - "fmt" - "net/http" - "slices" - "sort" - "strings" - "time" - - "github.com/Project-HAMi/HAMi/pkg/util/client" - ctrlclient "sigs.k8s.io/controller-runtime/pkg/client" - - "github.com/julienschmidt/httprouter" - corev1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/resource" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/klog/v2" - - "github.com/Project-HAMi/HAMi/pkg/api/gpu/v1alpha1" - "github.com/Project-HAMi/HAMi/pkg/scheduler" - "github.com/Project-HAMi/HAMi/pkg/util" -) - -type GPUInfo struct { - NodeName string `json:"nodeName"` - util.DeviceInfo -} - -type GPUAppInfo struct { - AppName string `json:"appName"` - Memory *int64 `json:"memory,omitempty"` -} - -type GPUDetail struct { - GPUInfo - AllowedShareModes []string `json:"allowedShareModes,omitempty"` - Apps []GPUAppInfo `json:"apps"` - MemoryAllocated *int64 `json:"memoryAllocated,omitempty"` - MemoryAvailable *int64 `json:"memoryAvailable,omitempty"` -} - -type AssignGPURequest struct { - AppName string `json:"appName"` - Memory *resource.Quantity `json:"memory,omitempty"` -} - -type SwitchModeRequest struct { - Mode string `json:"mode"` -} - -type UnassignGPURequest struct { - AppName string `json:"appName"` -} - -type SwitchAssignItem struct { - ID string `json:"id"` - Memory *resource.Quantity `json:"memory,omitempty"` -} - -type SwitchAssignRequest struct { - AppName string `json:"appName"` - Unassign []SwitchAssignItem `json:"unassign,omitempty"` - Assign []SwitchAssignItem `json:"assign,omitempty"` -} - -func ListGPUInfos(s *scheduler.Scheduler) httprouter.Handle { - return func(w http.ResponseWriter, r *http.Request, _ httprouter.Params) { - klog.Infoln("Listing all GPUs") - util.GPUManageLock.Lock() - defer util.GPUManageLock.Unlock() - - nodes, err := s.ListNodes() - if err != nil { - klog.Errorln(err) - http.Error(w, fmt.Sprintf("failed to list nodes: %v", err), http.StatusInternalServerError) - return - } - - var gpus []GPUInfo - for _, node := range nodes { - for _, device := range node.Devices { - gpu := GPUInfo{ - NodeName: node.Node.Name, - DeviceInfo: device, - } - gpus = append(gpus, gpu) - } - } - - w.Header().Set("Content-Type", "application/json") - err = json.NewEncoder(w).Encode(gpus) - if err != nil { - klog.Errorf("failed to encode response: %v", err) - } - } -} - -func ListGPUDetails(s *scheduler.Scheduler) httprouter.Handle { - return func(w http.ResponseWriter, r *http.Request, ps httprouter.Params) { - util.GPUManageLock.Lock() - defer util.GPUManageLock.Unlock() - - nodes, err := s.ListNodes() - if err != nil { - klog.Errorln(err) - http.Error(w, fmt.Sprintf("failed to list nodes: %v", err), http.StatusInternalServerError) - return - } - - uuidToGPUDetails := make(map[string]*GPUDetail) - - for _, node := range nodes { - for _, device := range node.Devices { - allowedShareModes := util.DefaultAllowedShareModes - config, ok := util.GetCompatibleConfigsByDeviceName(device.Type) - if ok && len(config.AllowedShareModes) > 0 { - allowedShareModes = config.AllowedShareModes - } - uuidToGPUDetails[device.ID] = &GPUDetail{ - GPUInfo: GPUInfo{ - NodeName: node.Node.Name, - DeviceInfo: device, - }, - AllowedShareModes: allowedShareModes, - } - } - } - - bindings, err := s.ListGPUBindings() - if err != nil { - klog.Errorln(err) - http.Error(w, err.Error(), http.StatusInternalServerError) - return - } - - for _, binding := range bindings { - gpuDetail := uuidToGPUDetails[binding.Spec.UUID] - if gpuDetail == nil { - continue - } - appInfo := GPUAppInfo{ - AppName: binding.Spec.AppName, - } - if binding.Spec.Memory != nil { - mem := binding.Spec.Memory.Value() - appInfo.Memory = &mem - } - gpuDetail.Apps = append(gpuDetail.Apps, appInfo) - } - - for _, gpuDetail := range uuidToGPUDetails { - if gpuDetail.ShareMode == util.ShareModeMemSlicing { - var allocated, available int64 - for _, app := range gpuDetail.Apps { - // normally this check should always equal to true - if app.Memory != nil { - allocated += *app.Memory - } - } - available = int64(gpuDetail.Devmem) - allocated - // this should never happen - if available < 0 { - klog.Errorf("error state: GPU %s's allocated memory %d execeeds its total memory %d", gpuDetail.ID, allocated, gpuDetail.Devmem) - available = 0 - } - gpuDetail.MemoryAllocated = &allocated - gpuDetail.MemoryAvailable = &available - } - } - - gpuDetails := make([]GPUDetail, 0) - for _, gpuDetail := range uuidToGPUDetails { - gpuDetails = append(gpuDetails, *gpuDetail) - } - - sort.SliceStable(gpuDetails, func(i, j int) bool { - return gpuDetails[i].NodeName < gpuDetails[j].NodeName - }) - sort.SliceStable(gpuDetails, func(i, j int) bool { - return gpuDetails[i].ID < gpuDetails[j].ID - }) - - w.Header().Set("Content-Type", "application/json") - err = json.NewEncoder(w).Encode(gpuDetails) - if err != nil { - klog.Errorf("failed to encode response: %v", err) - } - } -} - -func AssignGPUToApp(s *scheduler.Scheduler) httprouter.Handle { - return func(w http.ResponseWriter, r *http.Request, ps httprouter.Params) { - var req AssignGPURequest - if err := json.NewDecoder(r.Body).Decode(&req); err != nil { - klog.Errorln(err) - http.Error(w, fmt.Sprintf("failed to decode request: %v", err), http.StatusBadRequest) - return - } - uuid := ps.ByName("id") - - if uuid == "" || req.AppName == "" { - http.Error(w, "UUID and AppName are required", http.StatusBadRequest) - return - } - - klog.Infof("Assigning GPU %s to app %s", uuid, req.AppName) - util.GPUManageLock.Lock() - defer util.GPUManageLock.Unlock() - - nodes, err := s.ListNodes() - if err != nil { - klog.Errorln(err) - http.Error(w, fmt.Sprintf("failed to list nodes: %v", err), http.StatusInternalServerError) - return - } - - var targetDevice *util.DeviceInfo - var targetNodeName string - for _, node := range nodes { - for _, device := range node.Devices { - if device.ID == uuid { - targetDevice = &device - targetNodeName = node.Node.Name - break - } - } - if targetDevice != nil { - break - } - } - - if targetDevice == nil { - http.Error(w, fmt.Sprintf("GPU %s not found", uuid), http.StatusNotFound) - return - } - - bindings, err := s.ListGPUBindings() - if err != nil { - klog.Errorln(err) - http.Error(w, err.Error(), http.StatusInternalServerError) - return - } - - var existingBinding *v1alpha1.GPUBinding - - // validate node consistency for multi-binding: an app cannot bind GPUs across different nodes - uuidToNodeName := make(map[string]string) - for _, node := range nodes { - for _, device := range node.Devices { - uuidToNodeName[device.ID] = node.Node.Name - } - } - for _, binding := range bindings { - if binding.Spec.AppName != req.AppName { - continue - } - if binding.Spec.UUID == uuid { - existingBinding = binding - } - existingNode := uuidToNodeName[binding.Spec.UUID] - if existingNode != "" && existingNode != targetNodeName { - err = fmt.Errorf("app %s already has GPUBinding on node %s, requested GPU is on node %s; cross-node multi-binding is not allowed", req.AppName, existingNode, targetNodeName) - klog.Warningln(err) - http.Error(w, err.Error(), http.StatusConflict) - return - } - } - - if existingBinding != nil { - klog.Warningf("Attempting to assign app %s to already bound GPU %s in mode %s", req.AppName, uuid, targetDevice.ShareMode) - if targetDevice.ShareMode != util.ShareModeMemSlicing { - w.WriteHeader(http.StatusOK) - return - } - if req.Memory == nil || req.Memory.Value() == 0 || req.Memory.Value() == existingBinding.Spec.Memory.Value() { - w.WriteHeader(http.StatusOK) - return - } - } - - if targetDevice.ShareMode != util.ShareModeMemSlicing && req.Memory != nil { - klog.Warningf("Attempting to request memory %d when assigning app %s to GPU %s that's not in memory slicing mode, clearing ...", req.Memory.Value(), req.AppName, uuid) - req.Memory = nil - } - - // if card is in exclusive mode, force out any already assigned app - if targetDevice.ShareMode == util.ShareModeExclusive { - pods := s.ListPodsInfo() - for _, pod := range pods { - for _, pdev := range pod.Devices { - for _, cdevs := range pdev { - for _, cdev := range cdevs { - if cdev.UUID == uuid { - klog.Infof("Forcing out pod %s/%s of exclusive GPU %s in favor of %s", pod.Namespace, pod.Name, uuid, req.AppName) - err = s.DeletePodFromCluster(r.Context(), s.PodInfoToPodObj(pod)) - if err != nil { - http.Error(w, err.Error(), http.StatusInternalServerError) - return - } - } - } - } - } - } - for _, binding := range bindings { - if binding.Spec.UUID == uuid && binding.Spec.AppName != req.AppName { - if err := ctrlclient.IgnoreNotFound(util.DeleteGPUBinding(r.Context(), binding.Name)); err != nil { - err = fmt.Errorf("failed to delete existing GPUBinding %s: %v", binding.Name, err) - klog.Errorln(err) - http.Error(w, err.Error(), http.StatusInternalServerError) - return - } - } - } - } - - if targetDevice.ShareMode == util.ShareModeMemSlicing { - if req.Memory == nil || req.Memory.Value() == 0 { - err = fmt.Errorf("memory allocation is required for GPU %s in memory slicing mode, refuse assigning to app %s", uuid, req.AppName) - klog.Warningln(err) - http.Error(w, err.Error(), http.StatusBadRequest) - return - } - - totalUsedMem := int64(0) - for _, binding := range bindings { - - // don't count the memory already allocated to this app - if binding.Spec.AppName == req.AppName { - continue - } - if binding.Spec.UUID == uuid { - // normally this check should always equal to true - if binding.Spec.Memory != nil { - totalUsedMem += binding.Spec.Memory.Value() - } - } - } - - if totalUsedMem+req.Memory.Value() > int64(targetDevice.Devmem) { - err = fmt.Errorf("not enough memory available on GPU %s, available: %d, request: %d, refuse assigning to app %s", uuid, int64(targetDevice.Devmem)-totalUsedMem, req.Memory.Value(), req.AppName) - klog.Warningln(err) - http.Error(w, err.Error(), http.StatusConflict) - return - } - - if existingBinding != nil { - newBinding := existingBinding.DeepCopy() - newBinding.Spec.Memory = req.Memory - err = client.GPUClient.Patch(r.Context(), newBinding, ctrlclient.MergeFrom(existingBinding)) - if err != nil { - err = fmt.Errorf("failed to patch GPUBinding %s: %v", existingBinding.Name, err) - klog.Errorln(err) - http.Error(w, err.Error(), http.StatusInternalServerError) - return - } - } - } - - // delete existing pods for this app - err = s.DeletePodsBelongToApp(r.Context(), req.AppName) - if err != nil { - http.Error(w, err.Error(), http.StatusInternalServerError) - return - } - - if existingBinding != nil { - w.WriteHeader(http.StatusOK) - return - } - - newBinding := &v1alpha1.GPUBinding{ - ObjectMeta: metav1.ObjectMeta{ - Name: strings.ToLower(fmt.Sprintf("%s-%s-%d", req.AppName, uuid, time.Now().Unix())), - }, - Spec: v1alpha1.GPUBindingSpec{ - UUID: uuid, - AppName: req.AppName, - PodSelector: &metav1.LabelSelector{ - MatchLabels: map[string]string{ - util.AppNameLabelKey: req.AppName, - }, - }, - Memory: req.Memory, - }, - } - - if err := s.CreateGPUBinding(r.Context(), newBinding); err != nil { - http.Error(w, err.Error(), http.StatusInternalServerError) - return - } - - w.WriteHeader(http.StatusOK) - } -} - -func SwitchGPUMode(s *scheduler.Scheduler) httprouter.Handle { - return func(w http.ResponseWriter, r *http.Request, ps httprouter.Params) { - var req SwitchModeRequest - if err := json.NewDecoder(r.Body).Decode(&req); err != nil { - http.Error(w, fmt.Sprintf("failed to decode request: %v", err), http.StatusBadRequest) - return - } - uuid := ps.ByName("id") - - if uuid == "" || req.Mode == "" { - http.Error(w, "ID and Mode are required", http.StatusBadRequest) - return - } - - if req.Mode != util.ShareModeExclusive && req.Mode != util.ShareModeMemSlicing && req.Mode != util.ShareModeTimeSlicing { - http.Error(w, "invalid share mode", http.StatusBadRequest) - return - } - - klog.Infof("Switching GPU %s to mode %s", uuid, req.Mode) - util.GPUManageLock.Lock() - defer util.GPUManageLock.Unlock() - - nodes, err := s.ListNodes() - if err != nil { - klog.Errorln(err) - http.Error(w, fmt.Sprintf("failed to list nodes: %v", err), http.StatusInternalServerError) - return - } - - var targetNode *corev1.Node - for _, node := range nodes { - for _, device := range node.Devices { - if device.ID == uuid { - config, ok := util.GetCompatibleConfigsByDeviceName(device.Type) - if ok && len(config.AllowedShareModes) > 0 { - if !slices.Contains(config.AllowedShareModes, req.Mode) { - klog.Warningf("GPU %s does not support mode %s, refusing to switch", uuid, req.Mode) - http.Error(w, fmt.Sprintf("GPU %s does not support mode %s", uuid, req.Mode), http.StatusBadRequest) - return - } - } - targetNode = node.Node - break - } - } - if targetNode != nil { - break - } - } - - if targetNode == nil { - http.Error(w, fmt.Sprintf("GPU %s not found", uuid), http.StatusNotFound) - return - } - - // delete all pods bound to this GPU - pods := s.ListPodsInfo() - for _, pod := range pods { - for _, pdev := range pod.Devices { - for _, cdevs := range pdev { - for _, cdev := range cdevs { - if cdev.UUID == uuid { - klog.Infof("Deleting pod %s/%s for mode switch of GPU %s", pod.Namespace, pod.Name, uuid) - err = s.DeletePodFromCluster(r.Context(), s.PodInfoToPodObj(pod)) - if err != nil { - http.Error(w, err.Error(), http.StatusInternalServerError) - return - } - } - } - } - } - } - - // delete all existing GPUBindings of this GPU - bindings, err := s.ListGPUBindings() - if err != nil { - klog.Error(err) - http.Error(w, err.Error(), http.StatusInternalServerError) - return - } - for _, binding := range bindings { - if binding.Spec.UUID == uuid { - if err := ctrlclient.IgnoreNotFound(util.DeleteGPUBinding(r.Context(), binding.Name)); err != nil { - err = fmt.Errorf("failed to delete existing GPUBinding %s: %v", binding.Name, err) - klog.Errorln(err) - http.Error(w, err.Error(), http.StatusInternalServerError) - return - } - } - } - - patchAnnotations := make(map[string]string) - patchAnnotations[fmt.Sprintf(util.ShareModeAnnotationTpl, uuid)] = req.Mode - if err := util.PatchNodeAnnotations(targetNode, patchAnnotations); err != nil { - err = fmt.Errorf("failed to patch node annotations: %v", err) - klog.Errorln(err) - http.Error(w, err.Error(), http.StatusInternalServerError) - return - } - - // update scheduler's in-memory knowledge of the device - // because the update operation in the scheduler's watch loop - // triggered by node update event has a significant delay - if err := s.UpdateDeviceShareMode(uuid, req.Mode); err != nil { - err = fmt.Errorf("failed to update device share mode: %v", err) - klog.Errorln(err) - http.Error(w, err.Error(), http.StatusInternalServerError) - return - } - - w.WriteHeader(http.StatusOK) - } -} - -func UnassignGPUFromApp(s *scheduler.Scheduler) httprouter.Handle { - return func(w http.ResponseWriter, r *http.Request, ps httprouter.Params) { - var req UnassignGPURequest - if err := json.NewDecoder(r.Body).Decode(&req); err != nil { - http.Error(w, fmt.Sprintf("failed to decode request: %v", err), http.StatusBadRequest) - return - } - uuid := ps.ByName("id") - - if uuid == "" || req.AppName == "" { - http.Error(w, "UUID and AppName are required", http.StatusBadRequest) - return - } - - klog.Infof("Unassigning GPU %s from app %s", uuid, req.AppName) - util.GPUManageLock.Lock() - defer util.GPUManageLock.Unlock() - - bindings, err := s.ListGPUBindings() - if err != nil { - klog.Errorln(err) - http.Error(w, err.Error(), http.StatusInternalServerError) - return - } - - toDelete := make([]string, 0) - for _, binding := range bindings { - if binding.Spec.UUID == uuid && binding.Spec.AppName == req.AppName { - toDelete = append(toDelete, binding.Name) - } - } - - if len(toDelete) == 0 { - w.WriteHeader(http.StatusOK) - return - } - - if err := ctrlclient.IgnoreNotFound(s.DeletePodsBelongToApp(r.Context(), req.AppName)); err != nil { - klog.Errorln(fmt.Errorf("failed to delete pods of app %s: %v", req.AppName, err)) - http.Error(w, err.Error(), http.StatusInternalServerError) - return - } - - for _, name := range toDelete { - if err := ctrlclient.IgnoreNotFound(util.DeleteGPUBinding(r.Context(), name)); err != nil { - klog.Errorln(fmt.Errorf("failed to delete GPUBinding %s: %v", name, err)) - http.Error(w, err.Error(), http.StatusInternalServerError) - return - } - } - - w.WriteHeader(http.StatusOK) - } -} - -// SwitchAssign performs an atomic switch of GPU assignments for an app: -// - Unassigns specified GPU IDs if currently bound to the app (ignores non-existent bindings) -// - Assigns specified GPU IDs (with optional memory for mem-slicing mode) to the app -// - Enforces single-node binding policy across the app's final bindings -// - For exclusive GPUs, evicts existing app bindings and restarts their pods -// - Restarts the target app's pods only if its binding relationship changes -func BulkManageAssignments(s *scheduler.Scheduler) httprouter.Handle { - return func(w http.ResponseWriter, r *http.Request, _ httprouter.Params) { - var req SwitchAssignRequest - if err := json.NewDecoder(r.Body).Decode(&req); err != nil { - http.Error(w, fmt.Sprintf("failed to decode request: %v", err), http.StatusBadRequest) - return - } - if req.AppName == "" { - http.Error(w, "AppName is required", http.StatusBadRequest) - return - } - - klog.Infof("SwitchAssign request for app %s: unassign=%v assign=%v", req.AppName, req.Unassign, req.Assign) - util.GPUManageLock.Lock() - defer util.GPUManageLock.Unlock() - - nodes, err := s.ListNodes() - if err != nil { - klog.Errorln(err) - http.Error(w, fmt.Sprintf("failed to list nodes: %v", err), http.StatusInternalServerError) - return - } - - // Build device maps - uuidToDevice := make(map[string]util.DeviceInfo) - uuidToNodeName := make(map[string]string) - for _, node := range nodes { - for _, device := range node.Devices { - uuidToDevice[device.ID] = device - uuidToNodeName[device.ID] = node.Node.Name - } - } - - bindings, err := s.ListGPUBindings() - if err != nil { - klog.Errorln(err) - http.Error(w, err.Error(), http.StatusInternalServerError) - return - } - - // Group existing bindings - currentBindingsByUUID := make(map[string]*v1alpha1.GPUBinding) - bindingsByUUID := make(map[string][]*v1alpha1.GPUBinding) - for _, b := range bindings { - bindingsByUUID[b.Spec.UUID] = append(bindingsByUUID[b.Spec.UUID], b) - if b.Spec.AppName == req.AppName { - currentBindingsByUUID[b.Spec.UUID] = b - } - } - - // Build a quick lookup for GPUs that will be assigned, so unassign won't remove them - assignIDs := make(map[string]struct{}) - for _, it := range req.Assign { - if it.ID != "" { - assignIDs[it.ID] = struct{}{} - } - } - - // Plan unassignments (skip any UUID that is also in the assign list) - toUnassignNames := make([]string, 0) - unassignSet := make(map[string]struct{}) - for _, it := range req.Unassign { - if it.ID == "" { - continue - } - if _, willAssign := assignIDs[it.ID]; willAssign { - continue - } - if b := currentBindingsByUUID[it.ID]; b != nil { - toUnassignNames = append(toUnassignNames, b.Name) - unassignSet[it.ID] = struct{}{} - } - } - - // Plan assignments (patches for mem changes, creates for new bindings, evictions for exclusive) - type patchItem struct { - old *v1alpha1.GPUBinding - mem *resource.Quantity - } - patches := make([]patchItem, 0) - type createItem struct { - binding *v1alpha1.GPUBinding - } - creates := make([]createItem, 0) - evictUUIDs := make(map[string]struct{}) - deleteOtherBindingNames := make([]string, 0) - - seenAssign := make(map[string]struct{}) - for _, it := range req.Assign { - if it.ID == "" { - continue - } - if _, duplicated := seenAssign[it.ID]; duplicated { - continue - } - seenAssign[it.ID] = struct{}{} - - dev, ok := uuidToDevice[it.ID] - if !ok { - http.Error(w, fmt.Sprintf("GPU %s not found", it.ID), http.StatusNotFound) - return - } - - if existing := currentBindingsByUUID[it.ID]; existing != nil { - // Already bound to this app - if dev.ShareMode != util.ShareModeMemSlicing { - // In exclusive/time-slicing, reassign to same GPU is a no-op - continue - } - // mem-slicing: treat missing/zero or unchanged memory as no-op - if it.Memory == nil || it.Memory.Value() == 0 || - (existing.Spec.Memory != nil && it.Memory.Value() == existing.Spec.Memory.Value()) { - continue - } - // validate memory availability excluding this app's current allocation - totalUsed := int64(0) - for _, b := range bindingsByUUID[it.ID] { - if b.Spec.AppName == req.AppName { - continue - } - if b.Spec.Memory != nil { - totalUsed += b.Spec.Memory.Value() - } - } - if totalUsed+it.Memory.Value() > int64(dev.Devmem) { - err = fmt.Errorf("not enough memory on GPU %s, available: %d, request: %d for app %s", - it.ID, int64(dev.Devmem)-totalUsed, it.Memory.Value(), req.AppName) - klog.Warningln(err) - http.Error(w, err.Error(), http.StatusConflict) - return - } - patches = append(patches, patchItem{old: existing, mem: it.Memory}) - continue - } - - // Not currently bound to this app - if dev.ShareMode == util.ShareModeMemSlicing { - if it.Memory == nil || it.Memory.Value() == 0 { - err = fmt.Errorf("memory allocation is required for GPU %s in memory slicing mode, refuse assigning to app %s", it.ID, req.AppName) - klog.Warningln(err) - http.Error(w, err.Error(), http.StatusBadRequest) - return - } - totalUsed := int64(0) - for _, b := range bindingsByUUID[it.ID] { - if b.Spec.Memory != nil { - totalUsed += b.Spec.Memory.Value() - } - } - if totalUsed+it.Memory.Value() > int64(dev.Devmem) { - err = fmt.Errorf("not enough memory available on GPU %s, available: %d, request: %d, refuse assigning to app %s", - it.ID, int64(dev.Devmem)-totalUsed, it.Memory.Value(), req.AppName) - klog.Warningln(err) - http.Error(w, err.Error(), http.StatusConflict) - return - } - } else if dev.ShareMode == util.ShareModeExclusive { - // Plan eviction of other app(s) holding this GPU - for _, b := range bindingsByUUID[it.ID] { - if b.Spec.AppName != req.AppName { - deleteOtherBindingNames = append(deleteOtherBindingNames, b.Name) - evictUUIDs[it.ID] = struct{}{} - } - } - } - - // Prepare new binding - mem := it.Memory - if dev.ShareMode != util.ShareModeMemSlicing { - mem = nil - } - newBinding := &v1alpha1.GPUBinding{ - ObjectMeta: metav1.ObjectMeta{ - Name: strings.ToLower(fmt.Sprintf("%s-%s-%d", req.AppName, it.ID, time.Now().Unix())), - }, - Spec: v1alpha1.GPUBindingSpec{ - UUID: it.ID, - AppName: req.AppName, - PodSelector: &metav1.LabelSelector{ - MatchLabels: map[string]string{ - util.AppNameLabelKey: req.AppName, - }, - }, - Memory: mem, - }, - } - creates = append(creates, createItem{binding: newBinding}) - } - - // Determine final set of UUIDs for the app after changes (for node policy check) - finalUUIDSet := make(map[string]struct{}) - for uuid := range currentBindingsByUUID { - if _, toUn := unassignSet[uuid]; !toUn { - finalUUIDSet[uuid] = struct{}{} - } - } - for _, c := range creates { - finalUUIDSet[c.binding.Spec.UUID] = struct{}{} - } - - // If no effective change to app's own bindings, return OK without restarting its pods - if len(toUnassignNames) == 0 && len(patches) == 0 && len(creates) == 0 { - w.WriteHeader(http.StatusOK) - return - } - - // Enforce single-node binding policy - nodeSet := make(map[string]struct{}) - for uuid := range finalUUIDSet { - nodeName := uuidToNodeName[uuid] - if nodeName == "" { - http.Error(w, fmt.Sprintf("GPU %s not found", uuid), http.StatusNotFound) - return - } - nodeSet[nodeName] = struct{}{} - } - if len(nodeSet) > 1 { - err = fmt.Errorf("app %s binding spans multiple nodes which is not allowed", req.AppName) - klog.Warningln(err) - http.Error(w, err.Error(), http.StatusConflict) - return - } - - // Execute plan - // 1) Evict other apps for exclusive GPUs - if len(evictUUIDs) > 0 { - pods := s.ListPodsInfo() - for _, pod := range pods { - for _, pdev := range pod.Devices { - for _, cdevs := range pdev { - for _, cdev := range cdevs { - if _, needEvict := evictUUIDs[cdev.UUID]; needEvict { - klog.Infof("Evicting pod %s/%s occupying exclusive GPU %s", pod.Namespace, pod.Name, cdev.UUID) - if err := s.DeletePodFromCluster(r.Context(), s.PodInfoToPodObj(pod)); err != nil { - http.Error(w, err.Error(), http.StatusInternalServerError) - return - } - } - } - } - } - } - for _, name := range deleteOtherBindingNames { - if err := ctrlclient.IgnoreNotFound(util.DeleteGPUBinding(r.Context(), name)); err != nil { - err = fmt.Errorf("failed to delete existing GPUBinding %s: %v", name, err) - klog.Errorln(err) - http.Error(w, err.Error(), http.StatusInternalServerError) - return - } - } - } - - // 2) Restart this app's pods due to binding changes - if err := s.DeletePodsBelongToApp(r.Context(), req.AppName); err != nil { - http.Error(w, err.Error(), http.StatusInternalServerError) - return - } - - // 3) Apply unassignments - for _, name := range toUnassignNames { - if err := ctrlclient.IgnoreNotFound(util.DeleteGPUBinding(r.Context(), name)); err != nil { - err = fmt.Errorf("failed to delete GPUBinding %s: %v", name, err) - klog.Errorln(err) - http.Error(w, err.Error(), http.StatusInternalServerError) - return - } - } - - // 4) Apply memory patches - for _, p := range patches { - newBinding := p.old.DeepCopy() - newBinding.Spec.Memory = p.mem - if err := client.GPUClient.Patch(r.Context(), newBinding, ctrlclient.MergeFrom(p.old)); err != nil { - err = fmt.Errorf("failed to patch GPUBinding %s: %v", p.old.Name, err) - klog.Errorln(err) - http.Error(w, err.Error(), http.StatusInternalServerError) - return - } - } - - // 5) Create new bindings - for _, c := range creates { - if err := s.CreateGPUBinding(r.Context(), c.binding); err != nil { - klog.Errorln(err) - http.Error(w, err.Error(), http.StatusInternalServerError) - return - } - } - - w.WriteHeader(http.StatusOK) - } -} diff --git a/pkg/scheduler/scheduler.go b/pkg/scheduler/scheduler.go index 1ea5dc7f5..03378198e 100644 --- a/pkg/scheduler/scheduler.go +++ b/pkg/scheduler/scheduler.go @@ -29,7 +29,6 @@ import ( corev1 "k8s.io/api/core/v1" kerrors "k8s.io/apimachinery/pkg/api/errors" - "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" "k8s.io/client-go/informers" @@ -319,133 +318,6 @@ func (s *Scheduler) RegisterFromNodeAnnotations() { } } -func (s *Scheduler) CleanupGPUBindingsLoop() { - klog.InfoS("CleanupGPUBindingsLoop: delaying start", "delay", config.CleanupStartupDelay) - timer := time.NewTimer(config.CleanupStartupDelay) - defer timer.Stop() - select { - case <-timer.C: - case <-s.stopCh: - return - } - - klog.InfoS("Starting CleanupGPUBindingsLoop") - defer klog.InfoS("Exiting CleanupGPUBindingsLoop") - ticker := time.NewTicker(15 * time.Second) - defer ticker.Stop() - - for { - select { - case <-ticker.C: - util.GPUManageLock.Lock() - func() { - defer util.GPUManageLock.Unlock() - nodes, err := s.ListNodes() - if err != nil { - klog.ErrorS(err, "CleanupGPUBindingsLoop: failed to list nodes") - return - } - validUUIDs := make(map[string]struct{}) - for _, n := range nodes { - for _, d := range n.Devices { - validUUIDs[d.ID] = struct{}{} - } - } - bindings, err := s.ListGPUBindings() - if err != nil { - klog.ErrorS(err, "CleanupGPUBindingsLoop: failed to list GPUBindings") - return - } - toDelete := make([]string, 0) - - existingApps := make(map[string]struct{}) - appDiscoveryOK := true - if deps, err := s.kubeClient.AppsV1().Deployments(metav1.NamespaceAll).List(context.Background(), - metav1.ListOptions{LabelSelector: util.AppNameLabelKey}); err != nil { - klog.ErrorS(err, "CleanupGPUBindingsLoop: failed to list Deployments for existing apps") - appDiscoveryOK = false - } else { - for i := range deps.Items { - if app := deps.Items[i].Labels[util.AppNameLabelKey]; app != "" { - existingApps[app] = struct{}{} - } - } - } - if appDiscoveryOK { - if ssets, err := s.kubeClient.AppsV1().StatefulSets(metav1.NamespaceAll).List(context.Background(), - metav1.ListOptions{LabelSelector: util.AppNameLabelKey}); err != nil { - klog.ErrorS(err, "CleanupGPUBindingsLoop: failed to list StatefulSets for existing apps") - appDiscoveryOK = false - } else { - for i := range ssets.Items { - if app := ssets.Items[i].Labels[util.AppNameLabelKey]; app != "" { - existingApps[app] = struct{}{} - } - } - } - } - if appDiscoveryOK { - if pods, err := s.kubeClient.CoreV1().Pods(metav1.NamespaceAll).List(context.Background(), - metav1.ListOptions{LabelSelector: util.AppNameLabelKey}); err != nil { - klog.ErrorS(err, "CleanupGPUBindingsLoop: failed to list Pods for existing apps") - appDiscoveryOK = false - } else { - for i := range pods.Items { - if app := pods.Items[i].Labels[util.AppNameLabelKey]; app != "" { - existingApps[app] = struct{}{} - } - } - } - } - - type key struct { - app string - uuid string - } - group := make(map[key][]*v1alpha1.GPUBinding) - for _, b := range bindings { - if _, ok := validUUIDs[b.Spec.UUID]; !ok { - toDelete = append(toDelete, b.Name) - continue - } - if appDiscoveryOK { - if _, ok := existingApps[b.Spec.AppName]; !ok { - toDelete = append(toDelete, b.Name) - continue - } - } - k := key{app: b.Spec.AppName, uuid: b.Spec.UUID} - group[k] = append(group[k], b) - } - for _, list := range group { - if len(list) <= 1 { - continue - } - sort.SliceStable(list, func(i, j int) bool { - return list[i].CreationTimestamp.Time.Before(list[j].CreationTimestamp.Time) - }) - for idx := 0; idx < len(list)-1; idx++ { - toDelete = append(toDelete, list[idx].Name) - } - } - if len(toDelete) == 0 { - return - } - klog.InfoS("CleanupGPUBindingsLoop: deleting stale/duplicate GPUBindings", "count", len(toDelete)) - ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) - defer cancel() - for _, name := range toDelete { - if err := ctrlclient.IgnoreNotFound(util.DeleteGPUBinding(ctx, name)); err != nil { - klog.ErrorS(err, "CleanupGPUBindingsLoop: failed to delete GPUBinding", "name", name) - } - } - }() - case <-s.stopCh: - return - } - } -} - // CleanupPodsWithMissingDevicesLoop periodically cleans up pods that are assigned // devices which no longer exist in the cluster. func (s *Scheduler) CleanupPodsWithMissingDevicesLoop() { @@ -758,10 +630,48 @@ func normalizeGPUUUID(uuid string) string { return uuid } -func (s *Scheduler) collectConsumedGPUUUIDsByApp(appName string, currentPod *corev1.Pod) map[string]struct{} { +type appBindingIdentity struct { + appName string + owner string + namespace string +} + +func podBindingIdentity(pod *corev1.Pod) appBindingIdentity { + id := appBindingIdentity{} + if pod == nil { + return id + } + id.namespace = pod.Namespace + if pod.Labels != nil { + id.appName = pod.Labels[util.AppNameLabelKey] + id.owner = pod.Labels[util.AppOwnerLabelKey] + } + return id +} + +func bindingMatchesIdentity(binding *v1alpha1.GPUBinding, id appBindingIdentity) bool { + if binding == nil || binding.Spec.UUID == "" || binding.Spec.AppName != id.appName { + return false + } + if binding.Spec.Owner != "" && binding.Spec.Owner != id.owner { + return false + } + if binding.Spec.Namespace != "" && binding.Spec.Namespace != id.namespace { + return false + } + return true +} + +func (s *Scheduler) collectConsumedGPUUUIDsByApp(identity appBindingIdentity, currentPod *corev1.Pod) map[string]struct{} { consumed := make(map[string]struct{}) for _, p := range s.ListPodsInfo() { - if p.Labels == nil || p.Labels[util.AppNameLabelKey] != appName { + if p.Labels == nil || p.Labels[util.AppNameLabelKey] != identity.appName { + continue + } + if identity.owner != "" && p.Labels[util.AppOwnerLabelKey] != identity.owner { + continue + } + if identity.namespace != "" && p.Namespace != identity.namespace { continue } if currentPod != nil && p.Namespace == currentPod.Namespace && p.Name == currentPod.Name { @@ -781,120 +691,6 @@ func (s *Scheduler) collectConsumedGPUUUIDsByApp(appName string, currentPod *cor return consumed } -func (s *Scheduler) selectDynamicGPUCandidates( - nodes map[string]*util.NodeInfo, - eligibleNodes map[string]struct{}, - uuidToNode map[string]string, - appBoundUUIDs map[string]struct{}, - alreadySelected map[string]struct{}, - consumedByApp map[string]struct{}, - allBindings []*v1alpha1.GPUBinding, - requiredCount int, - requestSummary nvidiaRequestSummary, -) ([]string, error) { - if requiredCount <= 0 { - return nil, nil - } - // todo: needs more flexibility - // when we allow an app to be bound to multiple nodes - // already consumed GPUs by this app should not be considered as constraints - // and not consumed GPUs may be used with or without other candidates - // e.g. if an app has 3 GPUs, with 1 consumed, 2 not consumed, it can be bound to any of the 2 not consumed GPUs - // with another not bound GPU - // or totally other 2 GPUs - pinnedNode := "" - for uuid := range appBoundUUIDs { - if nodeName, ok := uuidToNode[uuid]; ok { - if _, eligible := eligibleNodes[nodeName]; !eligible { - continue - } - pinnedNode = nodeName - break - } - } - - bindingCount := make(map[string]int) - bindingAllocatedMemory := make(map[string]int64) - for _, b := range allBindings { - if b.Spec.UUID == "" { - continue - } - bindingCount[b.Spec.UUID]++ - if b.Spec.Memory != nil { - bindingAllocatedMemory[b.Spec.UUID] += b.Spec.Memory.Value() - } - } - - memSlicingCandidates := make([]string, 0) - exclusiveCandidates := make([]string, 0) - timeSlicingCandidates := make([]string, 0) - - for _, n := range nodes { - if _, eligible := eligibleNodes[n.Node.Name]; !eligible { - continue - } - if pinnedNode != "" && n.Node.Name != pinnedNode { - continue - } - for _, d := range n.Devices { - uuid := d.ID - if uuid == "" || !d.Health { - continue - } - if _, ok := appBoundUUIDs[uuid]; ok { - continue - } - if _, ok := alreadySelected[uuid]; ok { - continue - } - if _, ok := consumedByApp[uuid]; ok { - continue - } - - switch d.ShareMode { - case util.ShareModeMemSlicing: - if !requestSummary.hasMemory { - continue - } - requiredMemory := requiredNvidiaMemoryBytes(requestSummary, int64(d.Devmem)) - remaining := int64(d.Devmem) - bindingAllocatedMemory[uuid] - if remaining >= requiredMemory { - memSlicingCandidates = append(memSlicingCandidates, uuid) - } - case util.ShareModeExclusive: - if bindingCount[uuid] > 0 { - continue - } - if requestSummary.hasMemory { - requiredMemory := requiredNvidiaMemoryBytes(requestSummary, int64(d.Devmem)) - if int64(d.Devmem) < requiredMemory { - continue - } - } - exclusiveCandidates = append(exclusiveCandidates, uuid) - case util.ShareModeTimeSlicing: - if requestSummary.hasMemory { - requiredMemory := requiredNvidiaMemoryBytes(requestSummary, int64(d.Devmem)) - if int64(d.Devmem) < requiredMemory { - continue - } - } - timeSlicingCandidates = append(timeSlicingCandidates, uuid) - } - } - } - - result := make([]string, 0) - if requestSummary.hasMemory && len(memSlicingCandidates) > 0 { - result = append(result, memSlicingCandidates...) - } else if len(exclusiveCandidates) > 0 { - result = append(result, exclusiveCandidates...) - } else { - result = append(result, timeSlicingCandidates...) - } - return result, nil -} - func (s *Scheduler) Bind(args extenderv1.ExtenderBindingArgs) (*extenderv1.ExtenderBindingResult, error) { klog.InfoS("Attempting to bind pod to node", "pod", args.PodName, "namespace", args.PodNamespace, "node", args.Node) var res *extenderv1.ExtenderBindingResult @@ -982,10 +778,8 @@ func (s *Scheduler) Filter(args extenderv1.ExtenderArgs) (*extenderv1.ExtenderFi if annos == nil { annos = make(map[string]string) } - appName := "" - if args.Pod.Labels != nil { - appName = args.Pod.Labels[util.AppNameLabelKey] - } + identity := podBindingIdentity(args.Pod) + appName := identity.appName if appName == "" { err := fmt.Errorf("cannot schedule pod without %s label", util.AppNameLabelKey) s.recordScheduleFilterResultEvent(args.Pod, EventReasonFilteringFailed, "", err) @@ -1000,37 +794,17 @@ func (s *Scheduler) Filter(args extenderv1.ExtenderArgs) (*extenderv1.ExtenderFi }, nil } - eligibleNodes := make(map[string]struct{}) - if args.NodeNames != nil { - for _, nodeName := range *args.NodeNames { - eligibleNodes[nodeName] = struct{}{} - } - } - bindings, err := s.ListGPUBindings() if err != nil { klog.ErrorS(err, "Failed to list GPUBindings for Filter", "pod", klog.KObj(args.Pod)) s.recordScheduleFilterResultEvent(args.Pod, EventReasonFilteringFailed, "", err) return nil, err } - nodes, err := s.ListNodes() - if err != nil { - klog.ErrorS(err, "Failed to list nodes for Filter", "pod", klog.KObj(args.Pod)) - s.recordScheduleFilterResultEvent(args.Pod, EventReasonFilteringFailed, "", err) - return nil, err - } - uuidToNode := make(map[string]string) - for _, node := range nodes { - for _, dev := range node.Devices { - uuidToNode[dev.ID] = node.Node.Name - } - } appBoundByUUID := make(map[string]*v1alpha1.GPUBinding) - appBoundUUIDs := make(map[string]struct{}) matchedBindings := make([]*v1alpha1.GPUBinding, 0) for _, b := range bindings { - if b.Spec.AppName != appName || b.Spec.UUID == "" { + if !bindingMatchesIdentity(b, identity) { continue } matchedPod := b.MatchPod(args.Pod) @@ -1056,7 +830,6 @@ func (s *Scheduler) Filter(args extenderv1.ExtenderArgs) (*extenderv1.ExtenderFi // } // continue // } - appBoundUUIDs[b.Spec.UUID] = struct{}{} if _, ok := appBoundByUUID[b.Spec.UUID]; !ok { appBoundByUUID[b.Spec.UUID] = b } @@ -1075,7 +848,7 @@ func (s *Scheduler) Filter(args extenderv1.ExtenderArgs) (*extenderv1.ExtenderFi if args.Pod.Labels != nil { policyMode = args.Pod.Labels[nvidia.AppPodGPUConsumePolicyKey] } - consumedByApp := s.collectConsumedGPUUUIDsByApp(appName, args.Pod) + consumedByApp := s.collectConsumedGPUUUIDsByApp(identity, args.Pod) if policyMode == "" || policyMode == nvidia.AppPodGPUConsumePolicyAll { if len(matchedBindings) > 0 { for _, b := range matchedBindings { @@ -1122,27 +895,7 @@ func (s *Scheduler) Filter(args extenderv1.ExtenderArgs) (*extenderv1.ExtenderFi } if nvidiaSummary.requested > 0 && len(selectedUUIDs) < nvidiaSummary.requested { - dynamicCandidates, err := s.selectDynamicGPUCandidates( - nodes, - eligibleNodes, - uuidToNode, - appBoundUUIDs, - selectedUUIDSet, - consumedByApp, - bindings, - nvidiaSummary.requested-len(selectedUUIDs), - nvidiaSummary, - ) - if err != nil { - s.recordScheduleFilterResultEvent(args.Pod, EventReasonFilteringFailed, "", err) - return nil, err - } - for _, uuid := range dynamicCandidates { - appendSelectedUUID(uuid) - } - } - if nvidiaSummary.requested > 0 && len(selectedUUIDs) < nvidiaSummary.requested { - err := fmt.Errorf("insufficient GPU candidates for app %s, requested=%d, available=%d", appName, nvidiaSummary.requested, len(selectedUUIDs)) + err := fmt.Errorf("insufficient GPUBindings for app %s, requested=%d, bound=%d", appName, nvidiaSummary.requested, len(selectedUUIDs)) s.recordScheduleFilterResultEvent(args.Pod, EventReasonInsufficientGPU, "", err) return &extenderv1.ExtenderFilterResult{ FailedNodes: map[string]string{}, @@ -1183,16 +936,6 @@ func (s *Scheduler) Filter(args extenderv1.ExtenderArgs) (*extenderv1.ExtenderFi devlist, ok := m.Devices[nvidia.NvidiaGPUDevice] if ok && len(devlist) > 0 { - nodeInfo, err := s.GetNode(m.NodeID) - if err != nil { - s.recordScheduleFilterResultEvent(args.Pod, EventReasonFilteringFailed, "", err) - return nil, err - } - shareModeByUUID := make(map[string]string) - for _, d := range nodeInfo.Devices { - shareModeByUUID[d.ID] = d.ShareMode - } - allocatedUUIDs := make(map[string]struct{}) for _, cdev := range devlist { for _, dev := range cdev { @@ -1203,61 +946,12 @@ func (s *Scheduler) Filter(args extenderv1.ExtenderArgs) (*extenderv1.ExtenderFi allocatedUUIDs[uuid] = struct{}{} } } - bindingAllocatedMemory := make(map[string]int64) - for _, b := range bindings { - if b.Spec.UUID == "" || b.Spec.Memory == nil { - continue - } - bindingAllocatedMemory[b.Spec.UUID] += b.Spec.Memory.Value() - } - deviceTotalMemByUUID := make(map[string]int64) - for _, d := range nodeInfo.Devices { - deviceTotalMemByUUID[d.ID] = int64(d.Devmem) - } for uuid := range allocatedUUIDs { - if _, exists := appBoundByUUID[uuid]; exists { - continue - } - autoBinding := &v1alpha1.GPUBinding{ - ObjectMeta: metav1.ObjectMeta{ - Name: strings.ToLower(fmt.Sprintf("%s-%s-%d", appName, uuid, time.Now().Unix())), - }, - Spec: v1alpha1.GPUBindingSpec{ - UUID: uuid, - AppName: appName, - PodSelector: &metav1.LabelSelector{ - MatchLabels: map[string]string{ - util.AppNameLabelKey: appName, - }, - }, - }, - } - if shareModeByUUID[uuid] == util.ShareModeMemSlicing { - totalMem := deviceTotalMemByUUID[uuid] - requiredMem := requiredNvidiaMemoryBytes(nvidiaSummary, totalMem) - if requiredMem <= 0 { - err := fmt.Errorf("invalid mem-slicing GPU memory request for binding on %s: request=%d", uuid, requiredMem) - klog.ErrorS(err, "Failed to create GPUBinding automatically", "pod", args.Pod.Name, "uuid", uuid) - s.recordScheduleFilterResultEvent(args.Pod, EventReasonFilteringFailed, "", err) - return nil, err - } - if totalMem > 0 && bindingAllocatedMemory[uuid]+requiredMem > totalMem { - err := fmt.Errorf("insufficient mem-slicing GPU memory for binding on %s: allocated=%d, request=%d, total=%d", uuid, bindingAllocatedMemory[uuid], requiredMem, totalMem) - klog.ErrorS(err, "Failed to create GPUBinding automatically", "pod", args.Pod.Name, "uuid", uuid) - s.recordScheduleFilterResultEvent(args.Pod, EventReasonInsufficientGPU, "", err) - return nil, err - } - memQ := resource.NewQuantity(requiredMem, resource.BinarySI) - autoBinding.Spec.Memory = memQ - bindingAllocatedMemory[uuid] += requiredMem - } - err := s.CreateGPUBinding(context.Background(), autoBinding) - if err != nil { - klog.ErrorS(err, "Failed to create GPUBinding automatically", "pod", args.Pod.Name, "uuid", uuid) + if _, exists := appBoundByUUID[uuid]; !exists { + err := fmt.Errorf("allocated GPU %s for app %s has no matching GPUBinding", uuid, appName) s.recordScheduleFilterResultEvent(args.Pod, EventReasonFilteringFailed, "", err) return nil, err } - appBoundByUUID[uuid] = autoBinding } } diff --git a/pkg/util/types.go b/pkg/util/types.go index 56f2b749c..f4602f691 100644 --- a/pkg/util/types.go +++ b/pkg/util/types.go @@ -34,7 +34,8 @@ const ( ShareModeMemSlicing = "1" ShareModeTimeSlicing = "2" - AppNameLabelKey = "applications.app.bytetrade.io/name" + AppNameLabelKey = "applications.app.bytetrade.io/name" + AppOwnerLabelKey = "applications.app.bytetrade.io/owner" DeviceBindAllocating = "allocating" DeviceBindFailed = "failed"