From 6257b1640c9619067314a488f3ac10ab6010ee5f Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Tue, 26 May 2026 00:56:46 +0200 Subject: [PATCH 1/4] integration with OpenVINO Model Server --- README.md | 69 ++++--- main.go | 8 + pkg/distribution/builder/from_directory.go | 15 +- .../builder/from_directory_test.go | 39 ++++ pkg/distribution/huggingface/model.go | 38 ++-- pkg/distribution/huggingface/repository.go | 34 ++++ .../huggingface/repository_test.go | 50 +++++ pkg/envconfig/envconfig.go | 6 + pkg/inference/backends/ovms/ovms.go | 171 ++++++++++++++++++ pkg/inference/backends/ovms/ovms_test.go | 49 +++++ pkg/inference/backends/runner.go | 2 +- pkg/inference/scheduling/http_handler.go | 5 + pkg/inference/scheduling/runner.go | 13 +- 13 files changed, 449 insertions(+), 50 deletions(-) create mode 100644 pkg/inference/backends/ovms/ovms.go create mode 100644 pkg/inference/backends/ovms/ovms_test.go diff --git a/README.md b/README.md index 56b428503..46a7689bc 100644 --- a/README.md +++ b/README.md @@ -300,6 +300,34 @@ docker buildx build \ The vLLM wheels are sourced from the official vLLM GitHub Releases at `https://github.com/vllm-project/vllm/releases`, which provides prebuilt wheels for each release version. +### OVMS integration + +Docker Model Runner can also run an OVMS backend. + +- Default OVMS binary path: `./ovms/bin/ovms` +- Override binary path with: `OVMS_SERVER_PATH` + +When the runner starts, it will try to initialize OVMS as an available backend. 
If you are running from source and want to use a custom OVMS binary, set: + +```sh +OVMS_SERVER_PATH=/absolute/path/to/ovms ./model-runner +``` + +You can target OVMS explicitly through the backend-prefixed OpenAI-compatible routes: + +```sh +# List models exposed via OVMS backend routing +curl http://localhost:13434/engines/ovms/v1/models + +# Example chat/completions call through OVMS backend routing +curl http://localhost:13434/engines/ovms/v1/chat/completions -X POST -d '{ + "model": "hf.co/OpenVINO/Qwen3-0.6B-int4-ov", + "messages": [ + {"role": "user", "content": "Hello from OVMS"} + ] +}' +``` + ## API Examples The Model Runner exposes a REST API that can be accessed via TCP port. You can interact with it using curl commands. @@ -310,17 +338,17 @@ When running with `docker-run`, you can use regular HTTP requests: ```sh # List all available models -curl http://localhost:8080/models +curl http://localhost:13434/models # Create a new model -curl http://localhost:8080/models/create -X POST -d '{"from": "ai/smollm2"}' +curl http://localhost:13434/models/create -X POST -d '{"from": "hf.co/OpenVINO/Qwen3-0.6B-int4-ov"}' # Get information about a specific model -curl http://localhost:8080/models/ai/smollm2 +curl http://localhost:13434/models/hf.co/OpenVINO/Qwen3-0.6B-int4-ov # Chat with a model -curl http://localhost:8080/engines/llama.cpp/v1/chat/completions -X POST -d '{ - "model": "ai/smollm2", +curl http://localhost:13434/engines/ovms/v1/chat/completions -X POST -d '{ + "model": "hf.co/OpenVINO/Qwen3-0.6B-int4-ov", "messages": [ {"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "Hello, how are you?"} @@ -328,37 +356,8 @@ curl http://localhost:8080/engines/llama.cpp/v1/chat/completions -X POST -d '{ }' # Delete a model -curl http://localhost:8080/models/ai/smollm2 -X DELETE +curl http://localhost:13434/models/hf.co/OpenVINO/Qwen3-0.6B-int4-ov -X DELETE -# Get metrics -curl http://localhost:8080/metrics -``` - -The response 
will contain the model's reply: - -```json -{ - "id": "chat-12345", - "object": "chat.completion", - "created": 1682456789, - "model": "ai/smollm2", - "choices": [ - { - "index": 0, - "message": { - "role": "assistant", - "content": "I'm doing well, thank you for asking! How can I assist you today?" - }, - "finish_reason": "stop" - } - ], - "usage": { - "prompt_tokens": 24, - "completion_tokens": 16, - "total_tokens": 40 - } -} -``` ### Features diff --git a/main.go b/main.go index 8034d9553..1e9cfd2ff 100644 --- a/main.go +++ b/main.go @@ -19,6 +19,7 @@ import ( "github.com/docker/model-runner/pkg/inference" "github.com/docker/model-runner/pkg/inference/backends/diffusers" "github.com/docker/model-runner/pkg/inference/backends/llamacpp" + "github.com/docker/model-runner/pkg/inference/backends/ovms" "github.com/docker/model-runner/pkg/inference/backends/sglang" "github.com/docker/model-runner/pkg/inference/config" "github.com/docker/model-runner/pkg/inference/models" @@ -64,6 +65,7 @@ func main() { sglangServerPath := envconfig.SGLangServerPath() mlxServerPath := envconfig.MLXServerPath() diffusersServerPath := envconfig.DiffusersServerPath() + ovmsServerPath := envconfig.OVMSServerPath() vllmMetalServerPath := envconfig.VLLMMetalServerPath() // Create a proxy-aware HTTP transport @@ -92,6 +94,9 @@ func main() { if vllmMetalServerPath != "" { log.Info("VLLM_METAL_SERVER_PATH", "path", vllmMetalServerPath) } + if ovmsServerPath != "" { + log.Info("OVMS_SERVER_PATH", "path", ovmsServerPath) + } // Create llama.cpp configuration from environment variables llamaCppConfig, err := createLlamaCppConfigFromEnv() @@ -129,6 +134,9 @@ func main() { IncludeDiffusers: true, DiffusersPath: diffusersServerPath, }), + routing.BackendDef{Name: ovms.Name, Init: func(mm *models.Manager) (inference.Backend, error) { + return ovms.New(log, mm, log.With("component", ovms.Name), ovmsServerPath) + }}, routing.BackendDef{Name: sglang.Name, Init: func(mm *models.Manager) (inference.Backend, 
error) { return sglang.New(log, mm, log.With("component", sglang.Name), nil, sglangServerPath) }}, diff --git a/pkg/distribution/builder/from_directory.go b/pkg/distribution/builder/from_directory.go index fdef17114..1362f6fe4 100644 --- a/pkg/distribution/builder/from_directory.go +++ b/pkg/distribution/builder/from_directory.go @@ -32,6 +32,10 @@ type DirectoryOptions struct { // When set, it overrides the default behavior of using time.Now(). // This is useful for producing deterministic OCI digests. Created *time.Time + + // AllowNoWeightFiles allows packaging a directory even when it contains no + // GGUF/SafeTensors/DDUF weight files. + AllowNoWeightFiles bool } // DirectoryOption is a functional option for configuring FromDirectory. @@ -62,6 +66,15 @@ func WithCreatedTime(t time.Time) DirectoryOption { } } +// WithAllowNoWeightFiles allows FromDirectory to succeed even when no standard +// model weight files are present. This is used for formats such as OpenVINO IR +// where model files are represented differently (for example .xml + .bin pairs). +func WithAllowNoWeightFiles() DirectoryOption { + return func(opts *DirectoryOptions) { + opts.AllowNoWeightFiles = true + } +} + // FromDirectory creates a Builder from a directory containing model files. // It recursively scans the directory and adds each non-hidden file as a separate layer. // Each layer's filepath annotation preserves the relative path from the directory root. 
@@ -195,7 +208,7 @@ func FromDirectory(dirPath string, opts ...DirectoryOption) (*Builder, error) { return nil, fmt.Errorf("no files found in directory: %s", dirPath) } - if len(weightFiles) == 0 { + if len(weightFiles) == 0 && !options.AllowNoWeightFiles { return nil, fmt.Errorf("no weight files (safetensors, GGUF, or DDUF) found in directory: %s", dirPath) } diff --git a/pkg/distribution/builder/from_directory_test.go b/pkg/distribution/builder/from_directory_test.go index 1fc1145aa..7dca73a99 100644 --- a/pkg/distribution/builder/from_directory_test.go +++ b/pkg/distribution/builder/from_directory_test.go @@ -3,6 +3,7 @@ package builder import ( "os" "path/filepath" + "strings" "testing" "time" ) @@ -170,6 +171,44 @@ func TestFromDirectoryWithExclusions(t *testing.T) { } } +func TestFromDirectoryNoStandardWeights(t *testing.T) { + tmpDir := t.TempDir() + createTestFile(t, tmpDir, "openvino/model.xml", "") + createTestFile(t, tmpDir, "openvino/model.bin", "weights") + createTestFile(t, tmpDir, "openvino/config.json", "{}") + + _, err := FromDirectory(tmpDir) + if err == nil { + t.Fatal("expected error when directory has no GGUF/SafeTensors/DDUF weights") + } + + if got := err.Error(); got == "" || !strings.Contains(got, "no weight files") { + t.Fatalf("expected no weight files error, got %q", got) + } +} + +func TestFromDirectoryAllowNoWeightFiles(t *testing.T) { + tmpDir := t.TempDir() + createTestFile(t, tmpDir, "openvino/model.xml", "") + createTestFile(t, tmpDir, "openvino/model.bin", "weights") + createTestFile(t, tmpDir, "openvino/config.json", "{}") + + b, err := FromDirectory(tmpDir, WithAllowNoWeightFiles()) + if err != nil { + t.Fatalf("FromDirectory with WithAllowNoWeightFiles failed: %v", err) + } + + mdl := b.Model() + layers, err := mdl.Layers() + if err != nil { + t.Fatalf("Failed to get layers: %v", err) + } + + if len(layers) != 3 { + t.Errorf("Expected 3 layers, got %d", len(layers)) + } +} + func TestShouldExclude(t *testing.T) { tests := 
[]struct { name string diff --git a/pkg/distribution/huggingface/model.go b/pkg/distribution/huggingface/model.go index c714bfd67..75562f8f0 100644 --- a/pkg/distribution/huggingface/model.go +++ b/pkg/distribution/huggingface/model.go @@ -31,8 +31,9 @@ func BuildModel(ctx context.Context, client *Client, repo, revision, tag string, // Filter to model files (weights + configs) weightFiles, configFiles := FilterModelFiles(files) + isOpenVINORepo := IsOpenVINOModel(files) - if len(weightFiles) == 0 { + if len(weightFiles) == 0 && !isOpenVINORepo { return nil, fmt.Errorf("no model weight files (GGUF or SafeTensors) found in repository %s", repo) } @@ -54,10 +55,20 @@ func BuildModel(ctx context.Context, client *Client, repo, revision, tag string, } } - // Combine all files to download - allFiles := append(weightFiles, configFiles...) - if mmprojFile != nil { - allFiles = append(allFiles, *mmprojFile) + // Combine all files to download. + // For OpenVINO repositories, pull all repository files so the full IR layout is preserved. + var allFiles []RepoFile + if isOpenVINORepo { + for _, f := range files { + if f.Type == "file" { + allFiles = append(allFiles, f) + } + } + } else { + allFiles = append(weightFiles, configFiles...) 
+ if mmprojFile != nil { + allFiles = append(allFiles, *mmprojFile) + } } if progressWriter != nil { @@ -90,7 +101,7 @@ func BuildModel(ctx context.Context, client *Client, repo, revision, tag string, _ = progress.WriteProgress(progressWriter, "Building model artifact...", 0, 0, 0, "", "pull") } - model, err := buildModelFromFiles(result.LocalPaths, weightFiles, configFiles, tempDir, createdTime) + model, err := buildModelFromFiles(result.LocalPaths, weightFiles, configFiles, tempDir, createdTime, isOpenVINORepo) if err != nil { return nil, fmt.Errorf("build model: %w", err) } @@ -103,26 +114,29 @@ func BuildModel(ctx context.Context, client *Client, repo, revision, tag string, // which preserves directory structure and adds each file as an individual layer with // filepath annotations. For GGUF models, it uses the V0.1 packaging (FromPaths) // for backward compatibility. -func buildModelFromFiles(localPaths map[string]string, weightFiles, configFiles []RepoFile, tempDir string, createdTime *time.Time) (types.ModelArtifact, error) { - // Check if this is a safetensors model - use V0.2 packaging - if isSafetensorsModel(weightFiles) { - return buildSafetensorsModelV02(tempDir, createdTime) +func buildModelFromFiles(localPaths map[string]string, weightFiles, configFiles []RepoFile, tempDir string, createdTime *time.Time, allowNoStandardWeights bool) (types.ModelArtifact, error) { + // Safetensors and OpenVINO repos are packaged with V0.2 layer-per-file packaging. + if isSafetensorsModel(weightFiles) || allowNoStandardWeights { + return buildDirectoryModelV02(tempDir, createdTime, allowNoStandardWeights) } // For GGUF models, use V0.1 packaging (backward compatible) return buildGGUFModelV01(localPaths, weightFiles, configFiles, createdTime) } -// buildSafetensorsModelV02 builds a safetensors model using V0.2 layer-per-file packaging. +// buildDirectoryModelV02 builds a model using V0.2 layer-per-file packaging. 
// It uses builder.FromDirectory which recursively scans the tempDir and creates one layer // per file, preserving nested directory structure with filepath annotations. // If createdTime is non-nil, it is used as the creation timestamp for the OCI config // to produce deterministic digests. Otherwise time.Now() is used. -func buildSafetensorsModelV02(tempDir string, createdTime *time.Time) (types.ModelArtifact, error) { +func buildDirectoryModelV02(tempDir string, createdTime *time.Time, allowNoStandardWeights bool) (types.ModelArtifact, error) { var dirOpts []builder.DirectoryOption if createdTime != nil { dirOpts = append(dirOpts, builder.WithCreatedTime(*createdTime)) } + if allowNoStandardWeights { + dirOpts = append(dirOpts, builder.WithAllowNoWeightFiles()) + } b, err := builder.FromDirectory(tempDir, dirOpts...) if err != nil { diff --git a/pkg/distribution/huggingface/repository.go b/pkg/distribution/huggingface/repository.go index 1c7bb3f86..cd5bbd85b 100644 --- a/pkg/distribution/huggingface/repository.go +++ b/pkg/distribution/huggingface/repository.go @@ -62,6 +62,40 @@ func FilterModelFiles(repoFiles []RepoFile) (weights []RepoFile, configs []RepoF return weights, configs } +// IsOpenVINOModel returns true when the repository contains at least one OpenVINO +// IR weight pair (.xml + .bin with the same stem), including nested paths. 
+func IsOpenVINOModel(repoFiles []RepoFile) bool { + xmlFiles := make(map[string]struct{}) + binFiles := make(map[string]struct{}) + + for _, f := range repoFiles { + if f.Type != "file" { + continue + } + + ext := strings.ToLower(path.Ext(f.Path)) + if ext != ".xml" && ext != ".bin" { + continue + } + + stem := strings.TrimSuffix(f.Path, path.Ext(f.Path)) + switch ext { + case ".xml": + xmlFiles[stem] = struct{}{} + case ".bin": + binFiles[stem] = struct{}{} + } + } + + for stem := range xmlFiles { + if _, ok := binFiles[stem]; ok { + return true + } + } + + return false +} + // TotalSize calculates the total size of files func TotalSize(repoFiles []RepoFile) int64 { var total int64 diff --git a/pkg/distribution/huggingface/repository_test.go b/pkg/distribution/huggingface/repository_test.go index 16e3a159b..4cbe350ec 100644 --- a/pkg/distribution/huggingface/repository_test.go +++ b/pkg/distribution/huggingface/repository_test.go @@ -102,3 +102,53 @@ func TestIsSafetensorsModel(t *testing.T) { }) } } + +func TestIsOpenVINOModel(t *testing.T) { + tests := []struct { + name string + files []RepoFile + want bool + }{ + { + name: "matching xml/bin pair at root", + files: []RepoFile{ + {Type: "file", Path: "openvino_model.xml"}, + {Type: "file", Path: "openvino_model.bin"}, + }, + want: true, + }, + { + name: "matching xml/bin pair in subdirectory", + files: []RepoFile{ + {Type: "file", Path: "int4/openvino_model.xml"}, + {Type: "file", Path: "int4/openvino_model.bin"}, + {Type: "file", Path: "int4/config.json"}, + }, + want: true, + }, + { + name: "xml without matching bin", + files: []RepoFile{ + {Type: "file", Path: "openvino_model.xml"}, + {Type: "file", Path: "other_model.bin"}, + }, + want: false, + }, + { + name: "no openvino files", + files: []RepoFile{ + {Type: "file", Path: "model.safetensors"}, + {Type: "file", Path: "config.json"}, + }, + want: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := 
IsOpenVINOModel(tt.files); got != tt.want { + t.Errorf("IsOpenVINOModel() = %v, want %v", got, tt.want) + } + }) + } +} diff --git a/pkg/envconfig/envconfig.go b/pkg/envconfig/envconfig.go index 3592b9139..9b7da5e59 100644 --- a/pkg/envconfig/envconfig.go +++ b/pkg/envconfig/envconfig.go @@ -159,6 +159,12 @@ func VLLMMetalServerPath() string { return Var("VLLM_METAL_SERVER_PATH") } +// OVMSServerPath returns the optional path to the OVMS server binary. +// Configured via OVMS_SERVER_PATH. +func OVMSServerPath() string { + return Var("OVMS_SERVER_PATH") +} + // DisableMetrics is true when DISABLE_METRICS is set to a truthy value (e.g. "1"). var DisableMetrics = Bool("DISABLE_METRICS") diff --git a/pkg/inference/backends/ovms/ovms.go b/pkg/inference/backends/ovms/ovms.go new file mode 100644 index 000000000..021b1f00f --- /dev/null +++ b/pkg/inference/backends/ovms/ovms.go @@ -0,0 +1,171 @@ +package ovms + +import ( + "context" + "errors" + "fmt" + "log/slog" + "net" + "net/http" + "os" + "os/exec" + "path/filepath" + "strings" + "time" + + "github.com/docker/model-runner/pkg/inference" + "github.com/docker/model-runner/pkg/inference/backends" + "github.com/docker/model-runner/pkg/inference/models" + "github.com/docker/model-runner/pkg/logging" +) + +const ( + // Name is the backend name. 
+ Name = "ovms" + + defaultBinaryPath = "./ovms/bin/ovms" +) + +var ErrOVMSNotFound = errors.New("ovms binary not found") + +type ovms struct { + log logging.Logger + modelManager *models.Manager + serverLog logging.Logger + status string + customBinaryPath string +} + +func New(log logging.Logger, modelManager *models.Manager, serverLog logging.Logger, customBinaryPath string) (inference.Backend, error) { + return &ovms{ + log: log, + modelManager: modelManager, + serverLog: serverLog, + status: inference.FormatNotInstalled(""), + customBinaryPath: customBinaryPath, + }, nil +} + +func (o *ovms) Name() string { + return Name +} + +func (o *ovms) UsesExternalModelManagement() bool { + return false +} + +func (o *ovms) UsesTCP() bool { + return true +} + +func (o *ovms) HealthPath() string { + return "/v2/health/ready" +} + +func (o *ovms) RewritePath(path string) string { + if len(path) > 3 && path[:4] == "/v1/" { + return "/v3/" + path[4:] + } + return path +} + +func (o *ovms) Install(ctx context.Context, _ *http.Client) error { + binary := o.binaryPath() + if _, err := os.Stat(binary); err != nil { + o.status = inference.FormatNotInstalled("") + return ErrOVMSNotFound + } + + checkCtx, cancel := context.WithTimeout(ctx, 5*time.Second) + defer cancel() + + output, err := exec.CommandContext(checkCtx, binary, "--version").Output() + if err != nil { + o.log.Warn("could not get OVMS version", "error", err) + o.status = inference.FormatRunning(inference.DetailVersionUnknown) + return nil + } + + versionLine := strings.TrimSpace(string(output)) + if versionLine == "" { + o.status = inference.FormatRunning(inference.DetailVersionUnknown) + return nil + } + + o.status = inference.FormatRunning(versionLine) + return nil +} + +func (o *ovms) Run(ctx context.Context, socket, model string, modelRef string, _ inference.BackendMode, _ *inference.BackendConfiguration) error { + bundle, err := o.modelManager.GetBundle(model) + if err != nil { + return fmt.Errorf("failed to get 
model: %w", err) + } + modelPath := resolveOVMSModelPath(bundle.RootDir()) + + _, port, err := net.SplitHostPort(socket) + if err != nil { + return fmt.Errorf("invalid backend socket address %q: %w", socket, err) + } + + // Use the human-readable model reference for --model_name so that + // incoming requests (which carry the original name) match. + modelName := modelRef + if modelName == "" { + modelName = model + } + logLevel := ovmsLogLevel(o.log) + + args := []string{ + "--rest_port", port, + "--port", "0", + "--model_name", modelName, + "--model_path", modelPath, + "--task", "text_generation", + "--log_level", logLevel, + } + + return backends.RunBackend(ctx, backends.RunnerConfig{ + BackendName: "OVMS", + Socket: socket, + BinaryPath: o.binaryPath(), + SandboxPath: filepath.Dir(o.binaryPath()), + SandboxConfig: "", + Args: args, + Logger: o.log, + ServerLogWriter: logging.NewWriter(o.serverLog), + }) +} + +func (o *ovms) Status() string { + return o.status +} + +func (o *ovms) GetDiskUsage() (int64, error) { + return 0, nil +} + +func (o *ovms) binaryPath() string { + if o.customBinaryPath != "" { + return o.customBinaryPath + } + return defaultBinaryPath +} + +// resolveOVMSModelPath returns the path OVMS should receive via --model_path. +// Runtime bundles store model files under a dedicated "model" subdirectory. +// Fallback to the bundle root for backward compatibility if it does not exist. 
+func resolveOVMSModelPath(bundleRoot string) string { + modelDir := filepath.Join(bundleRoot, "model") + if info, err := os.Stat(modelDir); err == nil && info.IsDir() { + return modelDir + } + return bundleRoot +} + +func ovmsLogLevel(logger logging.Logger) string { + if logger.Enabled(context.Background(), slog.LevelDebug) { + return "DEBUG" + } + return "INFO" +} diff --git a/pkg/inference/backends/ovms/ovms_test.go b/pkg/inference/backends/ovms/ovms_test.go new file mode 100644 index 000000000..77739e817 --- /dev/null +++ b/pkg/inference/backends/ovms/ovms_test.go @@ -0,0 +1,49 @@ +package ovms + +import ( + "log/slog" + "os" + "path/filepath" + "testing" + + "github.com/docker/model-runner/pkg/logging" +) + +func TestResolveOVMSModelPath(t *testing.T) { + t.Run("uses model subdirectory when present", func(t *testing.T) { + bundleRoot := t.TempDir() + modelDir := filepath.Join(bundleRoot, "model") + if err := os.MkdirAll(modelDir, 0755); err != nil { + t.Fatalf("mkdir model dir: %v", err) + } + + got := resolveOVMSModelPath(bundleRoot) + if got != modelDir { + t.Fatalf("resolveOVMSModelPath() = %q, want %q", got, modelDir) + } + }) + + t.Run("falls back to bundle root when model subdirectory is missing", func(t *testing.T) { + bundleRoot := t.TempDir() + got := resolveOVMSModelPath(bundleRoot) + if got != bundleRoot { + t.Fatalf("resolveOVMSModelPath() = %q, want %q", got, bundleRoot) + } + }) +} + +func TestOVMSLogLevel(t *testing.T) { + t.Run("debug logger uses DEBUG", func(t *testing.T) { + logger := logging.NewLogger(slog.LevelDebug) + if got := ovmsLogLevel(logger); got != "DEBUG" { + t.Fatalf("ovmsLogLevel() = %q, want %q", got, "DEBUG") + } + }) + + t.Run("non-debug logger uses INFO", func(t *testing.T) { + logger := logging.NewLogger(slog.LevelInfo) + if got := ovmsLogLevel(logger); got != "INFO" { + t.Fatalf("ovmsLogLevel() = %q, want %q", got, "INFO") + } + }) +} diff --git a/pkg/inference/backends/runner.go b/pkg/inference/backends/runner.go index 
9cab22c56..6c32ea643 100644 --- a/pkg/inference/backends/runner.go +++ b/pkg/inference/backends/runner.go @@ -23,7 +23,7 @@ type ErrorTransformer func(output string) string // RunnerConfig holds configuration for a backend runner type RunnerConfig struct { - // BackendName is the display name of the backend (e.g., "llama.cpp", "vLLM") + // BackendName is the display name of the backend (e.g., "llama.cpp", "vLLM", "ovms") BackendName string // Socket is the unix socket path Socket string diff --git a/pkg/inference/scheduling/http_handler.go b/pkg/inference/scheduling/http_handler.go index a9f3077b9..bd36bad34 100644 --- a/pkg/inference/scheduling/http_handler.go +++ b/pkg/inference/scheduling/http_handler.go @@ -273,6 +273,11 @@ func (h *HTTPHandler) handleOpenAIInference(w http.ResponseWriter, r *http.Reque // Create a request with the body replaced for forwarding upstream. upstreamRequest := r.Clone(r.Context()) upstreamRequest.Body = io.NopCloser(bytes.NewReader(body)) + // OpenAI-compatible inference endpoints always expect JSON payloads. + // Some clients (for example curl without explicit headers) default to + // application/x-www-form-urlencoded for -d bodies, which breaks OVMS + // routing and causes path-based model resolution. Normalize to JSON. + upstreamRequest.Header.Set("Content-Type", "application/json") // Perform the request. runner.ServeHTTP(w, upstreamRequest) diff --git a/pkg/inference/scheduling/runner.go b/pkg/inference/scheduling/runner.go index ed5ebff9c..64aa9ef82 100644 --- a/pkg/inference/scheduling/runner.go +++ b/pkg/inference/scheduling/runner.go @@ -131,6 +131,11 @@ func run( // Remove the prefix up to the OpenAI API root. pr.Out.URL.Path = trimRequestPathToOpenAIRoot(pr.Out.URL.Path) pr.Out.URL.RawPath = trimRequestPathToOpenAIRoot(pr.Out.URL.RawPath) + // Allow backends to rewrite the proxied path. 
+ if rp, ok := backend.(interface{ RewritePath(string) string }); ok { + pr.Out.URL.Path = rp.RewritePath(pr.Out.URL.Path) + pr.Out.URL.RawPath = rp.RewritePath(pr.Out.URL.RawPath) + } }, } proxy.ModifyResponse = func(resp *http.Response) error { @@ -210,6 +215,12 @@ func run( // wait waits for the runner to be ready. func (r *runner) wait(ctx context.Context) error { + // Determine the health endpoint for this backend. + healthPath := "/health" + if hp, ok := r.backend.(interface{ HealthPath() string }); ok { + healthPath = hp.HealthPath() + } + // Loop and poll for readiness. for p := 0; p < maximumReadinessPings; p++ { select { @@ -222,7 +233,7 @@ func (r *runner) wait(ctx context.Context) error { } // Create and execute a request targeting the health endpoint. // Note: /health returns 503 during model loading, 200 when ready. - readyRequest, err := http.NewRequestWithContext(ctx, http.MethodGet, "http://localhost/health", http.NoBody) + readyRequest, err := http.NewRequestWithContext(ctx, http.MethodGet, "http://localhost"+healthPath, http.NoBody) if err != nil { return fmt.Errorf("readiness request creation failed: %w", err) } From d01862afaca7ed3d469fcf4d4a6bdcb18f83ef62 Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Tue, 26 May 2026 01:24:23 +0200 Subject: [PATCH 2/4] update --- README.md | 47 ++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 42 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 92fe817cd..12cddc36d 100644 --- a/README.md +++ b/README.md @@ -332,8 +332,13 @@ When the runner starts, it will try to initialize OVMS as an available backend. OVMS_SERVER_PATH=/absolute/path/to/ovms ./model-runner ``` -You can target OVMS explicitly through the backend-prefixed OpenAI-compatible routes: +Create a new model +Use a model from a Hugging Face Hub repository that is published in OpenVINO format.
+```sh +curl http://localhost:13434/models/create -X POST -d '{"from": "hf.co/OpenVINO/Qwen3-0.6B-int4-ov"}' +``` +You can target OVMS explicitly through the backend-prefixed OpenAI-compatible routes: ```sh # List models exposed via OVMS backend routing curl http://localhost:13434/engines/ovms/v1/models @@ -346,6 +351,10 @@ curl http://localhost:13434/engines/ovms/v1/chat/completions -X POST -d '{ ] }' ``` +Delete model +```sh +curl http://localhost:13434/models/hf.co/OpenVINO/Qwen3-0.6B-int4-ov -X DELETE +``` ## API Examples @@ -357,17 +366,17 @@ When running with `docker-run`, you can use regular HTTP requests: ```sh # List all available models -curl http://localhost:13434/models +curl http://localhost:8080/models # Create a new model -curl http://localhost:13434/models/create -X POST -d '{"from": "hf.co/OpenVINO/Qwen3-0.6B-int4-ov"}' +curl http://localhost:8080/models/create -X POST -d '{"from": "ai/smollm2"}' # Get information about a specific model curl http://localhost:13434/models/hf.co/OpenVINO/Qwen3-0.6B-int4-ov # Chat with a model -curl http://localhost:13434/engines/ovms/v1/chat/completions -X POST -d '{ - "model": "hf.co/OpenVINO/Qwen3-0.6B-int4-ov", +curl http://localhost:8080/engines/llama.cpp/v1/chat/completions -X POST -d '{ + "model": "ai/smollm2", "messages": [ {"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": "Hello, how are you?"} @@ -377,6 +386,34 @@ curl http://localhost:13434/engines/ovms/v1/chat/completions -X POST -d '{ # Delete a model curl http://localhost:13434/models/hf.co/OpenVINO/Qwen3-0.6B-int4-ov -X DELETE +# Get metrics +curl http://localhost:8080/metrics +``` +The response will contain the model's reply: + +```json +{ + "id": "chat-12345", + "object": "chat.completion", + "created": 1682456789, + "model": "ai/smollm2", + "choices": [ + { + "index": 0, + "message": { + "role": "assistant", + "content": "I'm doing well, thank you for asking! How can I assist you today?" 
+ }, + "finish_reason": "stop" + } + ], + "usage": { + "prompt_tokens": 24, + "completion_tokens": 16, + "total_tokens": 40 + } +} +``` ### Features From 84ebb224545566fdec1c8896f4ff153e4afd33c5 Mon Sep 17 00:00:00 2001 From: "Trawinski, Dariusz" Date: Tue, 26 May 2026 01:28:55 +0200 Subject: [PATCH 3/4] Update pkg/inference/backends/ovms/ovms.go Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> --- pkg/inference/backends/ovms/ovms.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/inference/backends/ovms/ovms.go b/pkg/inference/backends/ovms/ovms.go index 33664cca1..b35090050 100644 --- a/pkg/inference/backends/ovms/ovms.go +++ b/pkg/inference/backends/ovms/ovms.go @@ -71,7 +71,7 @@ func (o *ovms) RewritePath(path string) string { func (o *ovms) Install(ctx context.Context, _ *http.Client) error { binary := o.binaryPath() - if _, err := os.Stat(binary); err != nil { + if _, err := exec.LookPath(binary); err != nil { o.status = inference.FormatNotInstalled("") return ErrOVMSNotFound } From 941c31e99c617103c7005fa45f696666ab81337b Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Tue, 26 May 2026 01:39:49 +0200 Subject: [PATCH 4/4] update readme --- README.md | 8 +++++--- pkg/inference/backends/ovms/ovms.go | 13 +++++++++--- pkg/inference/backends/ovms/ovms_test.go | 25 ++++++++++++++++++++++++ 3 files changed, 40 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 12cddc36d..27f7a1cf3 100644 --- a/README.md +++ b/README.md @@ -323,13 +323,15 @@ The vLLM wheels are sourced from the official vLLM GitHub Releases at `https://g Docker Model Runner can also run an OVMS backend. -- Default OVMS binary path: `./ovms/bin/ovms` +- Default OVMS binary: resolved from `PATH` (looks up `ovms`) - Override binary path with: `OVMS_SERVER_PATH` -When the runner starts, it will try to initialize OVMS as an available backend. 
If you are running from source and want to use a custom OVMS binary, set: +OVMS can be installed by following this [guide](https://docs.openvino.ai/2026/model-server/ovms_docs_deploying_server_baremetal.html). The minimum supported version is 2026.2. + +When the runner starts, it will try to initialize OVMS as an available backend. ```sh -OVMS_SERVER_PATH=/absolute/path/to/ovms ./model-runner +MODEL_RUNNER_PORT=13434 ./model-runner ``` Create a new model diff --git a/pkg/inference/backends/ovms/ovms.go b/pkg/inference/backends/ovms/ovms.go index b35090050..7a6f133ae 100644 --- a/pkg/inference/backends/ovms/ovms.go +++ b/pkg/inference/backends/ovms/ovms.go @@ -22,8 +22,6 @@ import ( const ( // Name is the backend name. Name = "ovms" - - defaultBinaryPath = "./ovms/bin/ovms" ) var ErrOVMSNotFound = errors.New("ovms binary not found") @@ -71,6 +69,11 @@ func (o *ovms) RewritePath(path string) string { func (o *ovms) Install(ctx context.Context, _ *http.Client) error { binary := o.binaryPath() + if o.customBinaryPath != "" { + o.log.Info("OVMS binary configured via OVMS_SERVER_PATH", "path", binary) + } else if resolved, err := exec.LookPath(Name); err == nil { + o.log.Info("OVMS binary resolved from PATH", "path", resolved) + } if _, err := exec.LookPath(binary); err != nil { o.status = inference.FormatNotInstalled("") return ErrOVMSNotFound @@ -154,7 +157,11 @@ func (o *ovms) binaryPath() string { if o.customBinaryPath != "" { return o.customBinaryPath } - return defaultBinaryPath + if path, err := exec.LookPath(Name); err == nil { + return path + } + // Keep command name as a last resort so error reporting remains clear. + return Name } // resolveOVMSModelPath returns the path OVMS should receive via --model_path.
diff --git a/pkg/inference/backends/ovms/ovms_test.go b/pkg/inference/backends/ovms/ovms_test.go index 77739e817..3a67e4953 100644 --- a/pkg/inference/backends/ovms/ovms_test.go +++ b/pkg/inference/backends/ovms/ovms_test.go @@ -9,6 +9,31 @@ import ( "github.com/docker/model-runner/pkg/logging" ) +func TestBinaryPath(t *testing.T) { + t.Run("uses custom binary path when provided", func(t *testing.T) { + o := &ovms{customBinaryPath: "/tmp/custom-ovms"} + if got := o.binaryPath(); got != "/tmp/custom-ovms" { + t.Fatalf("binaryPath() = %q, want %q", got, "/tmp/custom-ovms") + } + }) + + t.Run("uses ovms from PATH when custom path is empty", func(t *testing.T) { + binDir := t.TempDir() + binary := filepath.Join(binDir, Name) + if err := os.WriteFile(binary, []byte("#!/bin/sh\nexit 0\n"), 0o755); err != nil { + t.Fatalf("write fake ovms binary: %v", err) + } + + originalPath := os.Getenv("PATH") + t.Setenv("PATH", binDir+string(os.PathListSeparator)+originalPath) + + o := &ovms{} + if got := o.binaryPath(); got != binary { + t.Fatalf("binaryPath() = %q, want %q", got, binary) + } + }) +} + func TestResolveOVMSModelPath(t *testing.T) { t.Run("uses model subdirectory when present", func(t *testing.T) { bundleRoot := t.TempDir()