InfiniTensor · whjthu · Jun 18, 2026
diff --git a/csrc/models/backend_plugin_loader.cpp b/csrc/models/backend_plugin_loader.cpp
@@ -0,0 +1,100 @@
+#include "backend_plugin_loader.hpp"
+
+#include <cstdlib>
+#include <dlfcn.h>
+#include <mutex>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <unordered_map>
+
+namespace infinilm::models {
+namespace {
+
+// Signature of the optional plugin entry point: the address returned by
+// dlsym for "infinilm_backend_plugin_init" is cast to this type before it
+// is called (no arguments, no return value).
+using PluginInitFn = void (*)();
+
+// Accessor for the single mutex that serialises access to the table of
+// loaded plugin handles. The mutex is a function-local static, so every
+// call returns a reference to the same object.
+std::mutex &loader_mutex() {
+    static std::mutex plugin_table_lock;
+    return plugin_table_lock;
+}
+
+// Accessor for the table mapping a (trimmed) plugin path to the handle that
+// dlopen returned for it. The table is a function-local static shared by
+// all callers; callers in this file take loader_mutex() before touching it.
+std::unordered_map<std::string, void *> &loaded_handles() {
+    using HandleTable = std::unordered_map<std::string, void *>;
+    static HandleTable table;
+    return table;
+}
+
+// Return `value` without leading/trailing spaces, tabs, newlines and
+// carriage returns. A string made up only of those characters (or an empty
+// string) yields an empty result. Interior whitespace is left untouched.
+std::string trim(std::string value) {
+    const char *const whitespace = " \t\n\r";
+    const std::string::size_type first = value.find_first_not_of(whitespace);
+    if (first == std::string::npos) {
+        return {};
+    }
+    // Cut the tail first so `first` still indexes the same character.
+    value.erase(value.find_last_not_of(whitespace) + 1);
+    value.erase(0, first);
+    return value;
+}
+
+// Split a comma-separated list of plugin paths.
+//
+// `env_value` may be null or empty, in which case the result is empty.
+// Each item is trimmed; items that are empty after trimming are dropped.
+std::vector<std::string> split_plugins(const char *env_value) {
+    std::vector<std::string> result;
+    if (env_value != nullptr && *env_value != '\0') {
+        std::stringstream input(env_value);
+        for (std::string raw; std::getline(input, raw, ',');) {
+            const std::string cleaned = trim(raw);
+            if (cleaned.empty()) {
+                continue;
+            }
+            result.push_back(cleaned);
+        }
+    }
+    return result;
+}
+
+} // namespace
+
+// Load the shared object at `plugin_path` and run its optional init hook.
+//
+// The path is trimmed first; an empty path is ignored. A path that is
+// already present in the handle table is not loaded again. On success the
+// dlopen handle is stored in the table under the trimmed path.
+//
+// Throws std::runtime_error when dlopen fails. A missing
+// `infinilm_backend_plugin_init` symbol is not an error.
+//
+// NOTE(review): loader_mutex() is a plain std::mutex and stays locked while
+// dlopen and the init hook run, so plugin code that calls back into this
+// loader on the same thread would try to re-lock it — confirm plugins never
+// do that.
+void load_backend_plugin(const std::string &plugin_path) {
+    const std::string so_path = trim(plugin_path);
+    if (so_path.empty()) {
+        return;
+    }
+
+    std::lock_guard<std::mutex> guard(loader_mutex());
+    auto &registry = loaded_handles();
+    if (registry.count(so_path) != 0) {
+        return;
+    }
+
+    void *const library = dlopen(so_path.c_str(), RTLD_NOW | RTLD_GLOBAL);
+    if (library == nullptr) {
+        const char *reason = dlerror();
+        std::string message = "infinilm::models::load_backend_plugin: failed to load " + so_path + ": ";
+        message += (reason == nullptr) ? "unknown dlopen error" : reason;
+        throw std::runtime_error(message);
+    }
+
+    // Clear any stale error so the dlerror() after dlsym reflects only the lookup.
+    dlerror();
+    void *const symbol = dlsym(library, "infinilm_backend_plugin_init");
+    const bool lookup_ok = dlerror() == nullptr;
+    if (lookup_ok && symbol != nullptr) {
+        reinterpret_cast<PluginInitFn>(symbol)();
+    }
+
+    registry[so_path] = library;
+}
+
+// Read INFINILM_BACKEND_PLUGINS, split it on commas and load every entry in
+// the order listed. An unset or empty variable loads nothing. Errors from
+// load_backend_plugin propagate and stop the remaining entries from loading.
+void load_backend_plugins_from_env() {
+    const char *raw_list = std::getenv("INFINILM_BACKEND_PLUGINS");
+    const std::vector<std::string> requested = split_plugins(raw_list);
+    for (const std::string &entry : requested) {
+        load_backend_plugin(entry);
+    }
+}
+
+// Snapshot the keys of the handle table under loader_mutex().
+// The table is an unordered_map, so the order of the returned paths is
+// unspecified and need not match load order.
+std::vector<std::string> loaded_backend_plugins() {
+    std::lock_guard<std::mutex> guard(loader_mutex());
+    const auto &registry = loaded_handles();
+    std::vector<std::string> paths;
+    paths.reserve(registry.size());
+    for (auto it = registry.begin(); it != registry.end(); ++it) {
+        paths.push_back(it->first);
+    }
+    return paths;
+}
+
+} // namespace infinilm::models
diff --git a/csrc/models/backend_plugin_loader.hpp b/csrc/models/backend_plugin_loader.hpp
@@ -0,0 +1,30 @@
+#pragma once
+
+#include <string>
+#include <vector>
+
+namespace infinilm::models {
+
+/**
+ * Load one out-of-tree backend plugin shared object.
+ *
+ * The plugin may either rely on static initializers that call
+ * `register_causal_lm_model` / `register_model_config`, or export an optional
+ * `extern "C" void infinilm_backend_plugin_init()` function. Loading is
+ * idempotent for each path.
+ *
+ * Leading and trailing whitespace in `plugin_path` is stripped before use and
+ * an empty path is ignored. Idempotence is keyed on the stripped path string,
+ * so two different spellings of the same file count as distinct entries.
+ * Calls are serialised by an internal mutex.
+ *
+ * @param plugin_path Path of the shared object, passed to `dlopen`.
+ * @throws std::runtime_error if `dlopen` fails for the path.
+ */
+void load_backend_plugin(const std::string &plugin_path);
+
+/**
+ * Load backend plugins from `INFINILM_BACKEND_PLUGINS`.
+ *
+ * The environment variable accepts comma-separated shared object paths.
+ * Whitespace around each entry is stripped and empty entries are skipped;
+ * an unset or empty variable loads nothing. Each remaining entry is passed
+ * to `load_backend_plugin`, whose exceptions propagate to the caller.
+ */
+void load_backend_plugins_from_env();
+
+/**
+ * Return plugin paths that have already been loaded.
+ *
+ * The paths are the whitespace-stripped strings under which each plugin was
+ * recorded. The order of the returned vector is unspecified.
+ */
+std::vector<std::string> loaded_backend_plugins();
+
+} // namespace infinilm::models
diff --git a/csrc/models/infinilm_model.cpp b/csrc/models/infinilm_model.cpp
@@ -18,6 +18,13 @@ void InfinilmModel::reset_cache(const cache::CacheConfig *cache_config) {
     kv_cache_vec = std::move(default_allocate_kv_cache_tensors(cache_config, model_config_, attention_backend));
 }
 
+// Pass every (name, tensor) entry of `params` to load_parameter().
+// Entries are visited in the map's iteration order, which is unspecified.
+// NOTE(review): "no_sync" presumably means no synchronisation is performed
+// after loading — confirm against the callers.
+void InfinilmModel::load_parameters_no_sync(
+    const std::unordered_map<std::string, infinicore::Tensor> &params) {
+    for (auto entry = params.begin(); entry != params.end(); ++entry) {
+        load_parameter(entry->first, entry->second);
+    }
+}
+
 std::vector<infinicore::Tensor> InfinilmModel::default_allocate_kv_cache_tensors(
     const cache::CacheConfig *cache_config,
     const std::shared_ptr<infinilm::config::ModelConfig> &text_config,

diff --git a/csrc/models/infinilm_model.hpp b/csrc/models/infinilm_model.hpp
@@ -7,6 +7,7 @@
 #include "infinicore/tensor.hpp"
 
 #include <optional>
+#include <unordered_map>
 #include <vector>
 
 namespace infinilm {
@@ -57,6 +58,8 @@ class InfinilmModel : public infinicore::nn::Module {
         return cache_config_.get();
     }
 
+    // Load each (name, tensor) pair in `params` via load_parameter().
+    void load_parameters_no_sync(
+        const std::unordered_map<std::string, infinicore::Tensor> &params);
     void process_weights_after_loading();
     void reset_runtime_state() const;
 

diff --git a/csrc/pybind11/bindings.cc b/csrc/pybind11/bindings.cc
@@ -2,6 +2,7 @@
 
 #include "cache/cache.hpp"
 #include "engine/engine.hpp"
+#include "../models/backend_plugin_loader.hpp"
 
 namespace py = pybind11;
 
@@ -12,4 +13,11 @@ PYBIND11_MODULE(_infinilm, m) {
     infinilm::engine::bind_hook_registry(m);
     infinilm::engine::distributed::bind_dist_config(m);
     infinilm::engine::bind_infer_engine(m);
+
+    m.def("load_backend_plugin", &infinilm::models::load_backend_plugin,
+          "Load one InfiniLM C++ backend plugin shared object.");
+    m.def("load_backend_plugins_from_env", &infinilm::models::load_backend_plugins_from_env,
+          "Load InfiniLM C++ backend plugins from INFINILM_BACKEND_PLUGINS.");
+    m.def("loaded_backend_plugins", &infinilm::models::loaded_backend_plugins,
+          "Return paths of loaded InfiniLM C++ backend plugins.");
 }
diff --git a/python/infinilm/__init__.py b/python/infinilm/__init__.py
@@ -1,17 +1,41 @@
-from .models import AutoLlamaModel
-from . import distributed
-from . import cache
-from . import llm
-from . import base_config
-
-from .llm import (
-    LLM,
-    AsyncLLMEngine,
-    SamplingParams,
-    RequestOutput,
-    TokenOutput,
+from importlib import import_module
+
+from .plugins import (
+    ModelSpec,
+    load_plugin,
+    load_plugins,
+    register_model,
+    registered_model_types,
 )
 
+
+# Names resolved on first attribute access instead of at import time.
+# Maps exported name -> (absolute module to import, attribute to fetch from it).
+_LAZY_ATTRS = {
+    "AutoLlamaModel": ("infinilm.models", "AutoLlamaModel"),
+    "LLM": ("infinilm.llm", "LLM"),
+    "AsyncLLMEngine": ("infinilm.llm", "AsyncLLMEngine"),
+    "SamplingParams": ("infinilm.llm", "SamplingParams"),
+    "RequestOutput": ("infinilm.llm", "RequestOutput"),
+    "TokenOutput": ("infinilm.llm", "TokenOutput"),
+}
+
+# Submodules of this package that are imported on first attribute access.
+_LAZY_MODULES = {"distributed", "cache", "llm", "base_config"}
+
+
+def __getattr__(name):
+    """Resolve lazily exported names on first access.
+
+    Names in ``_LAZY_MODULES`` are imported as submodules of this package;
+    names in ``_LAZY_ATTRS`` are fetched from the module listed for them.
+    The resolved object is stored in the module globals, so later lookups
+    find it directly and do not reach this hook.
+
+    Raises:
+        AttributeError: if ``name`` is in neither table.
+    """
+    if name in _LAZY_MODULES:
+        resolved = import_module(f".{name}", __name__)
+    elif _LAZY_ATTRS.get(name) is not None:
+        owner, attribute = _LAZY_ATTRS[name]
+        resolved = getattr(import_module(owner), attribute)
+    else:
+        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
+
+    globals()[name] = resolved
+    return resolved
+
 __all__ = [
     "AutoLlamaModel",
     "distributed",
@@ -24,4 +48,10 @@
     "SamplingParams",
     "RequestOutput",
     "TokenOutput",
+    # Out-of-tree model plugins
+    "ModelSpec",
+    "load_plugin",
+    "load_plugins",
+    "register_model",
+    "registered_model_types",
 ]
diff --git a/python/infinilm/backend_plugins.py b/python/infinilm/backend_plugins.py
@@ -0,0 +1,53 @@
+from __future__ import annotations
+
+import os
+from collections.abc import Sequence
+
+
+def _split_plugin_list(value: str | None) -> list[str]:
+    """Split a comma-separated plugin list into stripped, non-empty entries.
+
+    ``None`` and the empty string both give an empty list.
+    """
+    if not value:
+        return []
+    stripped = (piece.strip() for piece in value.split(","))
+    return [entry for entry in stripped if entry]
+
+
+def _backend_module():
+    """Return the `_infinilm` extension module from `infinilm.lib`.
+
+    The import happens inside the function, so importing this module does
+    not by itself import `infinilm.lib`.
+    """
+    from infinilm.lib import _infinilm
+
+    return _infinilm
+
+
+def load_backend_plugin(plugin: str | os.PathLike[str]) -> None:
+    """Load a single C++ backend plugin shared object.
+
+    ``plugin`` is converted with ``os.fspath`` and handed to the extension
+    module's ``load_backend_plugin``; errors raised there propagate.
+    """
+
+    native_loader = _backend_module().load_backend_plugin
+    native_loader(os.fspath(plugin))
+
+
+def load_backend_plugins(plugins: Sequence[str | os.PathLike[str]] | str | None = None) -> tuple[str, ...]:
+    """Load the given C++ backend plugins and report what is loaded.
+
+    ``plugins`` may be a single string or path-like (treated as a
+    comma-separated list), an iterable of paths, or ``None``/empty for
+    nothing. Returns the paths of every backend plugin loaded so far in
+    this process, not only those requested by this call.
+    """
+
+    if isinstance(plugins, (str, os.PathLike)):
+        requested = _split_plugin_list(os.fspath(plugins))
+    elif plugins:
+        requested = [os.fspath(entry) for entry in plugins]
+    else:
+        requested = []
+
+    for path in requested:
+        load_backend_plugin(path)
+    return loaded_backend_plugins()
+
+
+def load_backend_plugins_from_env() -> tuple[str, ...]:
+    """Load the plugins listed in `INFINILM_BACKEND_PLUGINS`.
+
+    The variable is read here, on the Python side, and passed to
+    `load_backend_plugins` as a comma-separated list. This helper must be
+    called explicitly; an unset variable loads nothing. Returns the paths
+    of all backend plugins loaded so far.
+    """
+
+    configured = os.environ.get("INFINILM_BACKEND_PLUGINS")
+    return load_backend_plugins(configured)
+
+
+def loaded_backend_plugins() -> tuple[str, ...]:
+    """Return, as a tuple, the paths of C++ backend plugins loaded so far."""
+
+    paths = _backend_module().loaded_backend_plugins()
+    return tuple(paths)
diff --git a/python/infinilm/infer_engine.py b/python/infinilm/infer_engine.py
@@ -3,9 +3,11 @@
 
 import infinicore
 
-from infinilm.cache import PagedKVCacheConfig
+from infinilm.cache import StaticKVCacheConfig, PagedKVCacheConfig
+from infinilm.backend_plugins import load_backend_plugins
 from infinilm.distributed import DistConfig
 from infinilm.lib import _infinilm
+from infinilm.plugins import adapt_config, load_plugins
 
 from .modeling_utils import parse_dtype
 from .exception_utils import handle_oom_and_exit
@@ -67,10 +69,14 @@ def __init__(
         enable_graph_compiling=False,
         attention_backend="default",
         kv_cache_dtype=None,
+        backend_plugins=None,
         use_mla=False,
     ):
-        self.hf_config = read_hf_config(model_path)
+        load_plugins()
+        self.hf_config = adapt_config(read_hf_config(model_path))
         self.hf_generation_config = read_hf_generation_config(model_path)
+        load_backend_plugins(self.hf_config.get("_infinilm_backend_plugins"))
+        load_backend_plugins(backend_plugins)
 
         if device is None:
             device = infinicore.device()

diff --git a/python/infinilm/lib/__init__.py b/python/infinilm/lib/__init__.py
@@ -6,6 +6,9 @@
 import os
 from pathlib import Path
 
+# Register shared pybind11 types used by the InfiniLM extension.
+import infinicore  # noqa: F401
+
 # Ensure the directory containing this __init__.py is on sys.path
 # This allows importing the .so file from the same directory
 _lib_dir = Path(__file__).parent