From 69f901f5f11e54c228ddedc85d38fd8b26836d1f Mon Sep 17 00:00:00 2001
From: ZhangYiqun018 <20732979+ZhangYiqun018@users.noreply.github.com>
Date: Thu, 7 May 2026 15:12:51 +0800
Subject: [PATCH 1/4] Add Qwen3.5 architecture support

---
 mergekit/architecture/__init__.py |   6 +
 mergekit/architecture/qwen35.py   | 359 ++++++++++++++++++++++++++++++
 tests/test_qwen35_architecture.py | 227 +++++++++++++++++++
 3 files changed, 592 insertions(+)
 create mode 100644 mergekit/architecture/qwen35.py
 create mode 100644 tests/test_qwen35_architecture.py

diff --git a/mergekit/architecture/__init__.py b/mergekit/architecture/__init__.py
index c731e58b..1b8de7cb 100644
--- a/mergekit/architecture/__init__.py
+++ b/mergekit/architecture/__init__.py
@@ -23,6 +23,10 @@
     MixtralModuleArchitecture,
     Qwen3MoeModuleArchitecture,
 )
+from mergekit.architecture.qwen35 import (
+    QWEN35_ARCHITECTURES,
+    qwen35_architecture_for_config,
+)
 from mergekit.options import MergeOptions
 
 if TYPE_CHECKING:
@@ -66,6 +70,8 @@ def arch_info_for_config(config: PretrainedConfig) -> Optional[ModelArchitecture
             architectures=[arch_name],
             model_type="glm4_moe",
         )
+    elif arch_name in QWEN35_ARCHITECTURES:
+        return qwen35_architecture_for_config(config)
     elif arch_name in NAME_TO_ARCH:
         candidates = list(NAME_TO_ARCH[arch_name])
         if len(candidates) == 1:
diff --git a/mergekit/architecture/qwen35.py b/mergekit/architecture/qwen35.py
new file mode 100644
index 00000000..a3952537
--- /dev/null
+++ b/mergekit/architecture/qwen35.py
@@ -0,0 +1,359 @@
+# Copyright (C) 2025 Arcee AI
+# SPDX-License-Identifier: LGPL-3.0-only
+
+from typing import List, Optional
+
+from pydantic import BaseModel
+from transformers import PretrainedConfig
+
+from mergekit.architecture.base import (
+    ModelArchitecture,
+    ModuleArchitecture,
+    ModuleDefinition,
+    WeightInfo,
+)
+from mergekit.common import get_config_value
+
+
+QWEN35_DENSE_ARCHITECTURES = {
+    "Qwen3_5ForConditionalGeneration",
+    "Qwen3_5ForCausalLM",
+}
+QWEN35_MOE_ARCHITECTURES = {  # membership here marks a checkpoint as MoE
+    "Qwen3_5MoeForConditionalGeneration",
+    "Qwen3_5MoeForCausalLM",
+}
+QWEN35_ARCHITECTURES = QWEN35_DENSE_ARCHITECTURES | QWEN35_MOE_ARCHITECTURES
+
+
+def _text_config(config: PretrainedConfig):  # nested ``text_config`` if present
+    return getattr(config, "text_config", config)  # otherwise ``config`` itself
+
+
+def _cfg(config: PretrainedConfig, key: str, default=None):  # lenient config lookup
+    try:
+        return get_config_value(config, key)  # key may be dotted ("a.b")
+    except Exception:  # any lookup failure is treated as "value not set"
+        return default
+
+
+def _is_full_attention(config: PretrainedConfig, index: int) -> bool:
+    layer_types = getattr(_text_config(config), "layer_types", None)
+    if layer_types and index < len(layer_types):  # explicit per-layer entry wins
+        return layer_types[index] == "full_attention"
+    # Fallback (no usable layer_types): Qwen3.5 default of 3 linear layers then 1 full layer.
+    return index % 4 == 3
+
+
+class Qwen35LanguageModuleArchitecture(ModuleArchitecture, BaseModel, frozen=True):
+    """Weight layout of the Qwen3.5 text decoder, for both dense and MoE checkpoints.
+
+    Official Qwen3.5 repos are image-text-to-text wrappers whose language weights live
+    under ``model.language_model``. Text-only exports use the usual ``model`` prefix.
+    """
+
+    root: str  # tensor-name prefix, e.g. "model.language_model"
+    is_moe: bool = False  # True -> router/expert names instead of a dense MLP
+    num_experts: Optional[int] = None  # how many per-expert name sets to declare
+
+    def pre_weights(self, config: PretrainedConfig) -> List[WeightInfo]:
+        return [WeightInfo(name=f"{self.root}.embed_tokens.weight", is_embed=True)]
+
+    def post_weights(self, config: PretrainedConfig) -> List[WeightInfo]:
+        return [
+            WeightInfo(name=f"{self.root}.norm.weight"),
+            WeightInfo(
+                name="lm_head.weight",
+                is_embed=True,
+                optional=True,
+                tied_names=(f"{self.root}.embed_tokens.weight",),
+            ),
+        ]
+
+    def num_layers_config_key(self) -> str:
+        return (
+            "text_config.num_hidden_layers"
+            if self.root == "model.language_model"
+            else "num_hidden_layers"
+        )
+
+    def layer_weights(
+        self, index: int, config: PretrainedConfig
+    ) -> Optional[List[WeightInfo]]:
+        prefix = f"{self.root}.layers.{index}"
+        res = [WeightInfo(name=f"{prefix}.input_layernorm.weight")]
+
+        if _is_full_attention(config, index):  # full attention -> self_attn.*
+            res.extend(
+                WeightInfo(name=f"{prefix}.self_attn.{name}")
+                for name in (
+                    "q_proj.weight",
+                    "k_proj.weight",
+                    "v_proj.weight",
+                    "o_proj.weight",
+                    "q_norm.weight",
+                    "k_norm.weight",
+                )
+            )
+            if getattr(_text_config(config), "attention_bias", False):
+                res.extend(
+                    WeightInfo(name=f"{prefix}.self_attn.{name}", optional=True)
+                    for name in (
+                        "q_proj.bias",
+                        "k_proj.bias",
+                        "v_proj.bias",
+                        "o_proj.bias",
+                    )
+                )
+        else:  # linear attention -> linear_attn.*
+            res.extend(
+                WeightInfo(name=f"{prefix}.linear_attn.{name}")
+                for name in (
+                    "dt_bias",
+                    "A_log",
+                    "conv1d.weight",
+                    "norm.weight",
+                    "out_proj.weight",
+                    "in_proj_qkv.weight",
+                    "in_proj_z.weight",
+                    "in_proj_b.weight",
+                    "in_proj_a.weight",
+                )
+            )
+
+        if self.is_moe:
+            res.append(WeightInfo(name=f"{prefix}.mlp.gate.weight"))
+            res.extend(  # packed expert tensors (optional)
+                WeightInfo(name=f"{prefix}.mlp.{name}", optional=True)
+                for name in (
+                    "experts.gate_up_proj",
+                    "experts.down_proj",
+                )
+            )
+            for expert_idx in range(self.num_experts or 0):  # per-expert tensors
+                for proj in ("gate_proj", "up_proj", "down_proj"):
+                    res.append(
+                        WeightInfo(
+                            name=f"{prefix}.mlp.experts.{expert_idx}.{proj}.weight",
+                            optional=True,
+                        )
+                    )
+            if getattr(_text_config(config), "shared_expert_intermediate_size", None):
+                res.extend(
+                    WeightInfo(name=f"{prefix}.mlp.{name}")
+                    for name in (
+                        "shared_expert.gate_proj.weight",
+                        "shared_expert.up_proj.weight",
+                        "shared_expert.down_proj.weight",
+                        "shared_expert_gate.weight",
+                    )
+                )
+        else:  # dense MLP
+            res.extend(
+                WeightInfo(name=f"{prefix}.mlp.{name}")
+                for name in (
+                    "gate_proj.weight",
+                    "up_proj.weight",
+                    "down_proj.weight",
+                )
+            )
+
+        res.append(WeightInfo(name=f"{prefix}.post_attention_layernorm.weight"))
+        return res
+
+
+class Qwen35MtpModuleArchitecture(ModuleArchitecture, BaseModel, frozen=True):
+    is_moe: bool = False  # True -> router/expert names instead of a dense MLP
+    num_experts: Optional[int] = None  # falls back to the config's num_experts
+    num_layers_key: str = "text_config.mtp_num_hidden_layers"  # dotted config key
+
+    def pre_weights(self, config: PretrainedConfig) -> List[WeightInfo]:
+        return [  # every tensor in this module is declared optional
+            WeightInfo(name="mtp.fc.weight", optional=True),
+            WeightInfo(name="mtp.norm.weight", optional=True),
+            WeightInfo(name="mtp.pre_fc_norm_embedding.weight", optional=True),
+            WeightInfo(name="mtp.pre_fc_norm_hidden.weight", optional=True),
+        ]
+
+    def post_weights(self, config: PretrainedConfig) -> List[WeightInfo]:
+        return []
+
+    def num_layers_config_key(self) -> Optional[str]:
+        return self.num_layers_key
+
+    def num_layers(self, config: PretrainedConfig) -> int:
+        return int(
+            _cfg(
+                config,
+                self.num_layers_key,  # preferred location
+                _cfg(
+                    config,
+                    "text_config.mtp_num_hidden_layers",  # then the nested key
+                    _cfg(config, "mtp_num_hidden_layers", 0),  # then top level
+                ),
+            )
+            or 0  # a None/0 value means "no MTP layers"
+        )
+
+    def layer_weights(
+        self, index: int, config: PretrainedConfig
+    ) -> Optional[List[WeightInfo]]:
+        prefix = f"mtp.layers.{index}"
+        res = [
+            WeightInfo(name=f"{prefix}.input_layernorm.weight", optional=True),
+            WeightInfo(name=f"{prefix}.self_attn.q_proj.weight", optional=True),
+            WeightInfo(name=f"{prefix}.self_attn.k_proj.weight", optional=True),
+            WeightInfo(name=f"{prefix}.self_attn.v_proj.weight", optional=True),
+            WeightInfo(name=f"{prefix}.self_attn.o_proj.weight", optional=True),
+            WeightInfo(name=f"{prefix}.self_attn.q_norm.weight", optional=True),
+            WeightInfo(name=f"{prefix}.self_attn.k_norm.weight", optional=True),
+        ]
+
+        if self.is_moe:
+            num_experts = int(
+                self.num_experts or getattr(_text_config(config), "num_experts", 0) or 0
+            )
+            res.append(WeightInfo(name=f"{prefix}.mlp.gate.weight", optional=True))
+            for expert_idx in range(num_experts):
+                for proj in ("gate_proj", "up_proj", "down_proj"):
+                    res.append(
+                        WeightInfo(
+                            name=f"{prefix}.mlp.experts.{expert_idx}.{proj}.weight",
+                            optional=True,
+                        )
+                    )
+            if getattr(_text_config(config), "shared_expert_intermediate_size", None):
+                res.extend(
+                    WeightInfo(name=f"{prefix}.mlp.{name}", optional=True)
+                    for name in (
+                        "shared_expert.gate_proj.weight",
+                        "shared_expert.up_proj.weight",
+                        "shared_expert.down_proj.weight",
+                        "shared_expert_gate.weight",
+                    )
+                )
+        else:  # dense MLP
+            res.extend(
+                WeightInfo(name=f"{prefix}.mlp.{name}", optional=True)
+                for name in (
+                    "gate_proj.weight",
+                    "up_proj.weight",
+                    "down_proj.weight",
+                )
+            )
+
+        res.append(
+            WeightInfo(name=f"{prefix}.post_attention_layernorm.weight", optional=True)
+        )
+        return res
+
+
+class Qwen35VisionModuleArchitecture(ModuleArchitecture, BaseModel, frozen=True):
+    def pre_weights(self, config: PretrainedConfig) -> List[WeightInfo]:
+        return [  # non-layer tensors under model.visual; biases are optional
+            WeightInfo(name="model.visual.patch_embed.proj.weight", is_embed=True),
+            WeightInfo(name="model.visual.patch_embed.proj.bias", optional=True),
+            WeightInfo(name="model.visual.pos_embed.weight", is_embed=True),
+            WeightInfo(name="model.visual.merger.norm.weight"),
+            WeightInfo(name="model.visual.merger.norm.bias", optional=True),
+            WeightInfo(name="model.visual.merger.linear_fc1.weight"),
+            WeightInfo(name="model.visual.merger.linear_fc1.bias", optional=True),
+            WeightInfo(name="model.visual.merger.linear_fc2.weight"),
+            WeightInfo(name="model.visual.merger.linear_fc2.bias", optional=True),
+        ]
+
+    def post_weights(self, config: PretrainedConfig) -> List[WeightInfo]:
+        return []
+
+    def num_layers_config_key(self) -> str:
+        return "vision_config.depth"  # one "layer" per model.visual.blocks.N
+
+    def layer_weights(
+        self, index: int, config: PretrainedConfig
+    ) -> Optional[List[WeightInfo]]:
+        prefix = f"model.visual.blocks.{index}"
+        return [  # weights are required, every bias is optional
+            WeightInfo(name=f"{prefix}.norm1.weight"),
+            WeightInfo(name=f"{prefix}.norm1.bias", optional=True),
+            WeightInfo(name=f"{prefix}.norm2.weight"),
+            WeightInfo(name=f"{prefix}.norm2.bias", optional=True),
+            WeightInfo(name=f"{prefix}.attn.qkv.weight"),
+            WeightInfo(name=f"{prefix}.attn.qkv.bias", optional=True),
+            WeightInfo(name=f"{prefix}.attn.proj.weight"),
+            WeightInfo(name=f"{prefix}.attn.proj.bias", optional=True),
+            WeightInfo(name=f"{prefix}.mlp.linear_fc1.weight"),
+            WeightInfo(name=f"{prefix}.mlp.linear_fc1.bias", optional=True),
+            WeightInfo(name=f"{prefix}.mlp.linear_fc2.weight"),
+            WeightInfo(name=f"{prefix}.mlp.linear_fc2.bias", optional=True),
+        ]
+
+
+def qwen35_architecture_for_config(config: PretrainedConfig) -> ModelArchitecture:
+    arch_name = (  # first declared class name, or "" when none is set
+        config.architectures[0] if getattr(config, "architectures", None) else ""
+    )
+    is_moe = arch_name in QWEN35_MOE_ARCHITECTURES or config.model_type in {
+        "qwen3_5_moe",
+        "qwen3_5_moe_text",
+    }
+    num_experts = (  # only meaningful for MoE; dense checkpoints get None
+        int(getattr(_text_config(config), "num_experts", 0) or 0) if is_moe else None
+    )
+    is_multimodal_wrapper = arch_name.endswith("ForConditionalGeneration") and hasattr(
+        config, "vision_config"  # wrapper = class-name suffix + vision config
+    )
+    root = "model.language_model" if is_multimodal_wrapper else "model"
+    mtp_num_layers_key = (
+        "text_config.mtp_num_hidden_layers"  # wrappers nest text settings
+        if is_multimodal_wrapper
+        else "mtp_num_hidden_layers"
+    )
+
+    modules = {
+        "text_decoder" if is_multimodal_wrapper else "default": ModuleDefinition(
+            architecture=Qwen35LanguageModuleArchitecture(
+                root=root,
+                is_moe=is_moe,
+                num_experts=num_experts,
+            )
+        )
+    }
+    if is_multimodal_wrapper:
+        modules["vision_tower"] = ModuleDefinition(
+            architecture=Qwen35VisionModuleArchitecture()
+        )
+        if _cfg(config, "text_config.mtp_num_hidden_layers", 0):
+            modules["mtp"] = ModuleDefinition(
+                architecture=Qwen35MtpModuleArchitecture(
+                    is_moe=is_moe,
+                    num_experts=num_experts,
+                    num_layers_key=mtp_num_layers_key,
+                )
+            )
+    else:
+        if _cfg(config, "mtp_num_hidden_layers", 0):
+            modules["mtp"] = ModuleDefinition(
+                architecture=Qwen35MtpModuleArchitecture(
+                    is_moe=is_moe,
+                    num_experts=num_experts,
+                    num_layers_key=mtp_num_layers_key,
+                )
+            )
+
+    return ModelArchitecture(
+        modules=modules,
+        architectures=[arch_name] if arch_name else [],
+        model_type=config.model_type,
+        tagalong_files=(  # only set for multimodal wrappers
+            [
+                "preprocessor_config.json",
+                "video_preprocessor_config.json",
+                "vocab.json",
+            ]
+            if is_multimodal_wrapper
+            else None
+        ),
+        vocab_size_config_key=(
+            "text_config.vocab_size" if is_multimodal_wrapper else "vocab_size"
+        ),
+    )
diff --git a/tests/test_qwen35_architecture.py b/tests/test_qwen35_architecture.py
new file mode 100644
index 00000000..96d4520c
--- /dev/null
+++ b/tests/test_qwen35_architecture.py
@@ -0,0 +1,227 @@
+import tempfile
+
+import pytest
+
+from mergekit.architecture import arch_info_for_config
+from mergekit.common import set_config_value
+from mergekit.config import InputModelDefinition, MergeConfiguration
+from tests.common import run_and_check_merge
+
+qwen35 = pytest.importorskip("transformers.models.qwen3_5")
+qwen35_moe = pytest.importorskip("transformers.models.qwen3_5_moe")
+
+from transformers.models.qwen3_5.configuration_qwen3_5 import (  # noqa: E402
+    Qwen3_5Config,
+    Qwen3_5TextConfig,
+    Qwen3_5VisionConfig,
+)
+from transformers.models.qwen3_5.modeling_qwen3_5 import (  # noqa: E402
+    Qwen3_5ForCausalLM,
+    Qwen3_5ForConditionalGeneration,
+)
+from transformers.models.qwen3_5_moe.configuration_qwen3_5_moe import (  # noqa: E402
+    Qwen3_5MoeConfig,
+    Qwen3_5MoeTextConfig,
+    Qwen3_5MoeVisionConfig,
+)
+from transformers.models.qwen3_5_moe.modeling_qwen3_5_moe import (  # noqa: E402
+    Qwen3_5MoeForCausalLM,
+    Qwen3_5MoeForConditionalGeneration,
+)
+
+
+def _dense_config():  # tiny dense multimodal-wrapper config for fast tests
+    text = Qwen3_5TextConfig(
+        vocab_size=64,
+        hidden_size=32,
+        intermediate_size=48,
+        num_hidden_layers=4,  # one entry per layer_types item below
+        num_attention_heads=4,
+        num_key_value_heads=2,
+        head_dim=8,
+        linear_key_head_dim=8,
+        linear_value_head_dim=8,
+        linear_num_key_heads=4,
+        linear_num_value_heads=4,
+        layer_types=[
+            "linear_attention",
+            "linear_attention",
+            "linear_attention",
+            "full_attention",
+        ],
+        mtp_num_hidden_layers=1,  # non-zero so the "mtp" module is registered
+        mtp_use_dedicated_embeddings=False,
+        tie_word_embeddings=True,
+    )
+    vision = Qwen3_5VisionConfig(
+        depth=2,
+        hidden_size=16,
+        intermediate_size=32,
+        num_heads=2,
+        out_hidden_size=32,
+        num_position_embeddings=16,
+    )
+    return Qwen3_5Config(
+        architectures=["Qwen3_5ForConditionalGeneration"],
+        text_config=text,
+        vision_config=vision,
+        tie_word_embeddings=True,
+    )
+
+
+def _moe_config():  # tiny MoE multimodal-wrapper config for fast tests
+    text = Qwen3_5MoeTextConfig(
+        vocab_size=64,
+        hidden_size=32,
+        num_hidden_layers=4,  # one entry per layer_types item below
+        num_attention_heads=4,
+        num_key_value_heads=2,
+        head_dim=8,
+        linear_key_head_dim=8,
+        linear_value_head_dim=8,
+        linear_num_key_heads=4,
+        linear_num_value_heads=4,
+        moe_intermediate_size=8,
+        shared_expert_intermediate_size=8,  # truthy -> shared-expert names expected
+        num_experts=4,
+        num_experts_per_tok=2,
+        layer_types=[
+            "linear_attention",
+            "linear_attention",
+            "linear_attention",
+            "full_attention",
+        ],
+        mtp_num_hidden_layers=1,  # non-zero so the "mtp" module is registered
+        mtp_use_dedicated_embeddings=False,
+        tie_word_embeddings=True,
+    )
+    vision = Qwen3_5MoeVisionConfig(
+        depth=2,
+        hidden_size=16,
+        intermediate_size=32,
+        num_heads=2,
+        out_hidden_size=32,
+        num_position_embeddings=16,
+    )
+    return Qwen3_5MoeConfig(
+        architectures=["Qwen3_5MoeForConditionalGeneration"],
+        text_config=text,
+        vision_config=vision,
+        tie_word_embeddings=True,
+    )
+
+
+def _dense_text_config():
+    config = _dense_config().text_config  # keep only the nested text config
+    config.architectures = ["Qwen3_5ForCausalLM"]  # text-only class name
+    return config
+
+
+def _moe_text_config():
+    config = _moe_config().text_config  # keep only the nested text config
+    config.architectures = ["Qwen3_5MoeForCausalLM"]  # text-only class name
+    return config
+
+
+def _save_model(model_cls, config, path):
+    model = model_cls(config)  # freshly initialised weights, no download
+    model.save_pretrained(path, safe_serialization=True)
+    return str(path), set(model.state_dict().keys())  # (saved path, tensor names)
+
+
+def _arch_names(config):  # names of all weights the architecture declares
+    arch = arch_info_for_config(config)
+    return {weight.name for weight in arch.all_weights(config)}
+
+
+def test_qwen35_dense_architecture_covers_transformers_keys():
+    config = _dense_config()
+    model = Qwen3_5ForConditionalGeneration(config)
+    state_keys = set(model.state_dict().keys())
+    arch_keys = _arch_names(config)
+
+    assert state_keys <= arch_keys  # every real tensor must be declared
+    assert "model.language_model.layers.0.linear_attn.in_proj_qkv.weight" in arch_keys
+    assert "model.language_model.layers.3.self_attn.q_proj.weight" in arch_keys
+    assert "mtp.layers.0.mlp.gate_proj.weight" in arch_keys
+    assert "mtp.fc.weight" in arch_keys
+
+
+@pytest.mark.parametrize(
+    ("config", "model_cls"),
+    [
+        (_dense_config(), Qwen3_5ForConditionalGeneration),
+        (_moe_config(), Qwen3_5MoeForConditionalGeneration),
+    ],
+)
+def test_qwen35_full_attention_bias_covers_output_projection_bias(
+    config, model_cls
+):
+    config.text_config.attention_bias = True
+    model = model_cls(config)
+    state_keys = set(model.state_dict().keys())
+    arch_keys = _arch_names(config)
+
+    assert "model.language_model.layers.3.self_attn.o_proj.bias" in state_keys
+    assert state_keys <= arch_keys  # bias tensors must be declared too
+
+
+def test_qwen35_moe_architecture_covers_transformers_keys_and_mtp_experts():
+    config = _moe_config()
+    model = Qwen3_5MoeForConditionalGeneration(config)
+    state_keys = set(model.state_dict().keys())
+    arch_keys = _arch_names(config)
+
+    assert state_keys <= arch_keys  # every real tensor must be declared
+    assert "model.language_model.layers.0.mlp.experts.gate_up_proj" in arch_keys
+    assert "model.language_model.layers.0.mlp.shared_expert_gate.weight" in arch_keys
+    assert "model.language_model.layers.3.self_attn.q_proj.weight" in arch_keys
+    assert "mtp.layers.0.mlp.experts.3.down_proj.weight" in arch_keys
+    assert "mtp.layers.0.mlp.shared_expert_gate.weight" in arch_keys
+
+
+@pytest.mark.parametrize(
+    ("config", "model_cls"),
+    [
+        (_dense_text_config(), Qwen3_5ForCausalLM),
+        (_moe_text_config(), Qwen3_5MoeForCausalLM),
+    ],
+)
+def test_qwen35_text_only_architecture_uses_top_level_mtp_config_key(
+    config, model_cls
+):
+    model = model_cls(config)
+    arch = arch_info_for_config(config)
+    arch_keys = {weight.name for weight in arch.all_weights(config)}
+    mtp_num_layers_key = arch.modules["mtp"].architecture.num_layers_config_key()
+
+    assert set(model.state_dict().keys()) <= arch_keys
+    assert mtp_num_layers_key == "mtp_num_hidden_layers"  # no "text_config." prefix
+    set_config_value(config, mtp_num_layers_key, 0)  # reported key must be writable
+    assert config.mtp_num_hidden_layers == 0
+
+
+def test_qwen35_dense_passthrough_merge():
+    with tempfile.TemporaryDirectory() as a:  # checkpoint lives only for this test
+        model_a, _ = _save_model(Qwen3_5ForConditionalGeneration, _dense_config(), a)
+        config = MergeConfiguration(
+            merge_method="passthrough",
+            models=[InputModelDefinition(model=model_a)],
+            dtype="bfloat16",
+        )
+        run_and_check_merge(config)
+
+
+def test_qwen35_moe_linear_merge():
+    with tempfile.TemporaryDirectory() as a, tempfile.TemporaryDirectory() as b:
+        model_a, _ = _save_model(Qwen3_5MoeForConditionalGeneration, _moe_config(), a)
+        model_b, _ = _save_model(Qwen3_5MoeForConditionalGeneration, _moe_config(), b)
+        config = MergeConfiguration(
+            merge_method="linear",
+            models=[  # two separately initialised checkpoints, equal weights
+                InputModelDefinition(model=model_a, parameters={"weight": 0.5}),
+                InputModelDefinition(model=model_b, parameters={"weight": 0.5}),
+            ],
+            dtype="bfloat16",
+        )
+        run_and_check_merge(config)

From 7862bb7aacb6b866965a112403f08c69ace289f0 Mon Sep 17 00:00:00 2001
From: ZhangYiqun018 <20732979+ZhangYiqun018@users.noreply.github.com>
Date: Thu, 7 May 2026 15:24:14 +0800
Subject: [PATCH 2/4] Deduplicate Qwen3.5 MTP module registration

---
 mergekit/architecture/qwen35.py | 24 ++++++++----------------
 1 file changed, 8 insertions(+), 16 deletions(-)

diff --git a/mergekit/architecture/qwen35.py b/mergekit/architecture/qwen35.py
index a3952537..0f6c5975 100644
--- a/mergekit/architecture/qwen35.py
+++ b/mergekit/architecture/qwen35.py
@@ -322,23 +322,15 @@ def qwen35_architecture_for_config(config: PretrainedConfig) -> ModelArchitectur
         modules["vision_tower"] = ModuleDefinition(
             architecture=Qwen35VisionModuleArchitecture()
         )
-        if _cfg(config, "text_config.mtp_num_hidden_layers", 0):
-            modules["mtp"] = ModuleDefinition(
-                architecture=Qwen35MtpModuleArchitecture(
-                    is_moe=is_moe,
-                    num_experts=num_experts,
-                    num_layers_key=mtp_num_layers_key,
-                )
-            )
-    else:
-        if _cfg(config, "mtp_num_hidden_layers", 0):
-            modules["mtp"] = ModuleDefinition(
-                architecture=Qwen35MtpModuleArchitecture(
-                    is_moe=is_moe,
-                    num_experts=num_experts,
-                    num_layers_key=mtp_num_layers_key,
-                )
+
+    if _cfg(config, mtp_num_layers_key, 0):
+        modules["mtp"] = ModuleDefinition(
+            architecture=Qwen35MtpModuleArchitecture(
+                is_moe=is_moe,
+                num_experts=num_experts,
+                num_layers_key=mtp_num_layers_key,
             )
+        )
 
     return ModelArchitecture(
         modules=modules,

From 90c20dafed0fbf40aeb7b13db309e69df71a2f5a Mon Sep 17 00:00:00 2001
From: ZhangYiqun018 <20732979+ZhangYiqun018@users.noreply.github.com>
Date: Thu, 7 May 2026 15:35:53 +0800
Subject: [PATCH 3/4] Apply pre-commit formatting

---
 mergekit/architecture/qwen35.py   | 1 -
 tests/test_qwen35_architecture.py | 8 ++------
 2 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/mergekit/architecture/qwen35.py b/mergekit/architecture/qwen35.py
index 0f6c5975..5a412bff 100644
--- a/mergekit/architecture/qwen35.py
+++ b/mergekit/architecture/qwen35.py
@@ -14,7 +14,6 @@
 )
 from mergekit.common import get_config_value
 
-
 QWEN35_DENSE_ARCHITECTURES = {
     "Qwen3_5ForConditionalGeneration",
     "Qwen3_5ForCausalLM",
diff --git a/tests/test_qwen35_architecture.py b/tests/test_qwen35_architecture.py
index 96d4520c..60186cd8 100644
--- a/tests/test_qwen35_architecture.py
+++ b/tests/test_qwen35_architecture.py
@@ -154,9 +154,7 @@ def test_qwen35_dense_architecture_covers_transformers_keys():
         (_moe_config(), Qwen3_5MoeForConditionalGeneration),
     ],
 )
-def test_qwen35_full_attention_bias_covers_output_projection_bias(
-    config, model_cls
-):
+def test_qwen35_full_attention_bias_covers_output_projection_bias(config, model_cls):
     config.text_config.attention_bias = True
     model = model_cls(config)
     state_keys = set(model.state_dict().keys())
@@ -187,9 +185,7 @@ def test_qwen35_moe_architecture_covers_transformers_keys_and_mtp_experts():
         (_moe_text_config(), Qwen3_5MoeForCausalLM),
     ],
 )
-def test_qwen35_text_only_architecture_uses_top_level_mtp_config_key(
-    config, model_cls
-):
+def test_qwen35_text_only_architecture_uses_top_level_mtp_config_key(config, model_cls):
     model = model_cls(config)
     arch = arch_info_for_config(config)
     arch_keys = {weight.name for weight in arch.all_weights(config)}

From a812d01a66ec3eab6aa6cadc193e507051f89807 Mon Sep 17 00:00:00 2001
From: ZhangYiqun018 <20732979+ZhangYiqun018@users.noreply.github.com>
Date: Thu, 7 May 2026 18:59:24 +0800
Subject: [PATCH 4/4] Support packed Qwen3.5 MTP experts

---
 mergekit/architecture/qwen35.py   | 7 +++++++
 tests/test_qwen35_architecture.py | 2 ++
 2 files changed, 9 insertions(+)

diff --git a/mergekit/architecture/qwen35.py b/mergekit/architecture/qwen35.py
index 5a412bff..edde4f90 100644
--- a/mergekit/architecture/qwen35.py
+++ b/mergekit/architecture/qwen35.py
@@ -213,6 +213,13 @@ def layer_weights(
                 self.num_experts or getattr(_text_config(config), "num_experts", 0) or 0
             )
             res.append(WeightInfo(name=f"{prefix}.mlp.gate.weight", optional=True))
+            res.extend(
+                WeightInfo(name=f"{prefix}.mlp.{name}", optional=True)
+                for name in (
+                    "experts.gate_up_proj",
+                    "experts.down_proj",
+                )
+            )
             for expert_idx in range(num_experts):
                 for proj in ("gate_proj", "up_proj", "down_proj"):
                     res.append(
diff --git a/tests/test_qwen35_architecture.py b/tests/test_qwen35_architecture.py
index 60186cd8..2f43941f 100644
--- a/tests/test_qwen35_architecture.py
+++ b/tests/test_qwen35_architecture.py
@@ -174,6 +174,8 @@ def test_qwen35_moe_architecture_covers_transformers_keys_and_mtp_experts():
     assert "model.language_model.layers.0.mlp.experts.gate_up_proj" in arch_keys
     assert "model.language_model.layers.0.mlp.shared_expert_gate.weight" in arch_keys
     assert "model.language_model.layers.3.self_attn.q_proj.weight" in arch_keys
+    assert "mtp.layers.0.mlp.experts.gate_up_proj" in arch_keys
+    assert "mtp.layers.0.mlp.experts.down_proj" in arch_keys
     assert "mtp.layers.0.mlp.experts.3.down_proj.weight" in arch_keys
     assert "mtp.layers.0.mlp.shared_expert_gate.weight" in arch_keys