From 69f901f5f11e54c228ddedc85d38fd8b26836d1f Mon Sep 17 00:00:00 2001 From: ZhangYiqun018 <20732979+ZhangYiqun018@users.noreply.github.com> Date: Thu, 7 May 2026 15:12:51 +0800 Subject: [PATCH 1/4] Add Qwen3.5 architecture support --- mergekit/architecture/__init__.py | 6 + mergekit/architecture/qwen35.py | 359 ++++++++++++++++++++++++++++++ tests/test_qwen35_architecture.py | 227 +++++++++++++++++++ 3 files changed, 592 insertions(+) create mode 100644 mergekit/architecture/qwen35.py create mode 100644 tests/test_qwen35_architecture.py diff --git a/mergekit/architecture/__init__.py b/mergekit/architecture/__init__.py index c731e58b..1b8de7cb 100644 --- a/mergekit/architecture/__init__.py +++ b/mergekit/architecture/__init__.py @@ -23,6 +23,10 @@ MixtralModuleArchitecture, Qwen3MoeModuleArchitecture, ) +from mergekit.architecture.qwen35 import ( + QWEN35_ARCHITECTURES, + qwen35_architecture_for_config, +) from mergekit.options import MergeOptions if TYPE_CHECKING: @@ -66,6 +70,8 @@ def arch_info_for_config(config: PretrainedConfig) -> Optional[ModelArchitecture architectures=[arch_name], model_type="glm4_moe", ) + elif arch_name in QWEN35_ARCHITECTURES: + return qwen35_architecture_for_config(config) elif arch_name in NAME_TO_ARCH: candidates = list(NAME_TO_ARCH[arch_name]) if len(candidates) == 1: diff --git a/mergekit/architecture/qwen35.py b/mergekit/architecture/qwen35.py new file mode 100644 index 00000000..a3952537 --- /dev/null +++ b/mergekit/architecture/qwen35.py @@ -0,0 +1,359 @@ +# Copyright (C) 2025 Arcee AI +# SPDX-License-Identifier: LGPL-3.0-only + +from typing import List, Optional + +from pydantic import BaseModel +from transformers import PretrainedConfig + +from mergekit.architecture.base import ( + ModelArchitecture, + ModuleArchitecture, + ModuleDefinition, + WeightInfo, +) +from mergekit.common import get_config_value + + +QWEN35_DENSE_ARCHITECTURES = { + "Qwen3_5ForConditionalGeneration", + "Qwen3_5ForCausalLM", +} +QWEN35_MOE_ARCHITECTURES = { + "Qwen3_5MoeForConditionalGeneration", + "Qwen3_5MoeForCausalLM", +} +QWEN35_ARCHITECTURES = QWEN35_DENSE_ARCHITECTURES | QWEN35_MOE_ARCHITECTURES + + +def _text_config(config: PretrainedConfig): + return getattr(config, "text_config", config) + + +def _cfg(config: PretrainedConfig, key: str, default=None): + try: + return get_config_value(config, key) + except Exception: + return default + + +def _is_full_attention(config: PretrainedConfig, index: int) -> bool: + layer_types = getattr(_text_config(config), "layer_types", None) + if layer_types and index < len(layer_types): + return layer_types[index] == "full_attention" + # Qwen3.5 defaults to three linear-attention layers followed by one full-attention layer. + return index % 4 == 3 + + +class Qwen35LanguageModuleArchitecture(ModuleArchitecture, BaseModel, frozen=True): + """Text decoder for Qwen3.5 dense and MoE checkpoints. + + Official Qwen3.5 repos are image-text-to-text wrappers whose language weights live + under ``model.language_model``. Text-only exports use the usual ``model`` prefix. + """ + + root: str + is_moe: bool = False + num_experts: Optional[int] = None + + def pre_weights(self, config: PretrainedConfig) -> List[WeightInfo]: + return [WeightInfo(name=f"{self.root}.embed_tokens.weight", is_embed=True)] + + def post_weights(self, config: PretrainedConfig) -> List[WeightInfo]: + return [ + WeightInfo(name=f"{self.root}.norm.weight"), + WeightInfo( + name="lm_head.weight", + is_embed=True, + optional=True, + tied_names=(f"{self.root}.embed_tokens.weight",), + ), + ] + + def num_layers_config_key(self) -> str: + return ( + "text_config.num_hidden_layers" + if self.root == "model.language_model" + else "num_hidden_layers" + ) + + def layer_weights( + self, index: int, config: PretrainedConfig + ) -> Optional[List[WeightInfo]]: + prefix = f"{self.root}.layers.{index}" + res = [WeightInfo(name=f"{prefix}.input_layernorm.weight")] + + if _is_full_attention(config, index): + res.extend( + WeightInfo(name=f"{prefix}.self_attn.{name}") + for name in ( + "q_proj.weight", + "k_proj.weight", + "v_proj.weight", + "o_proj.weight", + "q_norm.weight", + "k_norm.weight", + ) + ) + if getattr(_text_config(config), "attention_bias", False): + res.extend( + WeightInfo(name=f"{prefix}.self_attn.{name}", optional=True) + for name in ( + "q_proj.bias", + "k_proj.bias", + "v_proj.bias", + "o_proj.bias", + ) + ) + else: + res.extend( + WeightInfo(name=f"{prefix}.linear_attn.{name}") + for name in ( + "dt_bias", + "A_log", + "conv1d.weight", + "norm.weight", + "out_proj.weight", + "in_proj_qkv.weight", + "in_proj_z.weight", + "in_proj_b.weight", + "in_proj_a.weight", + ) + ) + + if self.is_moe: + res.append(WeightInfo(name=f"{prefix}.mlp.gate.weight")) + res.extend( + WeightInfo(name=f"{prefix}.mlp.{name}", optional=True) + for name in ( + "experts.gate_up_proj", + "experts.down_proj", + ) + ) + for expert_idx in range(self.num_experts or 0): + for proj in ("gate_proj", "up_proj", "down_proj"): + res.append( + WeightInfo( + name=f"{prefix}.mlp.experts.{expert_idx}.{proj}.weight", + optional=True, + ) + ) + if getattr(_text_config(config), "shared_expert_intermediate_size", None): + res.extend( + WeightInfo(name=f"{prefix}.mlp.{name}") + for name in ( + "shared_expert.gate_proj.weight", + "shared_expert.up_proj.weight", + "shared_expert.down_proj.weight", + "shared_expert_gate.weight", + ) + ) + else: + res.extend( + WeightInfo(name=f"{prefix}.mlp.{name}") + for name in ( + "gate_proj.weight", + "up_proj.weight", + "down_proj.weight", + ) + ) + + res.append(WeightInfo(name=f"{prefix}.post_attention_layernorm.weight")) + return res + + +class Qwen35MtpModuleArchitecture(ModuleArchitecture, BaseModel, frozen=True): + is_moe: bool = False + num_experts: Optional[int] = None + num_layers_key: str = "text_config.mtp_num_hidden_layers" + + def pre_weights(self, config: PretrainedConfig) -> List[WeightInfo]: + return [ + WeightInfo(name="mtp.fc.weight", optional=True), + WeightInfo(name="mtp.norm.weight", optional=True), + WeightInfo(name="mtp.pre_fc_norm_embedding.weight", optional=True), + WeightInfo(name="mtp.pre_fc_norm_hidden.weight", optional=True), + ] + + def post_weights(self, config: PretrainedConfig) -> List[WeightInfo]: + return [] + + def num_layers_config_key(self) -> Optional[str]: + return self.num_layers_key + + def num_layers(self, config: PretrainedConfig) -> int: + return int( + _cfg( + config, + self.num_layers_key, + _cfg( + config, + "text_config.mtp_num_hidden_layers", + _cfg(config, "mtp_num_hidden_layers", 0), + ), + ) + or 0 + ) + + def layer_weights( + self, index: int, config: PretrainedConfig + ) -> Optional[List[WeightInfo]]: + prefix = f"mtp.layers.{index}" + res = [ + WeightInfo(name=f"{prefix}.input_layernorm.weight", optional=True), + WeightInfo(name=f"{prefix}.self_attn.q_proj.weight", optional=True), + WeightInfo(name=f"{prefix}.self_attn.k_proj.weight", optional=True), + WeightInfo(name=f"{prefix}.self_attn.v_proj.weight", optional=True), + WeightInfo(name=f"{prefix}.self_attn.o_proj.weight", optional=True), + WeightInfo(name=f"{prefix}.self_attn.q_norm.weight", optional=True), + WeightInfo(name=f"{prefix}.self_attn.k_norm.weight", optional=True), + ] + + if self.is_moe: + num_experts = int( + self.num_experts or getattr(_text_config(config), "num_experts", 0) or 0 + ) + res.append(WeightInfo(name=f"{prefix}.mlp.gate.weight", optional=True)) + for expert_idx in range(num_experts): + for proj in ("gate_proj", "up_proj", "down_proj"): + res.append( + WeightInfo( + name=f"{prefix}.mlp.experts.{expert_idx}.{proj}.weight", + optional=True, + ) + ) + if getattr(_text_config(config), "shared_expert_intermediate_size", None): + res.extend( + WeightInfo(name=f"{prefix}.mlp.{name}", optional=True) + for name in ( + "shared_expert.gate_proj.weight", + "shared_expert.up_proj.weight", + "shared_expert.down_proj.weight", + "shared_expert_gate.weight", + ) + ) + else: + res.extend( + WeightInfo(name=f"{prefix}.mlp.{name}", optional=True) + for name in ( + "gate_proj.weight", + "up_proj.weight", + "down_proj.weight", + ) + ) + + res.append( + WeightInfo(name=f"{prefix}.post_attention_layernorm.weight", optional=True) + ) + return res + + +class Qwen35VisionModuleArchitecture(ModuleArchitecture, BaseModel, frozen=True): + def pre_weights(self, config: PretrainedConfig) -> List[WeightInfo]: + return [ + WeightInfo(name="model.visual.patch_embed.proj.weight", is_embed=True), + WeightInfo(name="model.visual.patch_embed.proj.bias", optional=True), + WeightInfo(name="model.visual.pos_embed.weight", is_embed=True), + WeightInfo(name="model.visual.merger.norm.weight"), + WeightInfo(name="model.visual.merger.norm.bias", optional=True), + WeightInfo(name="model.visual.merger.linear_fc1.weight"), + WeightInfo(name="model.visual.merger.linear_fc1.bias", optional=True), + WeightInfo(name="model.visual.merger.linear_fc2.weight"), + WeightInfo(name="model.visual.merger.linear_fc2.bias", optional=True), + ] + + def post_weights(self, config: PretrainedConfig) -> List[WeightInfo]: + return [] + + def num_layers_config_key(self) -> str: + return "vision_config.depth" + + def layer_weights( + self, index: int, config: PretrainedConfig + ) -> Optional[List[WeightInfo]]: + prefix = f"model.visual.blocks.{index}" + return [ + WeightInfo(name=f"{prefix}.norm1.weight"), + WeightInfo(name=f"{prefix}.norm1.bias", optional=True), + WeightInfo(name=f"{prefix}.norm2.weight"), + WeightInfo(name=f"{prefix}.norm2.bias", optional=True), + WeightInfo(name=f"{prefix}.attn.qkv.weight"), + WeightInfo(name=f"{prefix}.attn.qkv.bias", optional=True), + WeightInfo(name=f"{prefix}.attn.proj.weight"), + WeightInfo(name=f"{prefix}.attn.proj.bias", optional=True), + WeightInfo(name=f"{prefix}.mlp.linear_fc1.weight"), + WeightInfo(name=f"{prefix}.mlp.linear_fc1.bias", optional=True), + WeightInfo(name=f"{prefix}.mlp.linear_fc2.weight"), + WeightInfo(name=f"{prefix}.mlp.linear_fc2.bias", optional=True), + ] + + +def qwen35_architecture_for_config(config: PretrainedConfig) -> ModelArchitecture: + arch_name = ( + config.architectures[0] if getattr(config, "architectures", None) else "" + ) + is_moe = arch_name in QWEN35_MOE_ARCHITECTURES or config.model_type in { + "qwen3_5_moe", + "qwen3_5_moe_text", + } + num_experts = ( + int(getattr(_text_config(config), "num_experts", 0) or 0) if is_moe else None + ) + is_multimodal_wrapper = arch_name.endswith("ForConditionalGeneration") and hasattr( + config, "vision_config" + ) + root = "model.language_model" if is_multimodal_wrapper else "model" + mtp_num_layers_key = ( + "text_config.mtp_num_hidden_layers" + if is_multimodal_wrapper + else "mtp_num_hidden_layers" + ) + + modules = { + "text_decoder" if is_multimodal_wrapper else "default": ModuleDefinition( + architecture=Qwen35LanguageModuleArchitecture( + root=root, + is_moe=is_moe, + num_experts=num_experts, + ) + ) + } + if is_multimodal_wrapper: + modules["vision_tower"] = ModuleDefinition( + architecture=Qwen35VisionModuleArchitecture() + ) + if _cfg(config, "text_config.mtp_num_hidden_layers", 0): + modules["mtp"] = ModuleDefinition( + architecture=Qwen35MtpModuleArchitecture( + is_moe=is_moe, + num_experts=num_experts, + num_layers_key=mtp_num_layers_key, + ) + ) + else: + if _cfg(config, "mtp_num_hidden_layers", 0): + modules["mtp"] = ModuleDefinition( + architecture=Qwen35MtpModuleArchitecture( + is_moe=is_moe, + num_experts=num_experts, + num_layers_key=mtp_num_layers_key, + ) + ) + + return ModelArchitecture( + modules=modules, + architectures=[arch_name] if arch_name else [], + model_type=config.model_type, + tagalong_files=( + [ + "preprocessor_config.json", + "video_preprocessor_config.json", + "vocab.json", + ] + if is_multimodal_wrapper + else None + ), + vocab_size_config_key=( + "text_config.vocab_size" if is_multimodal_wrapper else "vocab_size" + ), + ) diff --git a/tests/test_qwen35_architecture.py b/tests/test_qwen35_architecture.py new file mode 100644 index 00000000..96d4520c --- /dev/null +++ b/tests/test_qwen35_architecture.py @@ -0,0 +1,227 @@ +import tempfile + +import pytest + +from mergekit.architecture import arch_info_for_config +from mergekit.common import set_config_value +from mergekit.config import InputModelDefinition, MergeConfiguration +from tests.common import run_and_check_merge + +qwen35 = pytest.importorskip("transformers.models.qwen3_5") +qwen35_moe = pytest.importorskip("transformers.models.qwen3_5_moe") + +from transformers.models.qwen3_5.configuration_qwen3_5 import ( # noqa: E402 + Qwen3_5Config, + Qwen3_5TextConfig, + Qwen3_5VisionConfig, +) +from transformers.models.qwen3_5.modeling_qwen3_5 import ( # noqa: E402 + Qwen3_5ForCausalLM, + Qwen3_5ForConditionalGeneration, +) +from transformers.models.qwen3_5_moe.configuration_qwen3_5_moe import ( # noqa: E402 + Qwen3_5MoeConfig, + Qwen3_5MoeTextConfig, + Qwen3_5MoeVisionConfig, +) +from transformers.models.qwen3_5_moe.modeling_qwen3_5_moe import ( # noqa: E402 + Qwen3_5MoeForCausalLM, + Qwen3_5MoeForConditionalGeneration, +) + + +def _dense_config(): + text = Qwen3_5TextConfig( + vocab_size=64, + hidden_size=32, + intermediate_size=48, + num_hidden_layers=4, + num_attention_heads=4, + num_key_value_heads=2, + head_dim=8, + linear_key_head_dim=8, + linear_value_head_dim=8, + linear_num_key_heads=4, + linear_num_value_heads=4, + layer_types=[ + "linear_attention", + "linear_attention", + "linear_attention", + "full_attention", + ], + mtp_num_hidden_layers=1, + mtp_use_dedicated_embeddings=False, + tie_word_embeddings=True, + ) + vision = Qwen3_5VisionConfig( + depth=2, + hidden_size=16, + intermediate_size=32, + num_heads=2, + out_hidden_size=32, + num_position_embeddings=16, + ) + return Qwen3_5Config( + architectures=["Qwen3_5ForConditionalGeneration"], + text_config=text, + vision_config=vision, + tie_word_embeddings=True, + ) + + +def _moe_config(): + text = Qwen3_5MoeTextConfig( + vocab_size=64, + hidden_size=32, + num_hidden_layers=4, + num_attention_heads=4, + num_key_value_heads=2, + head_dim=8, + linear_key_head_dim=8, + linear_value_head_dim=8, + linear_num_key_heads=4, + linear_num_value_heads=4, + moe_intermediate_size=8, + shared_expert_intermediate_size=8, + num_experts=4, + num_experts_per_tok=2, + layer_types=[ + "linear_attention", + "linear_attention", + "linear_attention", + "full_attention", + ], + mtp_num_hidden_layers=1, + mtp_use_dedicated_embeddings=False, + tie_word_embeddings=True, + ) + vision = Qwen3_5MoeVisionConfig( + depth=2, + hidden_size=16, + intermediate_size=32, + num_heads=2, + out_hidden_size=32, + num_position_embeddings=16, + ) + return Qwen3_5MoeConfig( + architectures=["Qwen3_5MoeForConditionalGeneration"], + text_config=text, + vision_config=vision, + tie_word_embeddings=True, + ) + + +def _dense_text_config(): + config = _dense_config().text_config + config.architectures = ["Qwen3_5ForCausalLM"] + return config + + +def _moe_text_config(): + config = _moe_config().text_config + config.architectures = ["Qwen3_5MoeForCausalLM"] + return config + + +def _save_model(model_cls, config, path): + model = model_cls(config) + model.save_pretrained(path, safe_serialization=True) + return str(path), set(model.state_dict().keys()) + + +def _arch_names(config): + arch = arch_info_for_config(config) + return {weight.name for weight in arch.all_weights(config)} + + +def test_qwen35_dense_architecture_covers_transformers_keys(): + config = _dense_config() + model = Qwen3_5ForConditionalGeneration(config) + state_keys = set(model.state_dict().keys()) + arch_keys = _arch_names(config) + + assert state_keys <= arch_keys + assert "model.language_model.layers.0.linear_attn.in_proj_qkv.weight" in arch_keys + assert "model.language_model.layers.3.self_attn.q_proj.weight" in arch_keys + assert "mtp.layers.0.mlp.gate_proj.weight" in arch_keys + assert "mtp.fc.weight" in arch_keys + + +@pytest.mark.parametrize( + ("config", "model_cls"), + [ + (_dense_config(), Qwen3_5ForConditionalGeneration), + (_moe_config(), Qwen3_5MoeForConditionalGeneration), + ], +) +def test_qwen35_full_attention_bias_covers_output_projection_bias( + config, model_cls +): + config.text_config.attention_bias = True + model = model_cls(config) + state_keys = set(model.state_dict().keys()) + arch_keys = _arch_names(config) + + assert "model.language_model.layers.3.self_attn.o_proj.bias" in state_keys + assert state_keys <= arch_keys + + +def test_qwen35_moe_architecture_covers_transformers_keys_and_mtp_experts(): + config = _moe_config() + model = Qwen3_5MoeForConditionalGeneration(config) + state_keys = set(model.state_dict().keys()) + arch_keys = _arch_names(config) + + assert state_keys <= arch_keys + assert "model.language_model.layers.0.mlp.experts.gate_up_proj" in arch_keys + assert "model.language_model.layers.0.mlp.shared_expert_gate.weight" in arch_keys + assert "model.language_model.layers.3.self_attn.q_proj.weight" in arch_keys + assert "mtp.layers.0.mlp.experts.3.down_proj.weight" in arch_keys + assert "mtp.layers.0.mlp.shared_expert_gate.weight" in arch_keys + + +@pytest.mark.parametrize( + ("config", "model_cls"), + [ + (_dense_text_config(), Qwen3_5ForCausalLM), + (_moe_text_config(), Qwen3_5MoeForCausalLM), + ], +) +def test_qwen35_text_only_architecture_uses_top_level_mtp_config_key( + config, model_cls +): + model = model_cls(config) + arch = arch_info_for_config(config) + arch_keys = {weight.name for weight in arch.all_weights(config)} + mtp_num_layers_key = arch.modules["mtp"].architecture.num_layers_config_key() + + assert set(model.state_dict().keys()) <= arch_keys + assert mtp_num_layers_key == "mtp_num_hidden_layers" + set_config_value(config, mtp_num_layers_key, 0) + assert config.mtp_num_hidden_layers == 0 + + +def test_qwen35_dense_passthrough_merge(): + with tempfile.TemporaryDirectory() as a: + model_a, _ = _save_model(Qwen3_5ForConditionalGeneration, _dense_config(), a) + config = MergeConfiguration( + merge_method="passthrough", + models=[InputModelDefinition(model=model_a)], + dtype="bfloat16", + ) + run_and_check_merge(config) + + +def test_qwen35_moe_linear_merge(): + with tempfile.TemporaryDirectory() as a, tempfile.TemporaryDirectory() as b: + model_a, _ = _save_model(Qwen3_5MoeForConditionalGeneration, _moe_config(), a) + model_b, _ = _save_model(Qwen3_5MoeForConditionalGeneration, _moe_config(), b) + config = MergeConfiguration( + merge_method="linear", + models=[ + InputModelDefinition(model=model_a, parameters={"weight": 0.5}), + InputModelDefinition(model=model_b, parameters={"weight": 0.5}), + ], + dtype="bfloat16", + ) + run_and_check_merge(config) From 7862bb7aacb6b866965a112403f08c69ace289f0 Mon Sep 17 00:00:00 2001 From: ZhangYiqun018 <20732979+ZhangYiqun018@users.noreply.github.com> Date: Thu, 7 May 2026 15:24:14 +0800 Subject: [PATCH 2/4] Deduplicate Qwen3.5 MTP module registration --- mergekit/architecture/qwen35.py | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/mergekit/architecture/qwen35.py b/mergekit/architecture/qwen35.py index a3952537..0f6c5975 100644 --- a/mergekit/architecture/qwen35.py +++ b/mergekit/architecture/qwen35.py @@ -322,23 +322,15 @@ def qwen35_architecture_for_config(config: PretrainedConfig) -> ModelArchitectur modules["vision_tower"] = ModuleDefinition( architecture=Qwen35VisionModuleArchitecture() ) - if _cfg(config, "text_config.mtp_num_hidden_layers", 0): - modules["mtp"] = ModuleDefinition( - architecture=Qwen35MtpModuleArchitecture( - is_moe=is_moe, - num_experts=num_experts, - num_layers_key=mtp_num_layers_key, - ) - ) - else: - if _cfg(config, "mtp_num_hidden_layers", 0): - modules["mtp"] = ModuleDefinition( - architecture=Qwen35MtpModuleArchitecture( - is_moe=is_moe, - num_experts=num_experts, - num_layers_key=mtp_num_layers_key, - ) + + if _cfg(config, mtp_num_layers_key, 0): + modules["mtp"] = ModuleDefinition( + architecture=Qwen35MtpModuleArchitecture( + is_moe=is_moe, + num_experts=num_experts, + num_layers_key=mtp_num_layers_key, ) + ) return ModelArchitecture( modules=modules, From 90c20dafed0fbf40aeb7b13db309e69df71a2f5a Mon Sep 17 00:00:00 2001 From: ZhangYiqun018 <20732979+ZhangYiqun018@users.noreply.github.com> Date: Thu, 7 May 2026 15:35:53 +0800 Subject: [PATCH 3/4] Apply pre-commit formatting --- mergekit/architecture/qwen35.py | 1 - tests/test_qwen35_architecture.py | 8 ++------ 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/mergekit/architecture/qwen35.py b/mergekit/architecture/qwen35.py index 0f6c5975..5a412bff 100644 --- a/mergekit/architecture/qwen35.py +++ b/mergekit/architecture/qwen35.py @@ -14,7 +14,6 @@ ) from mergekit.common import get_config_value - QWEN35_DENSE_ARCHITECTURES = { "Qwen3_5ForConditionalGeneration", "Qwen3_5ForCausalLM", diff --git a/tests/test_qwen35_architecture.py b/tests/test_qwen35_architecture.py index 96d4520c..60186cd8 100644 --- a/tests/test_qwen35_architecture.py +++ b/tests/test_qwen35_architecture.py @@ -154,9 +154,7 @@ def test_qwen35_dense_architecture_covers_transformers_keys(): (_moe_config(), Qwen3_5MoeForConditionalGeneration), ], ) -def test_qwen35_full_attention_bias_covers_output_projection_bias( - config, model_cls -): +def test_qwen35_full_attention_bias_covers_output_projection_bias(config, model_cls): config.text_config.attention_bias = True model = model_cls(config) state_keys = set(model.state_dict().keys()) @@ -187,9 +185,7 @@ def test_qwen35_moe_architecture_covers_transformers_keys_and_mtp_experts(): (_moe_text_config(), Qwen3_5MoeForCausalLM), ], ) -def test_qwen35_text_only_architecture_uses_top_level_mtp_config_key( - config, model_cls -): +def test_qwen35_text_only_architecture_uses_top_level_mtp_config_key(config, model_cls): model = model_cls(config) arch = arch_info_for_config(config) arch_keys = {weight.name for weight in arch.all_weights(config)} From a812d01a66ec3eab6aa6cadc193e507051f89807 Mon Sep 17 00:00:00 2001 From: ZhangYiqun018 <20732979+ZhangYiqun018@users.noreply.github.com> Date: Thu, 7 May 2026 18:59:24 +0800 Subject: [PATCH 4/4] Support packed Qwen3.5 MTP experts --- mergekit/architecture/qwen35.py | 7 +++++++ tests/test_qwen35_architecture.py | 2 ++ 2 files changed, 9 insertions(+) diff --git a/mergekit/architecture/qwen35.py b/mergekit/architecture/qwen35.py index 5a412bff..edde4f90 100644 --- a/mergekit/architecture/qwen35.py +++ b/mergekit/architecture/qwen35.py @@ -213,6 +213,13 @@ def layer_weights( self.num_experts or getattr(_text_config(config), "num_experts", 0) or 0 ) res.append(WeightInfo(name=f"{prefix}.mlp.gate.weight", optional=True)) + res.extend( + WeightInfo(name=f"{prefix}.mlp.{name}", optional=True) + for name in ( + "experts.gate_up_proj", + "experts.down_proj", + ) + ) for expert_idx in range(num_experts): for proj in ("gate_proj", "up_proj", "down_proj"): res.append( diff --git a/tests/test_qwen35_architecture.py b/tests/test_qwen35_architecture.py index 60186cd8..2f43941f 100644 --- a/tests/test_qwen35_architecture.py +++ b/tests/test_qwen35_architecture.py @@ -174,6 +174,8 @@ def test_qwen35_moe_architecture_covers_transformers_keys_and_mtp_experts(): assert "model.language_model.layers.0.mlp.experts.gate_up_proj" in arch_keys assert "model.language_model.layers.0.mlp.shared_expert_gate.weight" in arch_keys assert "model.language_model.layers.3.self_attn.q_proj.weight" in arch_keys + assert "mtp.layers.0.mlp.experts.gate_up_proj" in arch_keys + assert "mtp.layers.0.mlp.experts.down_proj" in arch_keys assert "mtp.layers.0.mlp.experts.3.down_proj.weight" in arch_keys assert "mtp.layers.0.mlp.shared_expert_gate.weight" in arch_keys