From 46841da8c741a3cdace41dae1c7b2f4b264f46b2 Mon Sep 17 00:00:00 2001 From: Zhoutong Fu Date: Tue, 20 Jan 2026 01:37:54 +0000 Subject: [PATCH 1/2] add support for deepseek v3 --- mergekit/architecture/__init__.py | 8 ++ mergekit/architecture/deepseek_v3.py | 110 +++++++++++++++++++++++++++ 2 files changed, 118 insertions(+) create mode 100644 mergekit/architecture/deepseek_v3.py diff --git a/mergekit/architecture/__init__.py b/mergekit/architecture/__init__.py index b6e27f57..2cf9aeb6 100644 --- a/mergekit/architecture/__init__.py +++ b/mergekit/architecture/__init__.py @@ -17,6 +17,7 @@ WeightInfo, ) from mergekit.architecture.json_definitions import NAME_TO_ARCH +from mergekit.architecture.deepseek_v3 import DeepseekV3HybridMoeModuleArchitecture from mergekit.architecture.moe_defs import ( AfmoeModuleArchitecture, MixtralModuleArchitecture, @@ -58,6 +59,13 @@ def arch_info_for_config(config: PretrainedConfig) -> Optional[ModelArchitecture architectures=[arch_name], model_type="afmoe", ) + elif arch_name == DeepseekV3HybridMoeModuleArchitecture.ARCHITECTURE_NAME: + module = DeepseekV3HybridMoeModuleArchitecture.from_config(config) + return ModelArchitecture( + modules={"default": ModuleDefinition(architecture=module)}, + architectures=[arch_name], + model_type="deepseek_v3", + ) elif arch_name in NAME_TO_ARCH: candidates = list(NAME_TO_ARCH[arch_name]) if len(candidates) == 1: diff --git a/mergekit/architecture/deepseek_v3.py b/mergekit/architecture/deepseek_v3.py new file mode 100644 index 00000000..0929fe27 --- /dev/null +++ b/mergekit/architecture/deepseek_v3.py @@ -0,0 +1,110 @@ +# Copyright (C) 2025 Arcee AI +# SPDX-License-Identifier: LGPL-3.0-only + +from typing import ClassVar, List, Optional + +from pydantic import BaseModel +from transformers import PretrainedConfig + +from mergekit.architecture.base import ModuleArchitecture, WeightInfo + + +class DeepseekV3HybridMoeModuleArchitecture(ModuleArchitecture, BaseModel): + """ + DeepSeek V3 uses a hybrid dense + MoE MLP: + - the first `first_k_dense_replace` layers are dense MLP weights: + mlp.{gate_proj,up_proj,down_proj}.weight + - the remaining layers are typically MoE (controlled by moe_layer_freq): + mlp.gate.weight (+ optional e_score_correction_bias), + mlp.shared_experts.{gate_proj,up_proj,down_proj}.weight, + mlp.experts.{i}.{gate_proj,up_proj,down_proj}.weight + """ + + ARCHITECTURE_NAME: ClassVar[str] = "DeepseekV3ForCausalLM" + + first_k_dense_replace: int = 0 + moe_layer_freq: int = 1 + n_routed_experts: int = 0 + n_shared_experts: int = 0 + + def name(self) -> str: + return "deepseek_v3" + + @classmethod + def from_config(cls, config: PretrainedConfig) -> "DeepseekV3HybridMoeModuleArchitecture": + return cls( + first_k_dense_replace=int(getattr(config, "first_k_dense_replace", 0) or 0), + moe_layer_freq=max(1, int(getattr(config, "moe_layer_freq", 1) or 1)), + n_routed_experts=int(getattr(config, "n_routed_experts", 0) or 0), + n_shared_experts=int(getattr(config, "n_shared_experts", 0) or 0), + ) + + def pre_weights(self, config: PretrainedConfig) -> List[WeightInfo]: + # Observed in DeepSeek V3 safetensors indices + return [WeightInfo(name="model.embed_tokens.weight", is_embed=True)] + + def post_weights(self, config: PretrainedConfig) -> List[WeightInfo]: + # Observed in DeepSeek V3 safetensors indices + return [ + WeightInfo(name="model.norm.weight"), + WeightInfo(name="lm_head.weight", is_embed=True), + ] + + def _is_moe_layer(self, index: int) -> bool: + """ + DeepSeek V3 is dense for first_k_dense_replace layers, then MoE with frequency. + """ + if index < self.first_k_dense_replace: + return False + # after dense block, apply MoE every `moe_layer_freq` layers + return ((index - self.first_k_dense_replace) % self.moe_layer_freq) == 0 + + def layer_weights( + self, index: int, config: PretrainedConfig + ) -> Optional[List[WeightInfo]]: + prefix = f"model.layers.{index}" + res: List[WeightInfo] = [] + + # Attention weights (based on observed tensor names in DeepSeek V3 checkpoints) + for suffix in ( + ".self_attn.o_proj.weight", + ".self_attn.q_a_proj.weight", + ".self_attn.q_b_proj.weight", + ".self_attn.kv_a_proj_with_mqa.weight", + ".self_attn.kv_b_proj.weight", + ".self_attn.q_a_layernorm.weight", + ".self_attn.kv_a_layernorm.weight", + ".input_layernorm.weight", + ".post_attention_layernorm.weight", + ): + res.append(WeightInfo(name=prefix + suffix)) + + # Dense vs MoE MLP blocks + if not self._is_moe_layer(index): + for p in ("gate_proj", "up_proj", "down_proj"): + res.append(WeightInfo(name=prefix + f".mlp.{p}.weight")) + return res + + # MoE MLP weights + res.append(WeightInfo(name=prefix + ".mlp.gate.weight")) + # Not always present across variants; treat as optional for robustness. + res.append( + WeightInfo( + name=prefix + ".mlp.gate.e_score_correction_bias", + optional=True, + ) + ) + + if self.n_shared_experts and self.n_shared_experts > 0: + for p in ("gate_proj", "up_proj", "down_proj"): + res.append(WeightInfo(name=prefix + f".mlp.shared_experts.{p}.weight")) + + for expert_idx in range(self.n_routed_experts): + for p in ("gate_proj", "up_proj", "down_proj"): + res.append( + WeightInfo(name=prefix + f".mlp.experts.{expert_idx}.{p}.weight") + ) + + return res + + From efce27e03c1d232b54956cc7297128fff8f25ea7 Mon Sep 17 00:00:00 2001 From: Zhoutong Fu Date: Wed, 21 Jan 2026 01:15:47 +0000 Subject: [PATCH 2/2] pydantic change --- mergekit/graph.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mergekit/graph.py b/mergekit/graph.py index d3c54aab..17bb12a0 100644 --- a/mergekit/graph.py +++ b/mergekit/graph.py @@ -20,7 +20,7 @@ ValueT = TypeVar("ValueT") -class Task(ABC, BaseModel, Generic[ValueT], frozen=True): +class Task(ABC, BaseModel, Generic[ValueT], frozen=True, arbitrary_types_allowed=True): """ Abstract base class representing a task in a computational graph.