Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions mergekit/architecture/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
WeightInfo,
)
from mergekit.architecture.json_definitions import NAME_TO_ARCH
from mergekit.architecture.deepseek_v3 import DeepseekV3HybridMoeModuleArchitecture
from mergekit.architecture.moe_defs import (
AfmoeModuleArchitecture,
MixtralModuleArchitecture,
Expand Down Expand Up @@ -58,6 +59,13 @@ def arch_info_for_config(config: PretrainedConfig) -> Optional[ModelArchitecture
architectures=[arch_name],
model_type="afmoe",
)
elif arch_name == DeepseekV3HybridMoeModuleArchitecture.ARCHITECTURE_NAME:
module = DeepseekV3HybridMoeModuleArchitecture.from_config(config)
return ModelArchitecture(
modules={"default": ModuleDefinition(architecture=module)},
architectures=[arch_name],
model_type="deepseek_v3",
)
elif arch_name in NAME_TO_ARCH:
candidates = list(NAME_TO_ARCH[arch_name])
if len(candidates) == 1:
Expand Down
110 changes: 110 additions & 0 deletions mergekit/architecture/deepseek_v3.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
# Copyright (C) 2025 Arcee AI
# SPDX-License-Identifier: LGPL-3.0-only

from typing import ClassVar, List, Optional

from pydantic import BaseModel
from transformers import PretrainedConfig

from mergekit.architecture.base import ModuleArchitecture, WeightInfo


class DeepseekV3HybridMoeModuleArchitecture(ModuleArchitecture, BaseModel):
"""
DeepSeek V3 uses a hybrid dense + MoE MLP:
- the first `first_k_dense_replace` layers are dense MLP weights:
mlp.{gate_proj,up_proj,down_proj}.weight
- the remaining layers are typically MoE (controlled by moe_layer_freq):
mlp.gate.weight (+ optional e_score_correction_bias),
mlp.shared_experts.{gate_proj,up_proj,down_proj}.weight,
mlp.experts.{i}.{gate_proj,up_proj,down_proj}.weight
"""

ARCHITECTURE_NAME: ClassVar[str] = "DeepseekV3ForCausalLM"

first_k_dense_replace: int = 0
moe_layer_freq: int = 1
n_routed_experts: int = 0
n_shared_experts: int = 0

def name(self) -> str:
return "deepseek_v3"

@classmethod
def from_config(cls, config: PretrainedConfig) -> "DeepseekV3HybridMoeModuleArchitecture":
return cls(
first_k_dense_replace=int(getattr(config, "first_k_dense_replace", 0) or 0),
moe_layer_freq=max(1, int(getattr(config, "moe_layer_freq", 1) or 1)),
n_routed_experts=int(getattr(config, "n_routed_experts", 0) or 0),
n_shared_experts=int(getattr(config, "n_shared_experts", 0) or 0),
)

def pre_weights(self, config: PretrainedConfig) -> List[WeightInfo]:
# Observed in DeepSeek V3 safetensors indices
return [WeightInfo(name="model.embed_tokens.weight", is_embed=True)]

def post_weights(self, config: PretrainedConfig) -> List[WeightInfo]:
# Observed in DeepSeek V3 safetensors indices
return [
WeightInfo(name="model.norm.weight"),
WeightInfo(name="lm_head.weight", is_embed=True),
]

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missing optional and tied_names for lm_head.weight

High Severity

The lm_head.weight in post_weights is missing optional=True and tied_names=("model.embed_tokens.weight",). Many transformer models (including DeepSeek V3 variants) tie the input embeddings with the output LM head, storing only one copy as model.embed_tokens.weight. Without optional and tied_names, the weight loading will fail when lm_head.weight doesn't exist separately in the checkpoint, as the code has no fallback to look for the weight under its tied name. Other architecture definitions (LLaMA, Mistral) correctly handle this pattern.

Fix in Cursor Fix in Web


def _is_moe_layer(self, index: int) -> bool:
"""
DeepSeek V3 is dense for first_k_dense_replace layers, then MoE with frequency.
"""
if index < self.first_k_dense_replace:
return False
# after dense block, apply MoE every `moe_layer_freq` layers
return ((index - self.first_k_dense_replace) % self.moe_layer_freq) == 0

def layer_weights(
self, index: int, config: PretrainedConfig
) -> Optional[List[WeightInfo]]:
prefix = f"model.layers.{index}"
res: List[WeightInfo] = []

# Attention weights (based on observed tensor names in DeepSeek V3 checkpoints)
for suffix in (
".self_attn.o_proj.weight",
".self_attn.q_a_proj.weight",
".self_attn.q_b_proj.weight",
".self_attn.kv_a_proj_with_mqa.weight",
".self_attn.kv_b_proj.weight",
".self_attn.q_a_layernorm.weight",
".self_attn.kv_a_layernorm.weight",
".input_layernorm.weight",
".post_attention_layernorm.weight",
):
res.append(WeightInfo(name=prefix + suffix))

# Dense vs MoE MLP blocks
if not self._is_moe_layer(index):
for p in ("gate_proj", "up_proj", "down_proj"):
res.append(WeightInfo(name=prefix + f".mlp.{p}.weight"))
return res

# MoE MLP weights
res.append(WeightInfo(name=prefix + ".mlp.gate.weight"))
# Not always present across variants; treat as optional for robustness.
res.append(
WeightInfo(
name=prefix + ".mlp.gate.e_score_correction_bias",
optional=True,
)
)

if self.n_shared_experts and self.n_shared_experts > 0:
for p in ("gate_proj", "up_proj", "down_proj"):
res.append(WeightInfo(name=prefix + f".mlp.shared_experts.{p}.weight"))

for expert_idx in range(self.n_routed_experts):
for p in ("gate_proj", "up_proj", "down_proj"):
res.append(
WeightInfo(name=prefix + f".mlp.experts.{expert_idx}.{p}.weight")
)

return res


2 changes: 1 addition & 1 deletion mergekit/graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
ValueT = TypeVar("ValueT")


class Task(ABC, BaseModel, Generic[ValueT], frozen=True):
class Task(ABC, BaseModel, Generic[ValueT], frozen=True, arbitrary_types_allowed=True):
"""
Abstract base class representing a task in a computational graph.

Expand Down
Loading