From fdc8d3c3262a05f18ce8213996daf17100b164a1 Mon Sep 17 00:00:00 2001 From: Nirbhai Date: Thu, 11 Jun 2026 05:20:08 +0530 Subject: [PATCH] Remove unused eps_attr config field Breaking: removes the public eps_attr constructor argument and the config.eps_attr attribute. The field was never read (its consumer was deleted when NormalizationBridge moved to direct HF delegation), so no model behavior changes, but it is an API removal. --- docs/source/_static/adapter-template.py | 4 ---- .../adapter_development/adapter-creation-guide.md | 2 -- .../content/adapter_development/adapter-specification.md | 1 - .../adapter_development/adapter-unit-test-guide.md | 2 +- docs/source/content/contributing.md | 2 +- docs/source/content/debugging_numerical_divergence.md | 1 - .../supported_architectures/test_baichuan_adapter.py | 3 --- .../supported_architectures/test_cohere_adapter.py | 4 ---- .../supported_architectures/test_gpt_bigcode_adapter.py | 4 ---- .../supported_architectures/test_gpt_oss_adapter.py | 9 --------- .../supported_architectures/test_internlm2_adapter.py | 3 --- .../supported_architectures/test_llava_adapter.py | 3 --- transformer_lens/config/transformer_bridge_config.py | 2 -- .../model_bridge/supported_architectures/AGENTS.md | 4 +--- .../model_bridge/supported_architectures/baichuan.py | 1 - .../model_bridge/supported_architectures/cohere.py | 2 -- .../model_bridge/supported_architectures/gpt_bigcode.py | 1 - .../model_bridge/supported_architectures/gpt_oss.py | 2 -- .../model_bridge/supported_architectures/granite.py | 1 - .../model_bridge/supported_architectures/internlm2.py | 1 - .../model_bridge/supported_architectures/llama.py | 2 -- .../model_bridge/supported_architectures/llava.py | 1 - 22 files changed, 3 insertions(+), 52 deletions(-) diff --git a/docs/source/_static/adapter-template.py b/docs/source/_static/adapter-template.py index 7b14928b0..61b40da00 100644 --- a/docs/source/_static/adapter-template.py +++ b/docs/source/_static/adapter-template.py @@ -64,10 +64,6 @@ def __init__(self, cfg: Any) -> None: self.cfg.attn_only = False # True only for attention-only models (rare) self.cfg.uses_rms_norm = True # Should match normalization_type - # TODO: Set the epsilon attribute name used by this model's normalization - # Check the HF model's norm layer to find the correct attribute name - self.cfg.eps_attr = "variance_epsilon" # or "layer_norm_eps", "rms_norm_eps", etc. - # TODO: Handle GQA if applicable # If the model uses Grouped Query Attention (n_key_value_heads < n_heads): if hasattr(cfg, "n_key_value_heads") and cfg.n_key_value_heads is not None: diff --git a/docs/source/content/adapter_development/adapter-creation-guide.md b/docs/source/content/adapter_development/adapter-creation-guide.md index 2a39b3bb2..886ccad14 100644 --- a/docs/source/content/adapter_development/adapter-creation-guide.md +++ b/docs/source/content/adapter_development/adapter-creation-guide.md @@ -104,7 +104,6 @@ Set these on `self.cfg` in `__init__` *before* building the component mapping (t | `gated_mlp` | `bool` | MLP has gate projection (SwiGLU) | | `attn_only` | `bool` | Model has no MLP layers (rare) | | `uses_rms_norm` | `bool` | Should match `normalization_type == "RMS"` | -| `eps_attr` | `str` | HF attribute name for norm epsilon | For GQA models, also forward `n_key_value_heads`: @@ -284,7 +283,6 @@ Both must be clean. Don't paper over mypy errors with `# type: ignore` — fix t ## Common pitfalls -- **Wrong `eps_attr` name.** Models that look identical use different attribute names (`variance_epsilon`, `rms_norm_eps`, `eps`). Read the norm class. - **Forgetting `n_key_value_heads`.** Without it, GQA models silently reshape weights as if they were MHA — verification fails with cryptic shape errors. - **Missing registration.** Adapter exists but the factory can't find it. Update both `__init__.py` and `architecture_adapter_factory.py`. - **Skipping `setup_component_testing` for RoPE.** Rotary embeddings need to be wired through to each attention bridge or component testing produces nonsense. diff --git a/docs/source/content/adapter_development/adapter-specification.md b/docs/source/content/adapter_development/adapter-specification.md index 17798b6b6..f177a3926 100644 --- a/docs/source/content/adapter_development/adapter-specification.md +++ b/docs/source/content/adapter_development/adapter-specification.md @@ -44,7 +44,6 @@ Set these on `self.cfg` in `__init__` before building the component mapping: | `gated_mlp` | `bool` | Whether MLP uses gate projection | Llama=True, GPT2=False | | `attn_only` | `bool` | Whether model has no MLP layers | Usually False | | `uses_rms_norm` | `bool` | Redundant with normalization_type but needed | Match normalization_type | -| `eps_attr` | `str` | Attribute name for norm epsilon | `"variance_epsilon"`, `"layer_norm_eps"` | ### GQA (Grouped Query Attention) diff --git a/docs/source/content/adapter_development/adapter-unit-test-guide.md b/docs/source/content/adapter_development/adapter-unit-test-guide.md index 0113c11fa..767503879 100644 --- a/docs/source/content/adapter_development/adapter-unit-test-guide.md +++ b/docs/source/content/adapter_development/adapter-unit-test-guide.md @@ -22,7 +22,7 @@ Organize around the three things an adapter decides (config, component mapping, | Area | Worth asserting | Skip | | --- | --- | --- | | **Component mapping** | The HF module paths and bridge **types** for this arch — especially non-standard ones (`transformer.wte`, `model.tok_embeddings`, `out_proj`, `fc_in`, `EncDecAttention`); the distinctive bridge (`JointQKVAttentionBridge`, `ParallelBlockBridge`, `SymbolicBridge`, `MoEBridge`, `SigLIP`); the exact submodule **set** (e.g. attention has `q_norm`/`k_norm`, or block has no `ln2`). | — | -| **Config quirks** | Propagation that drives *behavior*: `n_key_value_heads` (GQA) through the adapter's own branch, custom `eps_attr` value, softcap / `logit_scale` coercion + `None`-fallback, `rmsnorm_uses_offset`, `parallel_attn_mlp`, `uses_combined_qkv`, `supports_fold_ln=False` when a fused projection forces it, multimodal/`gated_q_proj` flags. | A flag whose only effect is the literal you set (see "config-literal" below). | +| **Config quirks** | Propagation that drives *behavior*: `n_key_value_heads` (GQA) through the adapter's own branch, softcap / `logit_scale` coercion + `None`-fallback, `rmsnorm_uses_offset`, `parallel_attn_mlp`, `uses_combined_qkv`, `supports_fold_ln=False` when a fused projection forces it, multimodal/`gated_q_proj` flags. | A flag whose only effect is the literal you set (see "config-literal" below). | | **Weight conversions** | Logic the **adapter** implements: a fused-QKV split's numerical partition (which rows are Q vs K vs V — e.g. GPT-2 thirds, CodeGen's `[Q,V,K]` `mp_num` ordering, Baichuan/InternLM2 interleaved layouts), a manual LayerNorm fold (values folded, weight reset to ones, dtype preserved), the exact conversion **key set** (no stray norm/bias entries). | The einops rearrange itself (see "dependency test"). | | **Overrides** | Each branch of `setup_component_testing` / `preprocess_weights` / `prepare_model` / `prepare_loading` you wrote — the happy path *and* the defensive `hasattr`/`None` guards, the no-op-when-absent path, the rejection guard. | Overrides you didn't write. | | **Behavioral hook shapes** | Where the adapter's config drives reshaping: GQA `hook_k`/`hook_v` at `n_key_value_heads`, MQA single KV head, hybrid layers where attn hooks are **absent** on linear-attention layers. | Generic `(batch, seq, d_model)` output shape (it's the shared bridge's contract, not yours). | diff --git a/docs/source/content/contributing.md b/docs/source/content/contributing.md index fc3092ad8..be65037fb 100644 --- a/docs/source/content/contributing.md +++ b/docs/source/content/contributing.md @@ -285,7 +285,7 @@ Two test layers: ### Common adapter gotchas -- **HF raw config attributes are invisible to TL-side consumers unless explicitly propagated to `self.cfg`.** Walk the HF `config.json` and mirror any non-standard knobs (`final_logit_softcapping`, `attn_logit_softcapping`, `query_pre_attn_scalar`, `sliding_window`, `layer_types`, custom `eps_attr` names) onto `self.cfg` so weight processing and forward passes can see them. +- **HF raw config attributes are invisible to TL-side consumers unless explicitly propagated to `self.cfg`.** Walk the HF `config.json` and mirror any non-standard knobs (`final_logit_softcapping`, `attn_logit_softcapping`, `query_pre_attn_scalar`, `sliding_window`, `layer_types`) onto `self.cfg` so weight processing and forward passes can see them. - **Some config attrs need both surface-on-cfg AND fold-into-weight** via a `preprocess_weights()` override. The trigger: a numerical operation HF's forward applies natively must also be baked into the raw weights, or `bridge.enable_compatibility_mode()` (which calls `process_weights` on raw weights) produces wrong results. Concrete examples in-tree: Cohere `logit_scale` → `unembed.weight`; Gemma embedding scale (`√d_model`) → `embed.weight`. Skip the fold and Phase 3 / Phase 4 of `verify_models` will silently degrade. - **Tokenizer policy is per-model, not per-architecture.** Sibling models in the same family routinely differ — the chat-instruct variant may prepend BOS where the base does not, padding side can flip, EOS handling can differ. It's worth re-checking `default_prepend_bos`, padding side, and EOS handling against the specific target rather than copying them from a starter adapter. `tokenizer_config.json` is not always reliable on its own — some architectures (Cohere is a notable example) declare `add_bos_token=False` but HF's `__call__` prepends BOS anyway. The most reliable check is to invoke the tokenizer directly: diff --git a/docs/source/content/debugging_numerical_divergence.md b/docs/source/content/debugging_numerical_divergence.md index 01568a3ce..23f1c67a0 100644 --- a/docs/source/content/debugging_numerical_divergence.md +++ b/docs/source/content/debugging_numerical_divergence.md @@ -40,7 +40,6 @@ The first hop where they disagree localizes the bug. | Off by a constant scale in residual | Final-RMS-norm offset missing | `cfg.rmsnorm_uses_offset = True` + `ArithmeticTensorConversion(ADDITION, 1.0)` | | Logits flat / saturated at extremes | Missing logit softcap | `cfg.output_logits_soft_cap` from HF's `final_logit_softcapping` | | Attention pattern collapses to argmax | Missing attention-score softcap | `cfg.attn_scores_soft_cap` from HF's `attn_logit_softcapping` | -| Off by `eps` magnitudes in norm | Wrong RMSNorm eps attribute name | `cfg.eps_attr` (Llama uses `"variance_epsilon"`, most others use `"eps"`) | | First MLP off; gate matches | Forgot gated-MLP wiring | `GatedMLPBridge` with `{gate, in, out}` submodules — not `MLPBridge` | | Bias-related drift | Adapter assumes biases that don't exist (Llama / RMSNorm) | `ProcessWeights._safe_get_tensor` handles `None`; check the weight-processing conversions are bias-aware | | Drift only in compatibility mode | Hook semantic carve-out missing for post-norm or MLA | See [compatibility_mode.md](compatibility_mode.md) §"Hook semantic parity" | diff --git a/tests/unit/model_bridge/supported_architectures/test_baichuan_adapter.py b/tests/unit/model_bridge/supported_architectures/test_baichuan_adapter.py index 457367718..fadd43afb 100644 --- a/tests/unit/model_bridge/supported_architectures/test_baichuan_adapter.py +++ b/tests/unit/model_bridge/supported_architectures/test_baichuan_adapter.py @@ -74,9 +74,6 @@ def _make_w_pack_component(d_model: int) -> Any: class TestBaichuanAdapterConfig: - def test_eps_attr(self, adapter: BaichuanArchitectureAdapter) -> None: - assert adapter.cfg.eps_attr == "variance_epsilon" - def test_supports_fold_ln_false(self, adapter: BaichuanArchitectureAdapter) -> None: assert adapter.supports_fold_ln is False diff --git a/tests/unit/model_bridge/supported_architectures/test_cohere_adapter.py b/tests/unit/model_bridge/supported_architectures/test_cohere_adapter.py index 87eaf8c9c..de8ff8673 100644 --- a/tests/unit/model_bridge/supported_architectures/test_cohere_adapter.py +++ b/tests/unit/model_bridge/supported_architectures/test_cohere_adapter.py @@ -69,10 +69,6 @@ def test_uses_rms_norm_is_false(self, adapter: CohereArchitectureAdapter) -> Non # CohereLayerNorm subtracts the mean — NOT RMSNorm. assert adapter.cfg.uses_rms_norm is False - def test_eps_attr_is_variance_epsilon(self, adapter: CohereArchitectureAdapter) -> None: - # CohereLayerNorm stores epsilon as self.variance_epsilon. - assert adapter.cfg.eps_attr == "variance_epsilon" - def test_parallel_attn_mlp_is_true(self, adapter: CohereArchitectureAdapter) -> None: # Single input_layernorm; attn and MLP run in parallel on same normed input. assert adapter.cfg.parallel_attn_mlp is True diff --git a/tests/unit/model_bridge/supported_architectures/test_gpt_bigcode_adapter.py b/tests/unit/model_bridge/supported_architectures/test_gpt_bigcode_adapter.py index 572fa3e13..ce0e7bac8 100644 --- a/tests/unit/model_bridge/supported_architectures/test_gpt_bigcode_adapter.py +++ b/tests/unit/model_bridge/supported_architectures/test_gpt_bigcode_adapter.py @@ -265,10 +265,6 @@ def test_only_qkvo_conversion_keys(self, adapter: GPTBigCodeArchitectureAdapter) def test_uses_rms_norm_false(self, adapter: GPTBigCodeArchitectureAdapter) -> None: assert adapter.cfg.uses_rms_norm is False - def test_eps_attr(self, adapter: GPTBigCodeArchitectureAdapter) -> None: - # GPT-2 family eps (not RMS variance_epsilon). - assert adapter.cfg.eps_attr == "layer_norm_epsilon" - # --------------------------------------------------------------------------- # MQAQKVConversionRule tests diff --git a/tests/unit/model_bridge/supported_architectures/test_gpt_oss_adapter.py b/tests/unit/model_bridge/supported_architectures/test_gpt_oss_adapter.py index a555774f4..56b62c59a 100644 --- a/tests/unit/model_bridge/supported_architectures/test_gpt_oss_adapter.py +++ b/tests/unit/model_bridge/supported_architectures/test_gpt_oss_adapter.py @@ -158,15 +158,6 @@ def __init__(self, cfg: TransformerBridgeConfig) -> None: self.o_proj = nn.Linear(cfg.n_heads * cfg.d_head, cfg.d_model, bias=False) -class TestGPTOSSAdapterConfig: - """Adapter-owned config defaults that downstream bridge code relies on.""" - - def test_eps_attr_is_variance_epsilon(self, adapter: GPTOSSArchitectureAdapter) -> None: - """GPT-OSS uses HF's `variance_epsilon` attribute name on RMSNorm modules, - not the default `eps`. Downstream norm-folding reads this attribute.""" - assert adapter.cfg.eps_attr == "variance_epsilon" - - class TestGPTOSSWeightConversions: """GPT-OSS uses the standard QKVO weight conversions (no biases), with GQA head counts.""" diff --git a/tests/unit/model_bridge/supported_architectures/test_internlm2_adapter.py b/tests/unit/model_bridge/supported_architectures/test_internlm2_adapter.py index 059f2fc73..f3861eece 100644 --- a/tests/unit/model_bridge/supported_architectures/test_internlm2_adapter.py +++ b/tests/unit/model_bridge/supported_architectures/test_internlm2_adapter.py @@ -94,9 +94,6 @@ def _fill_interleaved( class TestInternLM2AdapterConfig: """Adapter sets all required config attributes.""" - def test_eps_attr(self, adapter: InternLM2ArchitectureAdapter) -> None: - assert adapter.cfg.eps_attr == "variance_epsilon" - def test_supports_fold_ln_false(self, adapter: InternLM2ArchitectureAdapter) -> None: # fold_ln silently skips attn when wqkv is fused in bridge state dict. assert adapter.supports_fold_ln is False diff --git a/tests/unit/model_bridge/supported_architectures/test_llava_adapter.py b/tests/unit/model_bridge/supported_architectures/test_llava_adapter.py index 0344c0e76..d494b6f11 100644 --- a/tests/unit/model_bridge/supported_architectures/test_llava_adapter.py +++ b/tests/unit/model_bridge/supported_architectures/test_llava_adapter.py @@ -63,9 +63,6 @@ def adapter(self): def test_is_multimodal(self, adapter): assert adapter.cfg.is_multimodal is True - def test_eps_attr(self, adapter): - assert adapter.cfg.eps_attr == "variance_epsilon" - def test_vision_config_extracted(self, adapter): assert adapter.cfg.vision_hidden_size == 1024 assert adapter.cfg.vision_num_layers == 24 diff --git a/transformer_lens/config/transformer_bridge_config.py b/transformer_lens/config/transformer_bridge_config.py index 44ea7f055..fe8661162 100644 --- a/transformer_lens/config/transformer_bridge_config.py +++ b/transformer_lens/config/transformer_bridge_config.py @@ -83,7 +83,6 @@ def __init__( NTK_by_parts_low_freq_factor: float = 1.0, NTK_by_parts_high_freq_factor: float = 4.0, NTK_by_parts_factor: float = 8.0, - eps_attr: str = "eps", rmsnorm_uses_offset: bool = False, attn_implementation: Optional[str] = None, # Audio model configuration @@ -176,7 +175,6 @@ def __init__( self.NTK_by_parts_low_freq_factor = NTK_by_parts_low_freq_factor self.NTK_by_parts_high_freq_factor = NTK_by_parts_high_freq_factor self.NTK_by_parts_factor = NTK_by_parts_factor - self.eps_attr = eps_attr self.rmsnorm_uses_offset = rmsnorm_uses_offset self.attn_implementation = attn_implementation # Audio model configuration diff --git a/transformer_lens/model_bridge/supported_architectures/AGENTS.md b/transformer_lens/model_bridge/supported_architectures/AGENTS.md index 8d1bf047d..628e8b7f3 100644 --- a/transformer_lens/model_bridge/supported_architectures/AGENTS.md +++ b/transformer_lens/model_bridge/supported_architectures/AGENTS.md @@ -98,7 +98,6 @@ HF raw config attributes are invisible to TL-side consumers unless propagated to | `query_pre_attn_scalar` | `self.cfg.query_pre_attn_scalar` | Gemma2/3 — query scaling override | | `sliding_window` | `self.cfg.sliding_window` | Mistral, Qwen2, Gemma2 — local-attention layers | | `layer_types` | `self.cfg.layer_types` | Hybrid models with per-layer attention type lists | -| Non-standard RMSNorm eps key | `self.cfg.eps_attr = ""` | Llama uses `"variance_epsilon"` instead of `"eps"` | **Weight-fold attributes** (need BOTH surface-on-cfg AND fold-into-weight via `preprocess_weights` — see [the next section](#when-to-override-preprocess_weights)): @@ -238,7 +237,6 @@ Failure message names the missing set. (`INTENTIONAL_EXCLUDES` in the test handl | RoPE (rotary positional embeddings) | `llama.py`, `mistral.py`, `qwen2.py`+ | `RotaryEmbeddingBridge(name="model.rotary_emb")` + `cfg.positional_embedding_type = "rotary"` | | GQA / MQA (`n_key_value_heads < n_heads`) | `llama.py`, `mistral.py`, `falcon.py`, `cohere.py` | Set `cfg.n_key_value_heads`; pass `n_kv_heads=` to `_qkvo_weight_conversions()` | | RMSNorm with offset | `gemma1.py`, `gemma2.py`, `gemma3.py` | `cfg.rmsnorm_uses_offset = True` + `ArithmeticTensorConversion(ADDITION, 1.0)` | -| Custom RMSNorm eps attribute | `llama.py` | `cfg.eps_attr = "variance_epsilon"` (Llama uses this instead of `eps`) | | Standard LayerNorm | `gpt2.py`, `bloom.py` | `cfg.normalization_type = "LN"` | | Gated MLP (`gate_proj`, `up_proj`, `down_proj`) | `llama.py`, `mistral.py`, `gemma1.py`, `qwen2.py`+ | `GatedMLPBridge` with submodules `{gate, in, out}` | | Combined QKV (`c_attn`) | `gpt2.py`, `bloom.py` | `QKVSplitRearrangeConversion` to split + rearrange | @@ -324,7 +322,7 @@ class TestMyArchHookCompatibility: No weight load, no HF Hub access — synthetic cfg + structural assertions only. Runs in default `make unit-test`. -Add one test per architecture quirk (softcaps, RMSNorm offsets, sliding window, custom `eps_attr`, MoE routing). Gemma1's "must NOT override `setup_hook_compatibility`" is a good one-quirk-one-test example. +Add one test per architecture quirk (softcaps, RMSNorm offsets, sliding window, MoE routing). Gemma1's "must NOT override `setup_hook_compatibility`" is a good one-quirk-one-test example. ### 2. Integration parity test — `tests/integration/model_bridge/test__adapter.py` diff --git a/transformer_lens/model_bridge/supported_architectures/baichuan.py b/transformer_lens/model_bridge/supported_architectures/baichuan.py index a50fabc37..f78063a54 100644 --- a/transformer_lens/model_bridge/supported_architectures/baichuan.py +++ b/transformer_lens/model_bridge/supported_architectures/baichuan.py @@ -186,7 +186,6 @@ def __init__(self, cfg: Any) -> None: self.cfg.gated_mlp = True self.cfg.attn_only = False self.cfg.uses_rms_norm = True - self.cfg.eps_attr = "variance_epsilon" # Fused W_pack prevents standard fold_ln from reaching Q/K/V separately. # preprocess_weights() handles it instead. diff --git a/transformer_lens/model_bridge/supported_architectures/cohere.py b/transformer_lens/model_bridge/supported_architectures/cohere.py index 5dcb74149..97e8c3301 100644 --- a/transformer_lens/model_bridge/supported_architectures/cohere.py +++ b/transformer_lens/model_bridge/supported_architectures/cohere.py @@ -53,10 +53,8 @@ def __init__(self, cfg: Any) -> None: # --- Normalization --- # CohereLayerNorm is true LayerNorm (subtracts mean), NOT RMSNorm. # uses_rms_norm=False tells NormalizationBridge to subtract the mean. - # eps_attr="variance_epsilon": CohereLayerNorm stores eps as self.variance_epsilon. self.cfg.normalization_type = "LN" self.cfg.uses_rms_norm = False - self.cfg.eps_attr = "variance_epsilon" self.cfg.final_rms = False # --- Position embeddings and MLP --- diff --git a/transformer_lens/model_bridge/supported_architectures/gpt_bigcode.py b/transformer_lens/model_bridge/supported_architectures/gpt_bigcode.py index 102c4f2c7..2f3d7aa2b 100644 --- a/transformer_lens/model_bridge/supported_architectures/gpt_bigcode.py +++ b/transformer_lens/model_bridge/supported_architectures/gpt_bigcode.py @@ -84,7 +84,6 @@ def __init__(self, cfg: Any) -> None: self.cfg.gated_mlp = False self.cfg.attn_only = False self.cfg.uses_rms_norm = False - self.cfg.eps_attr = "layer_norm_epsilon" self.cfg.n_key_value_heads = 1 # MQA: always 1 KV head # Mirror GPT-2 combined-QKV flags diff --git a/transformer_lens/model_bridge/supported_architectures/gpt_oss.py b/transformer_lens/model_bridge/supported_architectures/gpt_oss.py index 2e32d277c..2a808b6f7 100644 --- a/transformer_lens/model_bridge/supported_architectures/gpt_oss.py +++ b/transformer_lens/model_bridge/supported_architectures/gpt_oss.py @@ -30,8 +30,6 @@ def __init__(self, cfg: Any) -> None: self.cfg.normalization_type = "RMS" self.cfg.uses_rms_norm = True - # GPT-OSS uses 'variance_epsilon' instead of 'eps' for RMSNorm - self.cfg.eps_attr = "variance_epsilon" # GPT-OSS uses rotary position embeddings, not learned embeddings self.cfg.positional_embedding_type = "rotary" # GPT-OSS attention returns (output, attn_weights), not a 3-tuple diff --git a/transformer_lens/model_bridge/supported_architectures/granite.py b/transformer_lens/model_bridge/supported_architectures/granite.py index c46081b0b..37e00598b 100644 --- a/transformer_lens/model_bridge/supported_architectures/granite.py +++ b/transformer_lens/model_bridge/supported_architectures/granite.py @@ -52,7 +52,6 @@ def _setup_common_config(self, cfg: Any) -> None: self.cfg.attn_only = False self.cfg.uses_rms_norm = True self.cfg.default_prepend_bos = False - self.cfg.eps_attr = "variance_epsilon" self.default_config = { "d_model": cfg.d_model, diff --git a/transformer_lens/model_bridge/supported_architectures/internlm2.py b/transformer_lens/model_bridge/supported_architectures/internlm2.py index a5405e807..80c056a23 100644 --- a/transformer_lens/model_bridge/supported_architectures/internlm2.py +++ b/transformer_lens/model_bridge/supported_architectures/internlm2.py @@ -93,7 +93,6 @@ def __init__(self, cfg: Any) -> None: self.cfg.gated_mlp = True self.cfg.attn_only = False self.cfg.uses_rms_norm = True - self.cfg.eps_attr = "variance_epsilon" # Standard fold_ln silently skips attention when wqkv is fused (see class docstring). # preprocess_weights() handles it instead — same approach as phi3.py. diff --git a/transformer_lens/model_bridge/supported_architectures/llama.py b/transformer_lens/model_bridge/supported_architectures/llama.py index 5e98ceb37..b7e731a39 100644 --- a/transformer_lens/model_bridge/supported_architectures/llama.py +++ b/transformer_lens/model_bridge/supported_architectures/llama.py @@ -63,8 +63,6 @@ def __init__(self, cfg: Any) -> None: self.cfg.n_key_value_heads = cfg.n_key_value_heads self.cfg.uses_rms_norm = True - # Llama uses 'variance_epsilon' instead of 'eps' for RMSNorm - self.cfg.eps_attr = "variance_epsilon" self.weight_processing_conversions = { **self._qkvo_weight_conversions(), diff --git a/transformer_lens/model_bridge/supported_architectures/llava.py b/transformer_lens/model_bridge/supported_architectures/llava.py index 407ae72e4..4993ffdb5 100644 --- a/transformer_lens/model_bridge/supported_architectures/llava.py +++ b/transformer_lens/model_bridge/supported_architectures/llava.py @@ -62,7 +62,6 @@ def __init__(self, cfg: Any) -> None: self.cfg.attn_implementation = "eager" self.cfg.final_rms = True self.cfg.attn_only = False - self.cfg.eps_attr = "variance_epsilon" # GQA support if hasattr(cfg, "n_key_value_heads") and cfg.n_key_value_heads is not None: