TransformerLensOrg · RecreationalMath · Jun 12, 2026
diff --git a/transformer_lens/BertNextSentencePrediction.py b/transformer_lens/BertNextSentencePrediction.py
@@ -153,7 +153,7 @@ def forward(
                 "[CLS] Sentence A [SEP] Sentence B [SEP]", token_type_ids would be
                 [0, 0, ..., 0, 1, ..., 1, 1]. `0` represents tokens from Sentence A,
                 `1` from Sentence B. If not provided, BERT assumes a single sequence input.
-                This parameter gets inferred from the the tokenizer if input is a string or list of strings.
+                This parameter gets inferred from the tokenizer if input is a string or list of strings.
                 Shape is (batch_size, sequence_length).
             one_zero_attention_mask: Optional[torch.Tensor]: A binary mask which indicates
                 which tokens should be attended to (1) and which should be ignored (0).

diff --git a/transformer_lens/HookedEncoder.py b/transformer_lens/HookedEncoder.py
@@ -251,7 +251,7 @@ def forward(
                 "[CLS] Sentence A [SEP] Sentence B [SEP]", token_type_ids would be
                 [0, 0, ..., 0, 1, ..., 1, 1]. `0` represents tokens from Sentence A,
                 `1` from Sentence B. If not provided, BERT assumes a single sequence input.
-                This parameter gets inferred from the the tokenizer if input is a string or list of strings.
+                This parameter gets inferred from the tokenizer if input is a string or list of strings.
                 Shape is (batch_size, sequence_length).
             one_zero_attention_mask: Optional[torch.Tensor]: A binary mask which indicates
                 which tokens should be attended to (1) and which should be ignored (0).

diff --git a/transformer_lens/HookedTransformer.py b/transformer_lens/HookedTransformer.py
@@ -1509,7 +1509,7 @@ def init_weights(self):
         The default PyTorch scheme is the following: all linear layers use uniform(-1/sqrt(fan_in),
         1/sqrt(fan_in)) for weights, and uniform(-1/sqrt(fan_in), 1/sqrt(fan_in)) for biases. For
         biases, fan_in is computed using the fan_in for the weight matrix of the linear layer. Note
-        tha it *does not actually* use Kaiming initialization, despite the fact that it calls the
+        that it *does not actually* use Kaiming initialization, despite the fact that it calls the
         function.
 
         However, for Transformer blocks, it instead initializes biases to zero and weights using Xavier uniform, that

diff --git a/transformer_lens/components/abstract_attention.py b/transformer_lens/components/abstract_attention.py
@@ -175,7 +175,7 @@ def __init__(
             self.register_buffer("rotary_sin", sin)
             self.register_buffer("rotary_cos", cos)
         elif self.cfg.positional_embedding_type == "alibi":
-            # ALiBi bias wil be constructed on the first forward pass.
+            # ALiBi bias will be constructed on the first forward pass.
             # Note: While computationally efficient, initializing an bias with max n_ctx (16, 1024, 1024) of float32 will occupy ~256MiB of contiguous GPU memory, which may not be optimal for memory usage.
             self.alibi = None
 

diff --git a/transformer_lens/components/grouped_query_attention.py b/transformer_lens/components/grouped_query_attention.py
@@ -153,7 +153,7 @@ def calculate_attention_scores(
         k: Float[torch.Tensor, "batch key_pos kv_head_index d_head"],
     ) -> Float[torch.Tensor, "batch head_index query_pos key_pos"]:
         """Calculate attention scores from Q and the unexpanded K matrix.
-        K will be expaned from [batch, pos, n_key_value_head, d_head] to [batch, pos, n_query_heads, d_head] using torch.repeat_interleave.
+        K will be expanded from [batch, pos, n_key_value_head, d_head] to [batch, pos, n_query_heads, d_head] using torch.repeat_interleave.
 
         Args:
         q (Float[torch.Tensor, "batch query_pos head_index d_head"]): The Q tensor.
@@ -172,7 +172,7 @@ def calculate_z_scores(
         pattern: Float[torch.Tensor, "batch head_index query_pos key_pos"],
     ) -> Float[torch.Tensor, "batch query_pos head_index d_head"]:
         """Calculate z scores from the attention pattern and the unexpanded V matrix.
-        V will be expaned from [batch, pos, n_key_value_head, d_head] to [batch, pos, n_query_heads, d_head] using torch.repeat_interleave.
+        V will be expanded from [batch, pos, n_key_value_head, d_head] to [batch, pos, n_query_heads, d_head] using torch.repeat_interleave.
 
         Args:
         v (Float[torch.Tensor, "batch query_pos head_index d_head"]): The V tensor.

diff --git a/transformer_lens/components/t5_block.py b/transformer_lens/components/t5_block.py
@@ -16,7 +16,7 @@
 
 class T5Block(nn.Module):
     """
-    T5 decoder Block. Uses T5Layernorm, and T5attention insted of usual ones.
+    T5 decoder Block. Uses T5Layernorm and T5attention instead of the usual ones.
     Also uses cross attention if is_decoder is True.
     """
 

diff --git a/transformer_lens/head_detector.py b/transformer_lens/head_detector.py
@@ -28,7 +28,7 @@
 
 SEQ_LEN_ERR = "The sequence must be non-empty and must fit within the model's context window."
 
-DET_PAT_NOT_SQUARE_ERR = "The detection pattern must be a lower triangular matrix of shape (sequence_length, sequence_length); sequence_length=%d; got detection patern of shape %s"
+DET_PAT_NOT_SQUARE_ERR = "The detection pattern must be a lower triangular matrix of shape (sequence_length, sequence_length); sequence_length=%d; got detection pattern of shape %s"
 
 
 def detect_head(
@@ -87,7 +87,7 @@ def detect_head(
             Currently available heads are: `["previous_token_head", "duplicate_token_head",
             "induction_head"]`.
         heads: If specific attention heads is given here, all other heads' score is set to -1.
-            Useful for IOI-style circuit analysis. Heads can be spacified as a list tuples (layer,
+            Useful for IOI-style circuit analysis. Heads can be specified as a list of tuples (layer,
             head) or a dictionary mapping a layer to heads within that layer that we want to
             analyze. cache: Include the cache to save time if you want.
         exclude_bos: Exclude attention paid to the beginning of sequence token.

diff --git a/transformer_lens/model_bridge/bridge.py b/transformer_lens/model_bridge/bridge.py
@@ -2094,7 +2094,7 @@ def cache_hook(tensor: torch.Tensor, *, hook: Any) -> torch.Tensor:
                     try:
                         if hasattr(tensor, "detach"):
                             cache[name] = tensor.detach().to(cache_device)
-                    except:
+                    except Exception:
                         pass
                 return tensor
 

diff --git a/transformer_lens/pretrained/weight_conversions/neox.py b/transformer_lens/pretrained/weight_conversions/neox.py
@@ -14,7 +14,7 @@ def convert_neox_weights(neox, cfg: HookedTransformerConfig):
         state_dict[f"blocks.{l}.ln1.b"] = neox.gpt_neox.layers[l].input_layernorm.bias
 
         # For some inexplicable reason, NeoX both uses the concatenated QKV
-        # matmul of GPT-2 (afaict this has a neglible performance impact) AND
+        # matmul of GPT-2 (afaict this has a negligible performance impact) AND
         # has the flattened axis in the DIFFERENT order of (head_index qkv
         # d_head) - this took me an hour to debug...
         W = neox.gpt_neox.layers[l].attention.query_key_value.weight

diff --git a/transformer_lens/utilities/defaults_utils.py b/transformer_lens/utilities/defaults_utils.py
@@ -33,7 +33,7 @@ class LocallyOverridenDefaults:
     WARNING: This context manager must be used for any function/method that directly accesses
     default values which may be overridden by the user using the function/method's arguments,
     e.g., `model.cfg.default_prepend_bos` and `model.tokenizer.padding_side` which can be
-    overriden by `prepend_bos` and `padding_side` arguments, respectively, in the `to_tokens`.
+    overridden by the `prepend_bos` and `padding_side` arguments, respectively, of `to_tokens`.
     """
 
     def __init__(self, model, **overrides):

diff --git a/transformer_lens/utilities/lm_utils.py b/transformer_lens/utilities/lm_utils.py
@@ -1,6 +1,6 @@
 """lm_utils.
 
-This module contains utility functions related to langauge models
+This module contains utility functions related to language models
 """
 
 from __future__ import annotations

diff --git a/transformer_lens/utilities/logits_utils.py b/transformer_lens/utilities/logits_utils.py
@@ -117,7 +117,7 @@ def sample_logits(
                 len(tokens.shape) == 2
             ), "Frequency penalty do not support input in the form of embeddings"
             for batch_index in range(final_logits.shape[0]):
-                # torch.bincount returns a tensor of length d_vocab, with the number of occurences of each token in the tokens.
+                # torch.bincount returns a tensor of length d_vocab, with the number of occurrences of each token in the tokens.
                 final_logits[batch_index] = final_logits[
                     batch_index
                 ] - freq_penalty * torch.bincount(

diff --git a/transformer_lens/utilities/slice.py b/transformer_lens/utilities/slice.py
@@ -1,6 +1,6 @@
 """Slice.
 
-This module contains the functionailty for the Slice object
+This module contains the functionality for the Slice object
 """
 
 from __future__ import annotations

diff --git a/transformer_lens/utilities/tensors.py b/transformer_lens/utilities/tensors.py
@@ -121,7 +121,7 @@ def get_offset_position_ids(
     """
     Returns the indices of non-padded tokens, offset by the position of the first attended token.
     """
-    # shift the position ids so that the id at the the first attended token position becomes zero.
+    # shift the position ids so that the id at the first attended token position becomes zero.
     # The position ids of the prepending pad tokens are shifted to -1.
     shifted_position_ids = attention_mask.cumsum(dim=1) - 1  # [batch, tokens_length]