diff --git a/transformer_lens/BertNextSentencePrediction.py b/transformer_lens/BertNextSentencePrediction.py index eb38e6879..fb6f8e8fa 100644 --- a/transformer_lens/BertNextSentencePrediction.py +++ b/transformer_lens/BertNextSentencePrediction.py @@ -153,7 +153,7 @@ def forward( "[CLS] Sentence A [SEP] Sentence B [SEP]", token_type_ids would be [0, 0, ..., 0, 1, ..., 1, 1]. `0` represents tokens from Sentence A, `1` from Sentence B. If not provided, BERT assumes a single sequence input. - This parameter gets inferred from the the tokenizer if input is a string or list of strings. + This parameter gets inferred from the tokenizer if input is a string or list of strings. Shape is (batch_size, sequence_length). one_zero_attention_mask: Optional[torch.Tensor]: A binary mask which indicates which tokens should be attended to (1) and which should be ignored (0). diff --git a/transformer_lens/HookedEncoder.py b/transformer_lens/HookedEncoder.py index f71a9c75a..9dffc07e4 100644 --- a/transformer_lens/HookedEncoder.py +++ b/transformer_lens/HookedEncoder.py @@ -251,7 +251,7 @@ def forward( "[CLS] Sentence A [SEP] Sentence B [SEP]", token_type_ids would be [0, 0, ..., 0, 1, ..., 1, 1]. `0` represents tokens from Sentence A, `1` from Sentence B. If not provided, BERT assumes a single sequence input. - This parameter gets inferred from the the tokenizer if input is a string or list of strings. + This parameter gets inferred from the tokenizer if input is a string or list of strings. Shape is (batch_size, sequence_length). one_zero_attention_mask: Optional[torch.Tensor]: A binary mask which indicates which tokens should be attended to (1) and which should be ignored (0). 
diff --git a/transformer_lens/HookedTransformer.py b/transformer_lens/HookedTransformer.py index 9baf479e5..8370bbf8a 100644 --- a/transformer_lens/HookedTransformer.py +++ b/transformer_lens/HookedTransformer.py @@ -1509,7 +1509,7 @@ def init_weights(self): The default PyTorch scheme is the following: all linear layers use uniform(-1/sqrt(fan_in), 1/sqrt(fan_in)) for weights, and uniform(-1/sqrt(fan_in), 1/sqrt(fan_in)) for biases. For biases, fan_in is computed using the fan_in for the weight matrix of the linear layer. Note - tha it *does not actually* use Kaiming initialization, despite the fact that it calls the + that it *does not actually* use Kaiming initialization, despite the fact that it calls the function. However, for Transformer blocks, it instead initializes biases to zero and weights using Xavier uniform, that diff --git a/transformer_lens/components/abstract_attention.py b/transformer_lens/components/abstract_attention.py index f2dd0338b..b7a8cabbd 100644 --- a/transformer_lens/components/abstract_attention.py +++ b/transformer_lens/components/abstract_attention.py @@ -175,7 +175,7 @@ def __init__( self.register_buffer("rotary_sin", sin) self.register_buffer("rotary_cos", cos) elif self.cfg.positional_embedding_type == "alibi": - # ALiBi bias wil be constructed on the first forward pass. + # ALiBi bias will be constructed on the first forward pass. # Note: While computationally efficient, initializing an bias with max n_ctx (16, 1024, 1024) of float32 will occupy ~256MiB of contiguous GPU memory, which may not be optimal for memory usage. 
self.alibi = None diff --git a/transformer_lens/components/grouped_query_attention.py b/transformer_lens/components/grouped_query_attention.py index d9fbbf8a8..550542960 100644 --- a/transformer_lens/components/grouped_query_attention.py +++ b/transformer_lens/components/grouped_query_attention.py @@ -153,7 +153,7 @@ def calculate_attention_scores( k: Float[torch.Tensor, "batch key_pos kv_head_index d_head"], ) -> Float[torch.Tensor, "batch head_index query_pos key_pos"]: """Calculate attention scores from Q and the unexpanded K matrix. - K will be expaned from [batch, pos, n_key_value_head, d_head] to [batch, pos, n_query_heads, d_head] using torch.repeat_interleave. + K will be expanded from [batch, pos, n_key_value_head, d_head] to [batch, pos, n_query_heads, d_head] using torch.repeat_interleave. Args: q (Float[torch.Tensor, "batch query_pos head_index d_head"]): The Q tensor. @@ -172,7 +172,7 @@ def calculate_z_scores( pattern: Float[torch.Tensor, "batch head_index query_pos key_pos"], ) -> Float[torch.Tensor, "batch query_pos head_index d_head"]: """Calculate z scores from the attention pattern and the unexpanded V matrix. - V will be expaned from [batch, pos, n_key_value_head, d_head] to [batch, pos, n_query_heads, d_head] using torch.repeat_interleave. + V will be expanded from [batch, pos, n_key_value_head, d_head] to [batch, pos, n_query_heads, d_head] using torch.repeat_interleave. Args: v (Float[torch.Tensor, "batch query_pos head_index d_head"]): The V tensor. diff --git a/transformer_lens/components/t5_block.py b/transformer_lens/components/t5_block.py index 88d5467e0..e93b70e7f 100644 --- a/transformer_lens/components/t5_block.py +++ b/transformer_lens/components/t5_block.py @@ -16,7 +16,7 @@ class T5Block(nn.Module): """ - T5 decoder Block. Uses T5Layernorm, and T5attention insted of usual ones. + T5 decoder Block. Uses T5Layernorm, and T5attention instead of usual ones. Also uses cross attention if is_decoder is True. 
""" diff --git a/transformer_lens/head_detector.py b/transformer_lens/head_detector.py index 9efd237ff..e05ad6412 100644 --- a/transformer_lens/head_detector.py +++ b/transformer_lens/head_detector.py @@ -28,7 +28,7 @@ SEQ_LEN_ERR = "The sequence must be non-empty and must fit within the model's context window." -DET_PAT_NOT_SQUARE_ERR = "The detection pattern must be a lower triangular matrix of shape (sequence_length, sequence_length); sequence_length=%d; got detection patern of shape %s" +DET_PAT_NOT_SQUARE_ERR = "The detection pattern must be a lower triangular matrix of shape (sequence_length, sequence_length); sequence_length=%d; got detection pattern of shape %s" def detect_head( @@ -87,7 +87,7 @@ def detect_head( Currently available heads are: `["previous_token_head", "duplicate_token_head", "induction_head"]`. heads: If specific attention heads is given here, all other heads' score is set to -1. - Useful for IOI-style circuit analysis. Heads can be spacified as a list tuples (layer, + Useful for IOI-style circuit analysis. Heads can be specified as a list of tuples (layer, head) or a dictionary mapping a layer to heads within that layer that we want to analyze. cache: Include the cache to save time if you want. exclude_bos: Exclude attention paid to the beginning of sequence token. 
diff --git a/transformer_lens/model_bridge/bridge.py b/transformer_lens/model_bridge/bridge.py index 6d9228c96..3badb26f0 100644 --- a/transformer_lens/model_bridge/bridge.py +++ b/transformer_lens/model_bridge/bridge.py @@ -2094,7 +2094,7 @@ def cache_hook(tensor: torch.Tensor, *, hook: Any) -> torch.Tensor: try: if hasattr(tensor, "detach"): cache[name] = tensor.detach().to(cache_device) - except: + except Exception: pass return tensor diff --git a/transformer_lens/pretrained/weight_conversions/neox.py b/transformer_lens/pretrained/weight_conversions/neox.py index ff84b5b0d..70d8a6b48 100644 --- a/transformer_lens/pretrained/weight_conversions/neox.py +++ b/transformer_lens/pretrained/weight_conversions/neox.py @@ -14,7 +14,7 @@ def convert_neox_weights(neox, cfg: HookedTransformerConfig): state_dict[f"blocks.{l}.ln1.b"] = neox.gpt_neox.layers[l].input_layernorm.bias # For some inexplicable reason, NeoX both uses the concatenated QKV - # matmul of GPT-2 (afaict this has a neglible performance impact) AND + # matmul of GPT-2 (afaict this has a negligible performance impact) AND # has the flattened axis in the DIFFERENT order of (head_index qkv # d_head) - this took me an hour to debug... W = neox.gpt_neox.layers[l].attention.query_key_value.weight diff --git a/transformer_lens/utilities/defaults_utils.py b/transformer_lens/utilities/defaults_utils.py index 84826e395..7745ba8ac 100644 --- a/transformer_lens/utilities/defaults_utils.py +++ b/transformer_lens/utilities/defaults_utils.py @@ -33,7 +33,7 @@ class LocallyOverridenDefaults: WARNING: This context manager must be used for any function/method that directly accesses default values which may be overridden by the user using the function/method's arguments, e.g., `model.cfg.default_prepend_bos` and `model.tokenizer.padding_side` which can be - overriden by `prepend_bos` and `padding_side` arguments, respectively, in the `to_tokens`. 
+ overridden by `prepend_bos` and `padding_side` arguments, respectively, in `to_tokens`. """ def __init__(self, model, **overrides): diff --git a/transformer_lens/utilities/lm_utils.py b/transformer_lens/utilities/lm_utils.py index 2c0ffed43..a3d4f7932 100644 --- a/transformer_lens/utilities/lm_utils.py +++ b/transformer_lens/utilities/lm_utils.py @@ -1,6 +1,6 @@ """lm_utils. -This module contains utility functions related to langauge models +This module contains utility functions related to language models """ from __future__ import annotations diff --git a/transformer_lens/utilities/logits_utils.py b/transformer_lens/utilities/logits_utils.py index 2baacc22f..8d93331d4 100644 --- a/transformer_lens/utilities/logits_utils.py +++ b/transformer_lens/utilities/logits_utils.py @@ -117,7 +117,7 @@ def sample_logits( len(tokens.shape) == 2 ), "Frequency penalty do not support input in the form of embeddings" for batch_index in range(final_logits.shape[0]): - # torch.bincount returns a tensor of length d_vocab, with the number of occurences of each token in the tokens. + # torch.bincount returns a tensor of length d_vocab, with the number of occurrences of each token in the tokens. final_logits[batch_index] = final_logits[ batch_index ] - freq_penalty * torch.bincount( diff --git a/transformer_lens/utilities/slice.py b/transformer_lens/utilities/slice.py index f1c4b0245..98cba6404 100644 --- a/transformer_lens/utilities/slice.py +++ b/transformer_lens/utilities/slice.py @@ -1,6 +1,6 @@ """Slice. 
-This module contains the functionailty for the Slice object +This module contains the functionality for the Slice object """ from __future__ import annotations diff --git a/transformer_lens/utilities/tensors.py b/transformer_lens/utilities/tensors.py index 5e0971b13..f233fc97d 100644 --- a/transformer_lens/utilities/tensors.py +++ b/transformer_lens/utilities/tensors.py @@ -121,7 +121,7 @@ def get_offset_position_ids( """ Returns the indices of non-padded tokens, offset by the position of the first attended token. """ - # shift the position ids so that the id at the the first attended token position becomes zero. + # shift the position ids so that the id at the first attended token position becomes zero. # The position ids of the prepending pad tokens are shifted to -1. shifted_position_ids = attention_mask.cumsum(dim=1) - 1 # [batch, tokens_length]