Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion transformer_lens/BertNextSentencePrediction.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ def forward(
"[CLS] Sentence A [SEP] Sentence B [SEP]", token_type_ids would be
[0, 0, ..., 0, 1, ..., 1, 1]. `0` represents tokens from Sentence A,
`1` from Sentence B. If not provided, BERT assumes a single sequence input.
This parameter gets inferred from the the tokenizer if input is a string or list of strings.
This parameter gets inferred from the tokenizer if input is a string or list of strings.
Shape is (batch_size, sequence_length).
one_zero_attention_mask: Optional[torch.Tensor]: A binary mask which indicates
which tokens should be attended to (1) and which should be ignored (0).
Expand Down
2 changes: 1 addition & 1 deletion transformer_lens/HookedEncoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,7 +251,7 @@ def forward(
"[CLS] Sentence A [SEP] Sentence B [SEP]", token_type_ids would be
[0, 0, ..., 0, 1, ..., 1, 1]. `0` represents tokens from Sentence A,
`1` from Sentence B. If not provided, BERT assumes a single sequence input.
This parameter gets inferred from the the tokenizer if input is a string or list of strings.
This parameter gets inferred from the tokenizer if input is a string or list of strings.
Shape is (batch_size, sequence_length).
one_zero_attention_mask: Optional[torch.Tensor]: A binary mask which indicates
which tokens should be attended to (1) and which should be ignored (0).
Expand Down
2 changes: 1 addition & 1 deletion transformer_lens/HookedTransformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -1509,7 +1509,7 @@ def init_weights(self):
The default PyTorch scheme is the following: all linear layers use uniform(-1/sqrt(fan_in),
1/sqrt(fan_in)) for weights, and uniform(-1/sqrt(fan_in), 1/sqrt(fan_in)) for biases. For
biases, fan_in is computed using the fan_in for the weight matrix of the linear layer. Note
tha it *does not actually* use Kaiming initialization, despite the fact that it calls the
that it *does not actually* use Kaiming initialization, despite the fact that it calls the
function.

However, for Transformer blocks, it instead initializes biases to zero and weights using Xavier uniform, that
Expand Down
2 changes: 1 addition & 1 deletion transformer_lens/components/abstract_attention.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ def __init__(
self.register_buffer("rotary_sin", sin)
self.register_buffer("rotary_cos", cos)
elif self.cfg.positional_embedding_type == "alibi":
# ALiBi bias wil be constructed on the first forward pass.
# ALiBi bias will be constructed on the first forward pass.
# Note: While computationally efficient, initializing a bias with max n_ctx (16, 1024, 1024) of float32 will occupy ~256MiB of contiguous GPU memory, which may not be optimal for memory usage.
self.alibi = None

Expand Down
4 changes: 2 additions & 2 deletions transformer_lens/components/grouped_query_attention.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ def calculate_attention_scores(
k: Float[torch.Tensor, "batch key_pos kv_head_index d_head"],
) -> Float[torch.Tensor, "batch head_index query_pos key_pos"]:
"""Calculate attention scores from Q and the unexpanded K matrix.
K will be expaned from [batch, pos, n_key_value_head, d_head] to [batch, pos, n_query_heads, d_head] using torch.repeat_interleave.
K will be expanded from [batch, pos, n_key_value_head, d_head] to [batch, pos, n_query_heads, d_head] using torch.repeat_interleave.

Args:
q (Float[torch.Tensor, "batch query_pos head_index d_head"]): The Q tensor.
Expand All @@ -172,7 +172,7 @@ def calculate_z_scores(
pattern: Float[torch.Tensor, "batch head_index query_pos key_pos"],
) -> Float[torch.Tensor, "batch query_pos head_index d_head"]:
"""Calculate z scores from the attention pattern and the unexpanded V matrix.
V will be expaned from [batch, pos, n_key_value_head, d_head] to [batch, pos, n_query_heads, d_head] using torch.repeat_interleave.
V will be expanded from [batch, pos, n_key_value_head, d_head] to [batch, pos, n_query_heads, d_head] using torch.repeat_interleave.

Args:
v (Float[torch.Tensor, "batch query_pos head_index d_head"]): The V tensor.
Expand Down
2 changes: 1 addition & 1 deletion transformer_lens/components/t5_block.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

class T5Block(nn.Module):
"""
T5 decoder Block. Uses T5Layernorm, and T5attention insted of usual ones.
T5 decoder Block. Uses T5Layernorm, and T5attention instead of usual ones.
Also uses cross attention if is_decoder is True.
"""

Expand Down
4 changes: 2 additions & 2 deletions transformer_lens/head_detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@

SEQ_LEN_ERR = "The sequence must be non-empty and must fit within the model's context window."

DET_PAT_NOT_SQUARE_ERR = "The detection pattern must be a lower triangular matrix of shape (sequence_length, sequence_length); sequence_length=%d; got detection patern of shape %s"
DET_PAT_NOT_SQUARE_ERR = "The detection pattern must be a lower triangular matrix of shape (sequence_length, sequence_length); sequence_length=%d; got detection pattern of shape %s"


def detect_head(
Expand Down Expand Up @@ -87,7 +87,7 @@ def detect_head(
Currently available heads are: `["previous_token_head", "duplicate_token_head",
"induction_head"]`.
heads: If specific attention heads are given here, all other heads' scores are set to -1.
Useful for IOI-style circuit analysis. Heads can be spacified as a list tuples (layer,
Useful for IOI-style circuit analysis. Heads can be specified as a list of tuples (layer,
head) or a dictionary mapping a layer to heads within that layer that we want to
analyze.
cache: Include the cache to save time if you want.
exclude_bos: Exclude attention paid to the beginning of sequence token.
Expand Down
2 changes: 1 addition & 1 deletion transformer_lens/model_bridge/bridge.py
Original file line number Diff line number Diff line change
Expand Up @@ -2094,7 +2094,7 @@ def cache_hook(tensor: torch.Tensor, *, hook: Any) -> torch.Tensor:
try:
if hasattr(tensor, "detach"):
cache[name] = tensor.detach().to(cache_device)
except:
except Exception:
pass
return tensor

Expand Down
2 changes: 1 addition & 1 deletion transformer_lens/pretrained/weight_conversions/neox.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ def convert_neox_weights(neox, cfg: HookedTransformerConfig):
state_dict[f"blocks.{l}.ln1.b"] = neox.gpt_neox.layers[l].input_layernorm.bias

# For some inexplicable reason, NeoX both uses the concatenated QKV
# matmul of GPT-2 (afaict this has a neglible performance impact) AND
# matmul of GPT-2 (afaict this has a negligible performance impact) AND
# has the flattened axis in the DIFFERENT order of (head_index qkv
# d_head) - this took me an hour to debug...
W = neox.gpt_neox.layers[l].attention.query_key_value.weight
Expand Down
2 changes: 1 addition & 1 deletion transformer_lens/utilities/defaults_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ class LocallyOverridenDefaults:
WARNING: This context manager must be used for any function/method that directly accesses
default values which may be overridden by the user using the function/method's arguments,
e.g., `model.cfg.default_prepend_bos` and `model.tokenizer.padding_side` which can be
overriden by `prepend_bos` and `padding_side` arguments, respectively, in the `to_tokens`.
overridden by `prepend_bos` and `padding_side` arguments, respectively, in the `to_tokens`.
"""

def __init__(self, model, **overrides):
Expand Down
2 changes: 1 addition & 1 deletion transformer_lens/utilities/lm_utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""lm_utils.

This module contains utility functions related to langauge models
This module contains utility functions related to language models
"""

from __future__ import annotations
Expand Down
2 changes: 1 addition & 1 deletion transformer_lens/utilities/logits_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ def sample_logits(
len(tokens.shape) == 2
), "Frequency penalty do not support input in the form of embeddings"
for batch_index in range(final_logits.shape[0]):
# torch.bincount returns a tensor of length d_vocab, with the number of occurences of each token in the tokens.
# torch.bincount returns a tensor of length d_vocab, with the number of occurrences of each token in the tokens.
final_logits[batch_index] = final_logits[
batch_index
] - freq_penalty * torch.bincount(
Expand Down
2 changes: 1 addition & 1 deletion transformer_lens/utilities/slice.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""Slice.

This module contains the functionailty for the Slice object
This module contains the functionality for the Slice object
"""

from __future__ import annotations
Expand Down
2 changes: 1 addition & 1 deletion transformer_lens/utilities/tensors.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ def get_offset_position_ids(
"""
Returns the indices of non-padded tokens, offset by the position of the first attended token.
"""
# shift the position ids so that the id at the the first attended token position becomes zero.
# shift the position ids so that the id at the first attended token position becomes zero.
# The position ids of the prepending pad tokens are shifted to -1.
shifted_position_ids = attention_mask.cumsum(dim=1) - 1 # [batch, tokens_length]

Expand Down
Loading