10 changes: 8 additions & 2 deletions candle-transformers/src/models/qwen3_vl/mod.rs
@@ -67,16 +67,22 @@ impl Qwen3VLModel {
         seqlen_offsets: &[usize],
     ) -> Result<Tensor> {
         let (bs, seqlen) = input_ids.dims2()?;
+        // The causal mask is only needed when processing more than one token at once
+        // (prefill / multi-token prompt). For single-token autoregressive decode steps
+        // we can rely on the KV cache and skip building the mask. The previous logic
+        // had this condition inverted (issue #3505), which built the mask only for
+        // single-token steps and skipped it during prefill, leaving the model with
+        // no attention mask when it actually needed one.
         let attention_mask = if seqlen <= 1 {
+            None
+        } else {
             Some(self.prepare_decoder_attention_mask(
                 bs,
                 seqlen,
                 seqlen_offsets[0],
                 self.text.dtype,
                 input_ids.device(),
             )?)
-        } else {
-            None
         };

         let mut input_embeds = self.text.embed_tokens(input_ids)?;
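For readers who want to see what the mask being built (or skipped) actually contains, here is a minimal sketch in the style several candle-transformers models use for this kind of helper. It assumes only `candle_core`'s public API; `causal_mask` is a hypothetical free function for illustration, not the `prepare_decoder_attention_mask` method in this file, whose details may differ.

```rust
use candle_core::{DType, Device, Result, Tensor, D};

// Hypothetical sketch of a causal-mask builder for a prefill step.
// Output shape: (bs, 1, seqlen, seqlen_offset + seqlen). Entries are 0
// where attention is allowed and -inf where it is not, so softmax zeroes
// out the disallowed positions.
fn causal_mask(
    bs: usize,
    seqlen: usize,
    seqlen_offset: usize,
    dtype: DType,
    device: &Device,
) -> Result<Tensor> {
    // Lower-triangular (seqlen, seqlen) block over the new tokens:
    // token i may attend to tokens j <= i within the chunk.
    let mask: Vec<f32> = (0..seqlen)
        .flat_map(|i| (0..seqlen).map(move |j| if j > i { f32::NEG_INFINITY } else { 0.0 }))
        .collect();
    let mask = Tensor::from_slice(&mask, (seqlen, seqlen), device)?;
    // All-zero columns on the left for the cached positions: every new
    // token may attend to the entire KV cache.
    let mask = if seqlen_offset > 0 {
        let cached = Tensor::zeros((seqlen, seqlen_offset), DType::F32, device)?;
        Tensor::cat(&[&cached, &mask], D::Minus1)?
    } else {
        mask
    };
    // Broadcast to one mask per batch element, shared across heads.
    mask.expand((bs, 1, seqlen, seqlen_offset + seqlen))?
        .to_dtype(dtype)
}
```

With `seqlen == 1` the single row would be all zeros anyway, since the new token may attend to everything already cached, which is why the decode path can skip the mask entirely and rely on the KV cache. That is exactly the behavior the corrected condition restores.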