6 changes: 5 additions & 1 deletion training/bf16_master_weight/train.py

@@ -292,7 +292,11 @@ def main():
input_ids = torch.randint(0, actual_vocab_size, (args.batch_size, args.seq_length), device=device)
labels = torch.randint(0, actual_vocab_size, (args.batch_size, args.seq_length), device=device)

# Forward pass with optional autocast.
# DeepSpeed already applies torch.autocast inside engine.forward(), but
# we wrap the entire forward+loss block so that loss_fn also runs under
# autocast. The nested autocast on engine.forward() is harmless:
# nesting torch.autocast with the same device type and dtype is a no-op.
if use_autocast:
with torch.autocast(device_type="cuda", dtype=autocast_dtype):
logits = model_engine(input_ids)
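The safety of the wrapping above rests on nested autocast being benign. A minimal sketch (not part of this PR; uses CPU autocast so it runs without a GPU, whereas the diff uses `device_type="cuda"`) showing that an inner `torch.autocast` with the same device type and dtype does not change the behavior established by the outer one:

```python
import torch

a = torch.randn(4, 4)
b = torch.randn(4, 4)

with torch.autocast(device_type="cpu", dtype=torch.bfloat16):
    # matmul is autocast-eligible, so it runs in bfloat16 here
    outer = a @ b
    with torch.autocast(device_type="cpu", dtype=torch.bfloat16):
        # nested context with identical settings: same result dtype
        inner = a @ b

assert outer.dtype == torch.bfloat16
assert inner.dtype == torch.bfloat16
```

This mirrors the PR's situation: the outer context is the explicit `with torch.autocast(...)` in `train.py`, and the inner one is the autocast DeepSpeed applies inside `engine.forward()`.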