@@ -284,7 +284,6 @@ def sync_data(self, data: Dict[str, torch.Tensor]) -> None:
         ray_broadcast_tensor_dict(data, src=0, device=self.device, group_name=f"sync_data_{self.producer_idx}")
 
     def loop(self) -> None:
-        # breakpoint()
         self.sync_model(0, 0)
         num_update_per_episode = len(self.train_dataloader) // self.num_microbatches
         num_valid_microbatches = num_update_per_episode * self.num_microbatches
@@ -620,10 +619,10 @@ async def generate(self, input_ids, attention_mask, **kwargs):
         rollouts = await asyncio.gather(*tasks)
         rollouts = {
             k: (
-                torch.cat([r[k] for r in rollouts], dim=0)
+                torch.cat([r[k] for r in rollouts], dim=0).cpu()
                 if k not in ["gt_answer", "test_cases"]
                 else [r[k] for r in rollouts]
-            ).cpu()  # CUDA tensor is not serializable by ray
+            )  # CUDA tensor is not serializable by ray
             for k in rollouts[0].keys()
         }
         rollouts["consumer_global_step"] = self.consumer_global_step
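
The hunk above moves `.cpu()` inside the conditional so it only applies to the concatenated tensor branch; the list-valued keys ("gt_answer", "test_cases") have no `.cpu()` method, while the tensors still land in host memory because, as the inline comment says, CUDA tensors are not serializable by Ray. A minimal standalone sketch of the same idea, assuming a plain dict of rollout values (the helper name and the isinstance check are illustrative, not part of the patch):

import torch

def to_cpu_for_ray(rollouts: dict) -> dict:
    # Move tensor values to host memory before the dict crosses process
    # boundaries (Ray cannot serialize CUDA tensors); non-tensor values such
    # as the gt_answer/test_cases lists are passed through unchanged.
    return {k: v.cpu() if isinstance(v, torch.Tensor) else v for k, v in rollouts.items()}

The line added in the producer loop further below does the reverse after the rollout returns, moving every tensor value back onto `self.device`.
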
@@ -758,8 +757,8 @@ async def loop(self) -> None:
                     self.eval_mode = False
                     self.latest_eval_step = self.consumer_global_step
                 self.profiler.enter("rollout")
-                # breakpoint()
                 outputs = await self.rollout(**batch)
+                outputs = {k: v.to(self.device) if isinstance(v, torch.Tensor) else v for k, v in outputs.items()}
                 self.profiler.exit("rollout")
                 outputs["temperature"] = torch.tensor(
                     [self.model.generate_config["temperature"]] * outputs["input_ids"].size(0)
@@ -803,6 +802,8 @@ async def loop(self) -> None:
                     outputs.pop("gt_answer")
                 if "test_cases" in outputs:
                     outputs.pop("test_cases")
+                if "consumer_global_step" in outputs:
+                    outputs.pop("consumer_global_step")
                 self.profiler.exit("calculate_reward")
 
                 print(f"[P{self.producer_idx}] Send data {[(k, v.shape) for k, v in outputs.items()]}")
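
The last hunk extends the cleanup that runs before the outputs dict is logged and sent: "consumer_global_step" is a scalar bookkeeping value attached to the dict earlier (see `rollouts["consumer_global_step"]` above), so it is popped alongside the other non-tensor keys, leaving only tensor values for the `v.shape` print and the subsequent send. A compact sketch of that cleanup, assuming the same key names (the constant and helper are illustrative, not part of the patch):

NON_TENSOR_KEYS = ("gt_answer", "test_cases", "consumer_global_step")

def strip_non_tensor_keys(outputs: dict) -> dict:
    # Drop bookkeeping entries that are not torch tensors so the remaining
    # dict holds only tensors; pop(key, None) is a no-op for absent keys.
    for key in NON_TENSOR_KEYS:
        outputs.pop(key, None)
    return outputs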