NVIDIA · kevalmorabia97 · Jun 15, 2026 · Jun 15, 2026
@@ -43,24 +43,26 @@ python -m pytest tests/gpu/torch/puzzletron/test_puzzletron.py -k "Qwen3-8B"
 
 - For this example we are using 2x NVIDIA H100 80GB HBM3 to show multi-GPU steps. You can also use a single GPU.
 
-- To make use of [Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) and [Nemotron-Post-Training-Dataset-v2](https://huggingface.co/datasets/nvidia/Nemotron-Post-Training-Dataset-v2), you need to accept the terms and conditions for the corresponding model and the dataset in the Huggingface Hub. Log in to the Huggingface Hub and enter your HF token.
+- To make use of [Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) and [Puzzle-KD-Nemotron-Post-Training-Dataset-v2](https://huggingface.co/datasets/nvidia/Puzzle-KD-Nemotron-Post-Training-Dataset-v2), you need to accept the terms and conditions for the corresponding model and dataset on the Hugging Face Hub. Log in to the Hugging Face Hub and enter your HF token.
 
 ```bash
 hf auth login --token <your token>
 ```
 
 ## Compress the Model
 
-1. Download and prepare the [Nemotron-Post-Training-Dataset-v2](https://huggingface.co/datasets/nvidia/Nemotron-Post-Training-Dataset-v2).
+1. Download and prepare the dataset.
 
-   dataset split: "code", "math", "stem", "chat", excluding reasoning samples (2.62GB)
+   **Default (recommended):** Use the prebuilt [Puzzle-KD-Nemotron-Post-Training-Dataset-v2](https://huggingface.co/datasets/nvidia/Puzzle-KD-Nemotron-Post-Training-Dataset-v2) (~3 GB disk required).
 
    ```bash
    python -m modelopt.torch.puzzletron.dataset.prepare_dataset \
-      --dataset_name nvidia/Nemotron-Post-Training-Dataset-v2 \
-      --output_dir path/to/Nemotron-Post-Training-Dataset-v2
+      --dataset_name nvidia/Puzzle-KD-Nemotron-Post-Training-Dataset-v2 \
+      --output_dir path/to/Puzzle-KD-Nemotron-Post-Training-Dataset-v2
    ```
 
+   > **Note:** Alternatively, you can derive the dataset from the raw [Nemotron-Post-Training-Dataset-v2](https://huggingface.co/datasets/nvidia/Nemotron-Post-Training-Dataset-v2) by passing `--dataset_name nvidia/Nemotron-Post-Training-Dataset-v2`. This downloads the full raw dataset (~136 GB) before filtering it down to the same ~2.6 GB result. Only do this if you need to reproduce the preprocessing from scratch.
+
 2. Specify the `puzzle_dir`, `input_hf_model_path`, `dataset_path`, `intermediate_size_list`, and `target_memory` arguments in the [llama-3_1-8B_pruneffn_memory.yaml](./configs/llama-3_1-8B_pruneffn_memory/llama-3_1-8B_pruneffn_memory.yaml) configuration file.
 
    - `puzzle_dir` indicates a new directory for saving the resulting model.

@@ -6,7 +6,7 @@ defaults:
 input_hf_model_path: /workspace/hf_models/openai/gpt-oss-20b
 
 # Dataset path for pruning and NAS scoring
-dataset_path: /workspace/datasets/Nemotron-Post-Training-Dataset-v2
+dataset_path: /workspace/datasets/Puzzle-KD-Nemotron-Post-Training-Dataset-v2
 
 # Working directory for compression outputs
 puzzle_dir: /workspace/puzzle_dir

@@ -6,7 +6,7 @@ defaults:
 input_hf_model_path: /workspace/hf_models/meta-llama/Llama-3.1-8B-Instruct
 
 # Dataset path for pruning and NAS scoring
-dataset_path: /workspace/datasets/Nemotron-Post-Training-Dataset-v2
+dataset_path: /workspace/datasets/Puzzle-KD-Nemotron-Post-Training-Dataset-v2
 
 # Working directory for puzzletron outputs
 puzzle_dir: /workspace/puzzle_dir

@@ -6,7 +6,7 @@ defaults:
 input_hf_model_path: /workspace/hf_models/meta-llama/Llama-3.1-8B-Instruct
 
 # Dataset path for pruning and NAS scoring
-dataset_path: /workspace/datasets/Nemotron-Post-Training-Dataset-v2
+dataset_path: /workspace/datasets/Puzzle-KD-Nemotron-Post-Training-Dataset-v2
 
 # Working directory for puzzletron outputs
 puzzle_dir: /workspace/puzzle_dir

@@ -6,7 +6,7 @@ defaults:
 input_hf_model_path: /workspace/hf_models/meta-llama/Llama-3.2-3B-Instruct
 
 # Dataset path for pruning and NAS scoring
-dataset_path: /workspace/datasets/Nemotron-Post-Training-Dataset-v2
+dataset_path: /workspace/datasets/Puzzle-KD-Nemotron-Post-Training-Dataset-v2
 
 # Working directory for compression outputs
 puzzle_dir: /workspace/puzzle_dir

@@ -6,7 +6,7 @@ defaults:
 input_hf_model_path: /workspace/hf_models/mistralai/Mistral-Small-24B-Instruct-2501
 
 # Dataset path for pruning and NAS scoring
-dataset_path: /workspace/datasets/Nemotron-Post-Training-Dataset-v2
+dataset_path: /workspace/datasets/Puzzle-KD-Nemotron-Post-Training-Dataset-v2
 
 # Working directory for compression outputs
 puzzle_dir: /workspace/puzzle_dir

@@ -6,7 +6,7 @@ defaults:
 input_hf_model_path: /workspace/hf_models/nvidia/Nemotron-Nano-12B-v2
 
 # Dataset path for pruning and NAS scoring
-dataset_path: /workspace/datasets/Nemotron-Post-Training-Dataset-v2
+dataset_path: /workspace/datasets/Puzzle-KD-Nemotron-Post-Training-Dataset-v2
 
 # Working directory for compression outputs
 puzzle_dir: /workspace/puzzle_dir

@@ -6,7 +6,7 @@ defaults:
 input_hf_model_path: /workspace/hf_models/Qwen/Qwen2.5-7B-Instruct
 
 # Dataset path for pruning and NAS scoring
-dataset_path: /workspace/datasets/Nemotron-Post-Training-Dataset-v2
+dataset_path: /workspace/datasets/Puzzle-KD-Nemotron-Post-Training-Dataset-v2
 
 # Working directory for compression outputs
 puzzle_dir: /workspace/puzzle_dir

@@ -6,7 +6,7 @@ defaults:
 input_hf_model_path: /workspace/hf_models/Qwen/Qwen3-8B
 
 # Dataset path for pruning and NAS scoring
-dataset_path: /workspace/datasets/Nemotron-Post-Training-Dataset-v2
+dataset_path: /workspace/datasets/Puzzle-KD-Nemotron-Post-Training-Dataset-v2
 
 # Working directory for compression outputs
 puzzle_dir: /workspace/puzzle_dir

@@ -23,6 +23,8 @@
 
 __all__ = ["process_and_save_dataset"]
 
+PREBUILT_KD_DATASET = "nvidia/Puzzle-KD-Nemotron-Post-Training-Dataset-v2"
+
 
 def process_and_save_dataset(
     dataset_name: str,
@@ -40,6 +42,15 @@ def process_and_save_dataset(
             )
             return
 
+    # The prebuilt dataset is already filtered and split — skip the 136 GB download.
+    if dataset_name == PREBUILT_KD_DATASET:
+        ds_dict = datasets.load_dataset(dataset_name)
+        os.makedirs(output_dir, exist_ok=True)
+        ds_dict.save_to_disk(output_dir)
+        mprint(f"Dataset splits:\n{ds_dict}")
+        mprint(f"Saved processed datasets to {output_dir}")
+        return
+
     ds = datasets.load_dataset(dataset_name, split=split)
     ds = datasets.concatenate_datasets(ds)
     # Filter out samples with reasoning = on