diff --git a/examples/puzzletron/README.md b/examples/puzzletron/README.md index dce76866d6d..48954a2b773 100644 --- a/examples/puzzletron/README.md +++ b/examples/puzzletron/README.md @@ -43,7 +43,7 @@ python -m pytest tests/gpu/torch/puzzletron/test_puzzletron.py -k "Qwen3-8B" - For this example we are using 2x NVIDIA H100 80GB HBM3 to show multi-GPU steps. You can also use a single GPU. -- To make use of [Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) and [Nemotron-Post-Training-Dataset-v2](https://huggingface.co/datasets/nvidia/Nemotron-Post-Training-Dataset-v2), you need to accept the terms and conditions for the corresponding model and the dataset in the Huggingface Hub. Log in to the Huggingface Hub and enter your HF token. +- To make use of [Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) and [Puzzle-KD-Nemotron-Post-Training-Dataset-v2](https://huggingface.co/datasets/nvidia/Puzzle-KD-Nemotron-Post-Training-Dataset-v2), you need to accept the terms and conditions for the corresponding model and the dataset in the Huggingface Hub. Log in to the Huggingface Hub and enter your HF token. ```bash hf auth login --token @@ -51,16 +51,18 @@ hf auth login --token ## Compress the Model -1. Download and prepare the [Nemotron-Post-Training-Dataset-v2](https://huggingface.co/datasets/nvidia/Nemotron-Post-Training-Dataset-v2). +1. Download and prepare the dataset. - dataset split: "code", "math", "stem", "chat", excluding reasoning samples (2.62GB) + **Default (recommended):** Use the prebuilt [Puzzle-KD-Nemotron-Post-Training-Dataset-v2](https://huggingface.co/datasets/nvidia/Puzzle-KD-Nemotron-Post-Training-Dataset-v2) (~3 GB disk required). 
```bash python -m modelopt.torch.puzzletron.dataset.prepare_dataset \ - --dataset_name nvidia/Nemotron-Post-Training-Dataset-v2 \ - --output_dir path/to/Nemotron-Post-Training-Dataset-v2 + --dataset_name nvidia/Puzzle-KD-Nemotron-Post-Training-Dataset-v2 \ + --output_dir path/to/Puzzle-KD-Nemotron-Post-Training-Dataset-v2 ``` + > **Note:** Alternatively, you can derive the dataset from the raw [Nemotron-Post-Training-Dataset-v2](https://huggingface.co/datasets/nvidia/Nemotron-Post-Training-Dataset-v2) by passing `--dataset_name nvidia/Nemotron-Post-Training-Dataset-v2`. This downloads the full raw dataset (~136 GB) before filtering it down to the same ~2.6 GB result. Only do this if you need to reproduce the preprocessing from scratch. + 2. Specify the `puzzle_dir`, `input_hf_model_path`, `dataset_path`, `intermediate_size_list`, and `target_memory` arguments in the [llama-3_1-8B_pruneffn_memory.yaml](./configs/llama-3_1-8B_pruneffn_memory/llama-3_1-8B_pruneffn_memory.yaml) configuration file. - `puzzle_dir` indicates a new directory for saving the resulting model. 
diff --git a/examples/puzzletron/configs/gptoss-20b_remove_experts_memory/gptoss-20b_remove_experts_memory.yaml b/examples/puzzletron/configs/gptoss-20b_remove_experts_memory/gptoss-20b_remove_experts_memory.yaml index 8ed06e95689..fac942e35ac 100644 --- a/examples/puzzletron/configs/gptoss-20b_remove_experts_memory/gptoss-20b_remove_experts_memory.yaml +++ b/examples/puzzletron/configs/gptoss-20b_remove_experts_memory/gptoss-20b_remove_experts_memory.yaml @@ -6,7 +6,7 @@ defaults: input_hf_model_path: /workspace/hf_models/openai/gpt-oss-20b # Dataset path for pruning and NAS scoring -dataset_path: /workspace/datasets/Nemotron-Post-Training-Dataset-v2 +dataset_path: /workspace/datasets/Puzzle-KD-Nemotron-Post-Training-Dataset-v2 # Working directory for compression outputs puzzle_dir: /workspace/puzzle_dir diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_memory/llama-3_1-8B_pruneffn_memory.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_memory/llama-3_1-8B_pruneffn_memory.yaml index ad16dbc5ea0..bfac4ef6944 100644 --- a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_memory/llama-3_1-8B_pruneffn_memory.yaml +++ b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_memory/llama-3_1-8B_pruneffn_memory.yaml @@ -6,7 +6,7 @@ defaults: input_hf_model_path: /workspace/hf_models/meta-llama/Llama-3.1-8B-Instruct # Dataset path for pruning and NAS scoring -dataset_path: /workspace/datasets/Nemotron-Post-Training-Dataset-v2 +dataset_path: /workspace/datasets/Puzzle-KD-Nemotron-Post-Training-Dataset-v2 # Working directory for puzzletron outputs puzzle_dir: /workspace/puzzle_dir diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/llama-3_1-8B_pruneffn_runtime.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/llama-3_1-8B_pruneffn_runtime.yaml index 588df25f27d..2ca0f2c16cf 100644 --- a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/llama-3_1-8B_pruneffn_runtime.yaml +++ 
b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/llama-3_1-8B_pruneffn_runtime.yaml @@ -6,7 +6,7 @@ defaults: input_hf_model_path: /workspace/hf_models/meta-llama/Llama-3.1-8B-Instruct # Dataset path for pruning and NAS scoring -dataset_path: /workspace/datasets/Nemotron-Post-Training-Dataset-v2 +dataset_path: /workspace/datasets/Puzzle-KD-Nemotron-Post-Training-Dataset-v2 # Working directory for puzzletron outputs puzzle_dir: /workspace/puzzle_dir diff --git a/examples/puzzletron/configs/llama-3_2-3B_pruneffn_memory/llama-3_2-3B_pruneffn_memory.yaml b/examples/puzzletron/configs/llama-3_2-3B_pruneffn_memory/llama-3_2-3B_pruneffn_memory.yaml index b5303d318a3..879f1cc4a22 100644 --- a/examples/puzzletron/configs/llama-3_2-3B_pruneffn_memory/llama-3_2-3B_pruneffn_memory.yaml +++ b/examples/puzzletron/configs/llama-3_2-3B_pruneffn_memory/llama-3_2-3B_pruneffn_memory.yaml @@ -6,7 +6,7 @@ defaults: input_hf_model_path: /workspace/hf_models/meta-llama/Llama-3.2-3B-Instruct # Dataset path for pruning and NAS scoring -dataset_path: /workspace/datasets/Nemotron-Post-Training-Dataset-v2 +dataset_path: /workspace/datasets/Puzzle-KD-Nemotron-Post-Training-Dataset-v2 # Working directory for compression outputs puzzle_dir: /workspace/puzzle_dir diff --git a/examples/puzzletron/configs/mistral-small-24b-instruct-2501_pruneffn_memory/mistral-small-24b-instruct-2501_pruneffn_memory.yaml b/examples/puzzletron/configs/mistral-small-24b-instruct-2501_pruneffn_memory/mistral-small-24b-instruct-2501_pruneffn_memory.yaml index 68a0652d6f1..11f1856ec09 100644 --- a/examples/puzzletron/configs/mistral-small-24b-instruct-2501_pruneffn_memory/mistral-small-24b-instruct-2501_pruneffn_memory.yaml +++ b/examples/puzzletron/configs/mistral-small-24b-instruct-2501_pruneffn_memory/mistral-small-24b-instruct-2501_pruneffn_memory.yaml @@ -6,7 +6,7 @@ defaults: input_hf_model_path: /workspace/hf_models/mistralai/Mistral-Small-24B-Instruct-2501 # Dataset path for pruning and NAS scoring 
-dataset_path: /workspace/datasets/Nemotron-Post-Training-Dataset-v2 +dataset_path: /workspace/datasets/Puzzle-KD-Nemotron-Post-Training-Dataset-v2 # Working directory for compression outputs puzzle_dir: /workspace/puzzle_dir diff --git a/examples/puzzletron/configs/nemotron-nano-12b-v2/nemotron_nano_12b_v2_pruneffn_memory.yaml b/examples/puzzletron/configs/nemotron-nano-12b-v2/nemotron_nano_12b_v2_pruneffn_memory.yaml index 3b880b2c7d1..5bb3273433d 100644 --- a/examples/puzzletron/configs/nemotron-nano-12b-v2/nemotron_nano_12b_v2_pruneffn_memory.yaml +++ b/examples/puzzletron/configs/nemotron-nano-12b-v2/nemotron_nano_12b_v2_pruneffn_memory.yaml @@ -6,7 +6,7 @@ defaults: input_hf_model_path: /workspace/hf_models/nvidia/Nemotron-Nano-12B-v2 # Dataset path for pruning and NAS scoring -dataset_path: /workspace/datasets/Nemotron-Post-Training-Dataset-v2 +dataset_path: /workspace/datasets/Puzzle-KD-Nemotron-Post-Training-Dataset-v2 # Working directory for compression outputs puzzle_dir: /workspace/puzzle_dir diff --git a/examples/puzzletron/configs/qwen2_5_7b_instruct_pruneffn_memory/qwen2_5_7b_instruct_pruneffn_memory.yaml b/examples/puzzletron/configs/qwen2_5_7b_instruct_pruneffn_memory/qwen2_5_7b_instruct_pruneffn_memory.yaml index fb961033bc3..d0758ce6167 100644 --- a/examples/puzzletron/configs/qwen2_5_7b_instruct_pruneffn_memory/qwen2_5_7b_instruct_pruneffn_memory.yaml +++ b/examples/puzzletron/configs/qwen2_5_7b_instruct_pruneffn_memory/qwen2_5_7b_instruct_pruneffn_memory.yaml @@ -6,7 +6,7 @@ defaults: input_hf_model_path: /workspace/hf_models/Qwen/Qwen2.5-7B-Instruct # Dataset path for pruning and NAS scoring -dataset_path: /workspace/datasets/Nemotron-Post-Training-Dataset-v2 +dataset_path: /workspace/datasets/Puzzle-KD-Nemotron-Post-Training-Dataset-v2 # Working directory for compression outputs puzzle_dir: /workspace/puzzle_dir diff --git a/examples/puzzletron/configs/qwen3-8b_pruneffn_memory/qwen3_8b_pruneffn_memory.yaml 
b/examples/puzzletron/configs/qwen3-8b_pruneffn_memory/qwen3_8b_pruneffn_memory.yaml index 4ee81286dd2..15d8f48afa5 100644 --- a/examples/puzzletron/configs/qwen3-8b_pruneffn_memory/qwen3_8b_pruneffn_memory.yaml +++ b/examples/puzzletron/configs/qwen3-8b_pruneffn_memory/qwen3_8b_pruneffn_memory.yaml @@ -6,7 +6,7 @@ defaults: input_hf_model_path: /workspace/hf_models/Qwen/Qwen3-8B # Dataset path for pruning and NAS scoring -dataset_path: /workspace/datasets/Nemotron-Post-Training-Dataset-v2 +dataset_path: /workspace/datasets/Puzzle-KD-Nemotron-Post-Training-Dataset-v2 # Working directory for compression outputs puzzle_dir: /workspace/puzzle_dir diff --git a/modelopt/torch/puzzletron/dataset/prepare_dataset.py b/modelopt/torch/puzzletron/dataset/prepare_dataset.py index 0928b111afc..3d80062ae0f 100644 --- a/modelopt/torch/puzzletron/dataset/prepare_dataset.py +++ b/modelopt/torch/puzzletron/dataset/prepare_dataset.py @@ -23,6 +23,8 @@ __all__ = ["process_and_save_dataset"] +PREBUILT_KD_DATASET = "nvidia/Puzzle-KD-Nemotron-Post-Training-Dataset-v2" + def process_and_save_dataset( dataset_name: str, @@ -40,6 +42,15 @@ def process_and_save_dataset( ) return + # The prebuilt dataset is already filtered and split — skip the 136 GB download. + if dataset_name == PREBUILT_KD_DATASET: + ds_dict = datasets.load_dataset(dataset_name) + os.makedirs(output_dir, exist_ok=True) + ds_dict.save_to_disk(output_dir) + mprint(f"Dataset splits:\n{ds_dict}") + mprint(f"Saved processed datasets to {output_dir}") + return + ds = datasets.load_dataset(dataset_name, split=split) ds = datasets.concatenate_datasets(ds) # Filter out samples with reasoning = on