diff --git a/.ci/scripts/export_model_artifact.sh b/.ci/scripts/export_model_artifact.sh index 1f75d850e84..9adea394993 100755 --- a/.ci/scripts/export_model_artifact.sh +++ b/.ci/scripts/export_model_artifact.sh @@ -195,9 +195,17 @@ case "$HF_MODEL" in PREPROCESSOR_FEATURE_SIZE="" PREPROCESSOR_OUTPUT="" ;; + SocialLocalMobile/gemma-4-31B-it-HQQ-INT4) + MODEL_NAME="gemma4_31b" + TASK="" + MAX_SEQ_LEN="" + EXTRA_PIP="" + PREPROCESSOR_FEATURE_SIZE="" + PREPROCESSOR_OUTPUT="" + ;; *) echo "Error: Unsupported model '$HF_MODEL'" - echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/diar_streaming_sortformer_4spk-v2, nvidia/parakeet-tdt, facebook/dinov2-small-imagenet1k-1-layer, SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4" + echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/diar_streaming_sortformer_4spk-v2, nvidia/parakeet-tdt, facebook/dinov2-small-imagenet1k-1-layer, SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4, SocialLocalMobile/gemma-4-31B-it-HQQ-INT4" exit 1 ;; esac @@ -459,6 +467,50 @@ if [ "$MODEL_NAME" = "qwen3_5_moe" ]; then exit 0 fi +# Gemma 4 31B uses a prequantized checkpoint and custom export script +if [ "$MODEL_NAME" = "gemma4_31b" ]; then + pip install safetensors huggingface_hub gguf + + # Download prequantized model outside OUTPUT_DIR to avoid uploading on failure + LOCAL_MODEL_DIR=$(mktemp -d) + INDUCTOR_CACHE=$(mktemp -d) + trap 'rm -rf "$LOCAL_MODEL_DIR" "$INDUCTOR_CACHE"' EXIT + + python -c "from huggingface_hub import snapshot_download; snapshot_download('${HF_MODEL}', local_dir='${LOCAL_MODEL_DIR}')" + + # Sanity check: run inference on the prequantized model + echo "::group::Inference sanity check" + 
INFERENCE_OUTPUT=$(python -m executorch.examples.models.gemma4_31b.inference \ + --prequantized "$LOCAL_MODEL_DIR" \ + --prompt "What is the capital of France?" \ + --max-new-tokens 32 \ + --temperature 0 \ + --no-compile 2>&1) || INFERENCE_RC=$? # capture the status instead of aborting, so the log below is still printed under errexit + echo "$INFERENCE_OUTPUT" + if [ "${INFERENCE_RC:-0}" -ne 0 ] || ! printf '%s\n' "$INFERENCE_OUTPUT" | grep "Paris" >/dev/null; then # no -q: grep drains its input, so pipefail cannot see a SIGPIPE from the writer + echo "ERROR: Inference sanity check failed (exit status ${INFERENCE_RC:-0}) — expected 'Paris' in output" + exit 1 + fi + echo "::endgroup::" + + # Copy tokenizer for the runner + cp "$LOCAL_MODEL_DIR/tokenizer.json" "${OUTPUT_DIR}/tokenizer.json" + + # Export to .pte/.ptd (short cache dir avoids objcopy symbol length issues) + echo "::group::Export" + TORCHINDUCTOR_CACHE_DIR="$INDUCTOR_CACHE" \ + python -m executorch.examples.models.gemma4_31b.export \ + --prequantized "$LOCAL_MODEL_DIR" \ + --output-dir "${OUTPUT_DIR}" + echo "::endgroup::" + + test -f "${OUTPUT_DIR}/model.pte" + test -f "${OUTPUT_DIR}/aoti_cuda_blob.ptd" + ls -al "${OUTPUT_DIR}" + + exit 0 +fi + MAX_SEQ_LEN_ARG="" if [ -n "$MAX_SEQ_LEN" ]; then MAX_SEQ_LEN_ARG="--max_seq_len $MAX_SEQ_LEN" diff --git a/.ci/scripts/test_model_e2e.sh b/.ci/scripts/test_model_e2e.sh index 1678b0a4fbb..27b0dd9d597 100755 --- a/.ci/scripts/test_model_e2e.sh +++ b/.ci/scripts/test_model_e2e.sh @@ -228,9 +228,21 @@ case "$HF_MODEL" in AUDIO_FILE="" IMAGE_PATH="" ;; + SocialLocalMobile/gemma-4-31B-it-HQQ-INT4) + MODEL_NAME="gemma4_31b" + RUNNER_TARGET="gemma4_31b_runner" + RUNNER_PATH="gemma4_31b" + EXPECTED_OUTPUT="Paris" + PREPROCESSOR="" + TOKENIZER_URL="" + TOKENIZER_FILE="tokenizer.json" + AUDIO_URL="" + AUDIO_FILE="" + IMAGE_PATH="" + ;; *) echo "Error: Unsupported model '$HF_MODEL'" - echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, nvidia/diar_streaming_sortformer_4spk-v2, openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}), google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/parakeet-tdt, facebook/dinov2-small-imagenet1k-1-layer, 
SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4" + echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, nvidia/diar_streaming_sortformer_4spk-v2, openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}), google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/parakeet-tdt, facebook/dinov2-small-imagenet1k-1-layer, SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4, SocialLocalMobile/gemma-4-31B-it-HQQ-INT4" exit 1 ;; esac @@ -244,7 +256,7 @@ echo "::group::Prepare $MODEL_NAME Artifacts" # Download tokenizer files (skip for models that bundle tokenizer in export or do not use one) -if [ "$MODEL_NAME" != "parakeet" ] && [ "$MODEL_NAME" != "voxtral_realtime" ] && [ "$MODEL_NAME" != "sortformer" ] && [ "$MODEL_NAME" != "dinov2" ] && [ "$MODEL_NAME" != "qwen3_5_moe" ]; then +if [ "$MODEL_NAME" != "parakeet" ] && [ "$MODEL_NAME" != "voxtral_realtime" ] && [ "$MODEL_NAME" != "sortformer" ] && [ "$MODEL_NAME" != "dinov2" ] && [ "$MODEL_NAME" != "qwen3_5_moe" ] && [ "$MODEL_NAME" != "gemma4_31b" ]; then if [ "$TOKENIZER_FILE" != "" ]; then curl -L $TOKENIZER_URL/$TOKENIZER_FILE -o $MODEL_DIR/$TOKENIZER_FILE else @@ -368,6 +380,9 @@ EOF qwen3_5_moe) RUNNER_ARGS="$RUNNER_ARGS --tokenizer_path ${MODEL_DIR}/$TOKENIZER_FILE --prompt 'What is the capital of France?' --max_new_tokens 128 --temperature 0 --cuda_graph" ;; + gemma4_31b) + RUNNER_ARGS="$RUNNER_ARGS --tokenizer_path ${MODEL_DIR}/$TOKENIZER_FILE --prompt 'What is the capital of France?' 
--max_new_tokens 128 --temperature 0 --cuda_graph" + ;; voxtral_realtime) RUNNER_ARGS="--model_path ${MODEL_DIR}/model.pte --tokenizer_path ${MODEL_DIR}/$TOKENIZER_FILE --preprocessor_path ${MODEL_DIR}/$PREPROCESSOR --audio_path ${MODEL_DIR}/$AUDIO_FILE --temperature 0" # Add CUDA data path if present diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml index 087917c1116..eb7fc5a8939 100644 --- a/.github/workflows/cuda.yml +++ b/.github/workflows/cuda.yml @@ -185,6 +185,8 @@ jobs: name: "dinov2-small-imagenet1k-1-layer" - repo: "SocialLocalMobile" name: "Qwen3.5-35B-A3B-HQQ-INT4" + - repo: "SocialLocalMobile" + name: "gemma-4-31B-it-HQQ-INT4" quant: - "non-quantized" - "quantized-int4-tile-packed" @@ -204,6 +206,15 @@ jobs: repo: "SocialLocalMobile" name: "Qwen3.5-35B-A3B-HQQ-INT4" quant: "quantized-int4-weight-only" + # Gemma 4 31B uses a prequantized checkpoint, only tile-packed + - model: + repo: "SocialLocalMobile" + name: "gemma-4-31B-it-HQQ-INT4" + quant: "non-quantized" + - model: + repo: "SocialLocalMobile" + name: "gemma-4-31B-it-HQQ-INT4" + quant: "quantized-int4-weight-only" # Voxtral Realtime only supports int4-tile-packed on CUDA - model: repo: "mistralai" @@ -258,7 +269,7 @@ jobs: with: timeout: 90 secrets-env: EXECUTORCH_HF_TOKEN - runner: ${{ matrix.model.name == 'Qwen3.5-35B-A3B-HQQ-INT4' && 'linux.aws.a100' || 'linux.g5.4xlarge.nvidia.gpu' }} + runner: ${{ (matrix.model.name == 'Qwen3.5-35B-A3B-HQQ-INT4' || matrix.model.name == 'gemma-4-31B-it-HQQ-INT4') && 'linux.aws.a100' || 'linux.g5.4xlarge.nvidia.gpu' }} gpu-arch-type: cuda gpu-arch-version: 12.6 use-custom-docker-registry: false @@ -315,6 +326,8 @@ jobs: name: "dinov2-small-imagenet1k-1-layer" - repo: "SocialLocalMobile" name: "Qwen3.5-35B-A3B-HQQ-INT4" + - repo: "SocialLocalMobile" + name: "gemma-4-31B-it-HQQ-INT4" quant: - "non-quantized" - "quantized-int4-tile-packed" @@ -334,6 +347,15 @@ jobs: repo: "SocialLocalMobile" name: "Qwen3.5-35B-A3B-HQQ-INT4" quant: 
"quantized-int4-weight-only" + # Gemma 4 31B uses a prequantized checkpoint, only tile-packed + - model: + repo: "SocialLocalMobile" + name: "gemma-4-31B-it-HQQ-INT4" + quant: "non-quantized" + - model: + repo: "SocialLocalMobile" + name: "gemma-4-31B-it-HQQ-INT4" + quant: "quantized-int4-weight-only" # Voxtral Realtime only supports int4-tile-packed on CUDA - model: repo: "mistralai" @@ -382,7 +404,7 @@ jobs: quant: "non-quantized" with: timeout: 90 - runner: ${{ matrix.model.name == 'Qwen3.5-35B-A3B-HQQ-INT4' && 'linux.aws.a100' || 'linux.g5.4xlarge.nvidia.gpu' }} + runner: ${{ (matrix.model.name == 'Qwen3.5-35B-A3B-HQQ-INT4' || matrix.model.name == 'gemma-4-31B-it-HQQ-INT4') && 'linux.aws.a100' || 'linux.g5.4xlarge.nvidia.gpu' }} gpu-arch-type: cuda gpu-arch-version: 12.6 use-custom-docker-registry: false diff --git a/examples/models/gemma4_31b/README.md b/examples/models/gemma4_31b/README.md index 6f567d739b7..94783c8f823 100644 --- a/examples/models/gemma4_31b/README.md +++ b/examples/models/gemma4_31b/README.md @@ -79,6 +79,9 @@ Writes `model.pte` and `model.ptd` into `--output-dir`. ## Eager inference +The prompt is automatically wrapped with the Gemma 4 IT chat template. +Pass `--raw-prompt` to skip template wrapping for pre-formatted input. + ```bash python examples/models/gemma4_31b/inference.py \ --prequantized ./gemma4_31b_int4 \ @@ -109,6 +112,9 @@ The binary lands at `cmake-out/examples/models/gemma4_31b/gemma4_31b_runner`. ## Run the .pte +The prompt is automatically wrapped with the Gemma 4 IT chat template. +Pass `--raw_prompt` to skip template wrapping for pre-formatted input. 
+ ```bash ./gemma4_31b_runner \ --model_path ./gemma4_31b_exports/model.pte \ diff --git a/examples/models/gemma4_31b/inference.py b/examples/models/gemma4_31b/inference.py index 12785450d8c..e1563c04ff6 100644 --- a/examples/models/gemma4_31b/inference.py +++ b/examples/models/gemma4_31b/inference.py @@ -6,12 +6,15 @@ """Eager inference on Gemma 4 31B-IT (CUDA + torch.compile). -Two input paths: +Three input paths: --prequantized