From f00329ae71fe7340bd9c46de075ec4d36f18f2df Mon Sep 17 00:00:00 2001
From: Uttkarsh Singh <89268628+uttkxrrsh@users.noreply.github.com>
Date: Sun, 3 May 2026 17:03:13 +0530
Subject: [PATCH] Fix: Replace hardcoded local path for CLIP text model with Hugging Face identifier

### Description
This PR addresses an issue in the YOLO-World XL configuration file (`yolo_world_v2_xl_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py`) where `text_model_name` was hardcoded to local paths: it was first set to a relative path (`../pretrained_models/...`) and then overridden by a local absolute path (`/mnt/petrelfs/...`). Leaving this hardcoded breaks execution for anyone who clones the repository and runs the configuration on a local machine or a different compute cluster: the path does not exist there, so loading the text model fails (e.g. with a `FileNotFoundError`).

### Changes Made
* Restored `'openai/clip-vit-base-patch32'` (previously commented out) as the default `text_model_name`. This ensures the Hugging Face `transformers` library can automatically fetch and cache the model weights, making the script plug-and-play for new users.
* Commented out both local paths. Users running this on isolated compute nodes (e.g., via Slurm) can still easily override this variable in their local workflows to point to pre-downloaded checkpoints.

### Testing
* Verified that setting `text_model_name = 'openai/clip-vit-base-patch32'` correctly initializes the `HuggingCLIPLanguageBackbone` without requiring a pre-existing local directory.
--- ...n_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/VBench-2.0/vbench2/third_party/YOLO-World/yolo_world_v2_xl_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py b/VBench-2.0/vbench2/third_party/YOLO-World/yolo_world_v2_xl_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py index 5bd190bd..a34d2050 100644 --- a/VBench-2.0/vbench2/third_party/YOLO-World/yolo_world_v2_xl_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py +++ b/VBench-2.0/vbench2/third_party/YOLO-World/yolo_world_v2_xl_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py @@ -15,9 +15,9 @@ base_lr = 2e-3 weight_decay = 0.05 / 2 train_batch_size_per_gpu = 16 -text_model_name = '../pretrained_models/clip-vit-base-patch32-projection' -# text_model_name = 'openai/clip-vit-base-patch32' -text_model_name = '/mnt/petrelfs/zhengdian/code/ckpt/clip-vit-base-patch32' +# text_model_name = '../pretrained_models/clip-vit-base-patch32-projection' +text_model_name = 'openai/clip-vit-base-patch32' +# text_model_name = '/mnt/petrelfs/zhengdian/code/ckpt/clip-vit-base-patch32' # scaling model from X to XL deepen_factor = 1.0