From f00329ae71fe7340bd9c46de075ec4d36f18f2df Mon Sep 17 00:00:00 2001
From: Uttkarsh Singh <89268628+uttkxrrsh@users.noreply.github.com>
Date: Sun, 3 May 2026 17:03:13 +0530
Subject: [PATCH] Fix: Replace hardcoded local path for CLIP text model with
 Hugging Face identifier

### Description
This PR addresses an issue in the YOLO-World XL configuration file (`yolo_world_v2_xl_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py`) where the `text_model_name` was hardcoded to a local absolute path (`/mnt/petrelfs/...`).

Leaving this path hardcoded breaks execution for anyone who clones the repository and tries to run the configuration on a local machine or a different compute cluster, resulting in a `FileNotFoundError`.

### Changes Made
* Restored `'openai/clip-vit-base-patch32'` as the default `text_model_name`. This ensures the Hugging Face `transformers` library can automatically fetch and cache the model weights, making the script plug-and-play for new users.
* Commented out the local paths. Users running this on isolated compute nodes (e.g., via Slurm) can still easily override this variable in their local workflows to point to pre-downloaded checkpoints.

### Testing
* Verified that setting `text_model_name = 'openai/clip-vit-base-patch32'` correctly initializes `HuggingCLIPLanguageBackbone` without requiring a pre-existing local directory.
---
 ...n_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/VBench-2.0/vbench2/third_party/YOLO-World/yolo_world_v2_xl_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py b/VBench-2.0/vbench2/third_party/YOLO-World/yolo_world_v2_xl_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py
index 5bd190bd..a34d2050 100644
--- a/VBench-2.0/vbench2/third_party/YOLO-World/yolo_world_v2_xl_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py
+++ b/VBench-2.0/vbench2/third_party/YOLO-World/yolo_world_v2_xl_vlpan_bn_2e-3_100e_4x8gpus_obj365v1_goldg_train_lvis_minival.py
@@ -15,9 +15,9 @@
 base_lr = 2e-3
 weight_decay = 0.05 / 2
 train_batch_size_per_gpu = 16
-text_model_name = '../pretrained_models/clip-vit-base-patch32-projection'
-# text_model_name = 'openai/clip-vit-base-patch32'
-text_model_name = '/mnt/petrelfs/zhengdian/code/ckpt/clip-vit-base-patch32'
+# text_model_name = '../pretrained_models/clip-vit-base-patch32-projection'
+text_model_name = 'openai/clip-vit-base-patch32'
+# text_model_name = '/mnt/petrelfs/zhengdian/code/ckpt/clip-vit-base-patch32'
 
 # scaling model from X to XL
 deepen_factor = 1.0