microsoft · crvineeth97 · May 5, 2026 · May 5, 2026
diff --git a/cmake/deps.txt b/cmake/deps.txt
@@ -47,7 +47,7 @@ protoc_mac_universal;https://github.com/protocolbuffers/protobuf/releases/downlo
 psimd;https://github.com/Maratyszcza/psimd/archive/072586a71b55b7f8c584153d223e95687148a900.zip;1f5454b01f06f9656b77e4a5e2e31d7422487013
 pthreadpool;https://github.com/google/pthreadpool/archive/dcc9f28589066af0dbd4555579281230abbf74dd.zip;533a77943203ef15ca608bcd9dbe2c94da7451d2
 pybind11;https://github.com/pybind/pybind11/archive/refs/tags/v3.0.2.zip;a064e663b4d7a337ac291d1bef7337ef4e60a1ae
-pytorch_cpuinfo;https://github.com/pytorch/cpuinfo/archive/403d652dca4c1046e8145950b1c0997a9f748b57.zip;30b2a07fe4bae8574f89176e56274cacdd6d135b
+pytorch_cpuinfo;https://github.com/crvineeth97/cpuinfo/archive/df8c6a8ce5cf12baabe5e7c9213aaeeffb18bd82.zip;34999b2434e49f1a66d50fb62f28663fb8c96881
 re2;https://github.com/google/re2/archive/refs/tags/2024-07-02.zip;646e1728269cde7fcef990bf4a8e87b047882e88
 safeint;https://github.com/dcleblanc/SafeInt/archive/refs/tags/3.0.28.zip;23f252040ff6cb9f1fd18575b32fa8fb5928daac
 tensorboard;https://github.com/tensorflow/tensorboard/archive/373eb09e4c5d2b3cc2493f0949dc4be6b6a45e81.zip;67b833913605a4f3f499894ab11528a702c2b381

diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake
@@ -369,11 +369,7 @@ if (CPUINFO_SUPPORTED)
       URL_HASH SHA1=${DEP_SHA1_pytorch_cpuinfo}
       EXCLUDE_FROM_ALL
       PATCH_COMMAND
-        ${Patch_EXECUTABLE} -p1 < ${PROJECT_SOURCE_DIR}/patches/cpuinfo/patch_cpuinfo_h_for_arm64ec.patch &&
-        # https://github.com/pytorch/cpuinfo/pull/324
-        ${Patch_EXECUTABLE} -p1 < ${PROJECT_SOURCE_DIR}/patches/cpuinfo/patch_vcpkg_arm64ec_support.patch &&
-        # https://github.com/pytorch/cpuinfo/pull/348
-        ${Patch_EXECUTABLE} -p1 < ${PROJECT_SOURCE_DIR}/patches/cpuinfo/win_arm_fp16_detection_fallback.patch
+        ${Patch_EXECUTABLE} -p1 < ${PROJECT_SOURCE_DIR}/patches/cpuinfo/patch_cpuinfo_h_for_arm64ec.patch
       FIND_PACKAGE_ARGS NAMES cpuinfo
     )
   else()

diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake
@@ -1518,8 +1518,8 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
 
 endif()
 
-
-  if(onnxruntime_USE_QNN)
+  # Build ep_weight_sharing_ctx_gen for all supported EPs (QNN, TensorRT, OpenVINO, VitisAI)
+  if(onnxruntime_USE_QNN OR onnxruntime_USE_TENSORRT OR onnxruntime_USE_OPENVINO OR onnxruntime_USE_VITISAI)
     #qnn ctx generator
     set(ep_weight_sharing_ctx_gen_src_dir ${TEST_SRC_DIR}/ep_weight_sharing_ctx_gen)
     set(ep_weight_sharing_ctx_gen_src_patterns

diff --git a/cmake/patches/cpuinfo/patch_vcpkg_arm64ec_support.patch b/cmake/patches/cpuinfo/patch_vcpkg_arm64ec_support.patch
diff --git a/cmake/patches/cpuinfo/win_arm_fp16_detection_fallback.patch b/cmake/patches/cpuinfo/win_arm_fp16_detection_fallback.patch
diff --git a/cmake/vcpkg-ports/cpuinfo/patch_vcpkg_arm64ec_support.patch b/cmake/vcpkg-ports/cpuinfo/patch_vcpkg_arm64ec_support.patch
diff --git a/cmake/vcpkg-ports/cpuinfo/portfile.cmake b/cmake/vcpkg-ports/cpuinfo/portfile.cmake
@@ -5,14 +5,12 @@ endif()
 
 vcpkg_from_github(
     OUT_SOURCE_PATH SOURCE_PATH
-    REPO pytorch/cpuinfo
-    REF 403d652dca4c1046e8145950b1c0997a9f748b57
-    SHA512 f7cd6dc44bd1120af610cae1337ed4c0f557ba78d2de9c73fed350fa3dfe9512643a1619ae55f5a540c6316a87d641856cca27297bb8766e48f39b7b7a59da1f
-    HEAD_REF master
+    REPO crvineeth97/cpuinfo
+    REF df8c6a8ce5cf12baabe5e7c9213aaeeffb18bd82
+    SHA512 0  # TODO: placeholder hash. Once the PR merges to pytorch/cpuinfo, point REPO/REF/HEAD_REF back upstream and set the real SHA512.
+    HEAD_REF vchelur/add-cpuinfo-deinitialize
     PATCHES
         patch_cpuinfo_h_for_arm64ec.patch
-        patch_vcpkg_arm64ec_support.patch       # https://github.com/pytorch/cpuinfo/pull/324
-        win_arm_fp16_detection_fallback.patch   # https://github.com/pytorch/cpuinfo/pull/348
 )
 
 vcpkg_check_features(OUT_FEATURE_OPTIONS FEATURE_OPTIONS

diff --git a/cmake/vcpkg-ports/cpuinfo/win_arm_fp16_detection_fallback.patch b/cmake/vcpkg-ports/cpuinfo/win_arm_fp16_detection_fallback.patch
diff --git a/onnxruntime/contrib_ops/cpu/bert/gqa_attention_base.h b/onnxruntime/contrib_ops/cpu/bert/gqa_attention_base.h
@@ -207,7 +207,7 @@ class GQAAttentionBase {
         const size_t batch_index = i / num_heads_;
         const size_t head_index = i % num_heads_;
         const size_t total_seqlen = SafeInt<size_t>(seqlens_k[batch_index]) + 1;
-        const size_t past_seqlen = is_prompt ? 0 : total_seqlen - sequence_length;  // Assume no padding sequence length
+        const size_t past_seqlen = is_prompt ? 0 : (total_seqlen > sequence_length ? total_seqlen - sequence_length : 0);  // Assume no padding sequence length
         const size_t past_chunk_length = SafeInt<size_t>(past_seqlen) * head_size;
 
         const ptrdiff_t output_offset = SafeInt<ptrdiff_t>(i) * sequence_length * present_buffer_sequence_length;
@@ -441,7 +441,7 @@ class GQAAttentionBase {
         const size_t batch_index = i / num_heads_;
         const size_t head_index = i % num_heads_;
         const size_t total_seqlen = SafeInt<size_t>(seqlens_k[batch_index]) + 1;
-        const size_t past_seqlen = is_prompt ? 0 : total_seqlen - sequence_length;  // Assume no padding sequence length
+        const size_t past_seqlen = is_prompt ? 0 : (total_seqlen > sequence_length ? total_seqlen - sequence_length : 0);  // Assume no padding sequence length
         const size_t past_chunk_length = SafeInt<size_t>(past_seqlen) * head_size;
 
         const T* v;

diff --git a/onnxruntime/contrib_ops/cpu/bert/group_query_attention.cc b/onnxruntime/contrib_ops/cpu/bert/group_query_attention.cc
@@ -173,11 +173,20 @@ Status GroupQueryAttention<T>::Compute(OpKernelContext* context) const {
       for (int b = 0; b < batch_size; b++) {
         const int total_seqlen = seqlens_k->Data<int32_t>()[b] + 1;
         const int past_seqlen = total_seqlen - sequence_length;
-        for (int s = 0; s < sequence_length; s++) {
-          if (past_seqlen + s < total_seqlen) {
-            default_pos_ids[b * sequence_length + s] = static_cast<int64_t>(past_seqlen) + s;
-          } else {
-            default_pos_ids[b * sequence_length + s] = static_cast<int64_t>(1);
+
+        // seqlens_k may hold values inconsistent with sequence_length (e.g. random data), making past_seqlen negative.
+        if (past_seqlen < 0) {
+          // Fallback: generate consecutive position IDs starting from 0
+          for (int s = 0; s < sequence_length; s++) {
+            default_pos_ids[b * sequence_length + s] = static_cast<int64_t>(s);
+          }
+        } else {
+          for (int s = 0; s < sequence_length; s++) {
+            if (past_seqlen + s < total_seqlen) {
+              default_pos_ids[b * sequence_length + s] = static_cast<int64_t>(past_seqlen) + s;
+            } else {
+              default_pos_ids[b * sequence_length + s] = static_cast<int64_t>(1);
+            }
           }
         }
       }

diff --git a/onnxruntime/core/common/cpuid_info.cc b/onnxruntime/core/common/cpuid_info.cc
@@ -378,4 +378,15 @@ CPUIDInfo::CPUIDInfo() {
 #endif
 #endif  // defined(CPUIDINFO_ARCH_ARM)
 }
+
+// Calls cpuinfo_deinitialize() at most once, and only while pytorch_cpuinfo_init_ is set; a no-op otherwise.
+void CPUIDInfo::ShutDown() {
+#if defined(CPUINFO_SUPPORTED)
+  static bool deinitialized = false;
+  if (deinitialized || !pytorch_cpuinfo_init_) return;
+  cpuinfo_deinitialize();
+  pytorch_cpuinfo_init_ = false;
+  deinitialized = true;
+#endif
+}
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/common/cpuid_info.h b/onnxruntime/core/common/cpuid_info.h
@@ -3,6 +3,7 @@
 
 #pragma once
 
+#include <atomic>
 #include "core/common/common.h"
 #include "core/common/cpuid_arch_definition.h"
 
@@ -11,8 +12,7 @@ namespace onnxruntime {
 class CPUIDInfo {
  public:
   static const CPUIDInfo& GetCPUIDInfo() {
-    static CPUIDInfo cpuid_info;
-    return cpuid_info;
+    return Instance();  // Read-only view of the singleton that Instance() owns.
   }
 
   std::string_view GetCPUVendor() const {
@@ -104,13 +104,32 @@ class CPUIDInfo {
     return has_fp16_;
   }
 
+  static void ShutdownCpuInfo() {
+    // Bail out if the singleton was never built, so DLL unload does not construct it just to shut it down.
+    const bool instance_exists = InstanceCreated().load(std::memory_order_acquire);
+    if (instance_exists) Instance().ShutDown();
+  }
+
  private:
   // Log function that uses ORT logging if available or writes to stderr.
   // This enables us to log even before ORT logging has been initialized.
   static void LogEarlyWarning(std::string_view message);
 
   CPUIDInfo();
 
+  static std::atomic<bool>& InstanceCreated() {  // Flag that Instance() sets once the singleton exists.
+    static std::atomic<bool> singleton_built(false);
+    return singleton_built;
+  }
+
+  static CPUIDInfo& Instance() {  // Builds the singleton on first use; sets the created flag only while unset.
+    static CPUIDInfo cpuid_info;
+    if (!InstanceCreated().load(std::memory_order_relaxed)) InstanceCreated().store(true, std::memory_order_release);
+    return cpuid_info;  // Calls that already observe the flag as set skip the atomic store entirely.
+  }
+
+  void ShutDown();
+
   void VendorInfoInit();
 
 #if defined(CPUIDINFO_ARCH_X86)