diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index 83d1751e55543..dde6d44919092 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -638,6 +638,7 @@ else()
   check_cxx_compiler_flag(-Wcatch-value HAS_CATCH_VALUE)
   check_cxx_compiler_flag(-Wclass-memaccess HAS_CLASS_MEMACCESS)
   check_cxx_compiler_flag(-Wcharacter-conversion HAS_CHARACTER_CONVERSION)
+  check_cxx_compiler_flag(-Wno-error=character-conversion HAS_NO_ERROR_CHARACTER_CONVERSION)
   check_cxx_compiler_flag(-Wdangling-reference HAS_DANGLING_REFERENCE)
   check_cxx_compiler_flag(-Wdeprecated-anon-enum-enum-conversion HAS_DEPRECATED_ANON_ENUM_ENUM_CONVERSION)
   check_cxx_compiler_flag(-Wdeprecated-builtins HAS_DEPRECATED_BUILTINS)
diff --git a/cmake/onnxruntime_mlas.cmake b/cmake/onnxruntime_mlas.cmake
index ae6253cecdc03..caaee84d32acb 100644
--- a/cmake/onnxruntime_mlas.cmake
+++ b/cmake/onnxruntime_mlas.cmake
@@ -636,9 +636,13 @@ else()
             enable_language(ASM)
             check_cxx_source_compiles("
               #ifdef _AIX
+              #include <sys/systemcfg.h>
+              #if !defined(POWER_10)
               #define POWER_10       0x40000
+              #endif
+              #if !defined(POWER_10_ANDUP)
               #define POWER_10_ANDUP (POWER_10)
-              #include <sys/systemcfg.h>
+              #endif
               #define __power_10_andup() (_system_configuration.implementation & POWER_10_ANDUP)
               int main() {
                 bool HasP10 = (__power_10_andup() && __power_mma_version() == MMA_V31);
diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake
index 7f361aa63921f..a061858fa068f 100644
--- a/cmake/onnxruntime_unittests.cmake
+++ b/cmake/onnxruntime_unittests.cmake
@@ -50,6 +50,13 @@ function(filter_test_srcs test_srcs_var)
 endfunction()
 
 set(disabled_warnings)
+
+function(onnxruntime_disable_gtest_character_conversion_as_error target_name)
+  if (HAS_NO_ERROR_CHARACTER_CONVERSION)
+    target_compile_options(${target_name} PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:-Wno-error=character-conversion>")
+  endif()
+endfunction()
+
 function(AddTest)
   cmake_parse_arguments(_UT "DYN" "TARGET" "LIBS;SOURCES;DEPENDS;TEST_ARGS" ${ARGN})
   list(REMOVE_DUPLICATES _UT_SOURCES)
@@ -170,9 +177,7 @@ function(AddTest)
     if (${HAS_NOERROR})
       target_compile_options(${_UT_TARGET} PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:-Wno-error=uninitialized>")
     endif()
-    if (${HAS_CHARACTER_CONVERSION})
-      target_compile_options(${_UT_TARGET} PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:-Wno-error=character-conversion>")
-    endif()
+    onnxruntime_disable_gtest_character_conversion_as_error(${_UT_TARGET})
   endif()
 
   set(TEST_ARGS ${_UT_TEST_ARGS})
@@ -847,9 +852,7 @@ if(MSVC)
                 "$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:/wd6326>")
 else()
   target_include_directories(onnxruntime_test_utils PRIVATE ${CMAKE_CURRENT_BINARY_DIR} ${ONNXRUNTIME_ROOT})
-  if (HAS_CHARACTER_CONVERSION)
-    target_compile_options(onnxruntime_test_utils PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:-Wno-error=character-conversion>")
-  endif()
+  onnxruntime_disable_gtest_character_conversion_as_error(onnxruntime_test_utils)
 endif()
 if (onnxruntime_USE_NCCL)
   target_include_directories(onnxruntime_test_utils PRIVATE ${NCCL_INCLUDE_DIRS})
diff --git a/docs/ContribOperators.md b/docs/ContribOperators.md
index fb6a4eb22a872..ca072113e832d 100644
--- a/docs/ContribOperators.md
+++ b/docs/ContribOperators.md
@@ -3569,7 +3569,6 @@ This version of the operator has been available since version 1 of the 'com.micr
 ### <a name="com.microsoft.NhwcFusedConv"></a><a name="com.microsoft.nhwcfusedconv">**com.microsoft.NhwcFusedConv**</a>
 
   NhwcFusedConv is a Conv operator with optional activation and add operators fused in.
-  Only has fp16 implementation as of 2023/04/15.
 
 #### Version
 
@@ -3600,26 +3599,26 @@ This version of the operator has been available since version 1 of the 'com.micr
 
 <dl>
 <dt><tt>X</tt> : T</dt>
-<dd></dd>
+<dd>Input activation tensor in channels-last layout. For 2D convolution this is [N, H, W, C], where N is batch size, H/W are spatial dimensions, and C is the number of input channels.</dd>
 <dt><tt>W</tt> : T</dt>
-<dd></dd>
+<dd>Convolution weight tensor in the standard ONNX Conv filter layout [M, C/group, kH, kW], where M is the number of output channels.</dd>
 <dt><tt>B</tt> (optional) : T</dt>
-<dd></dd>
+<dd>Optional 1D bias tensor of shape [M].</dd>
 <dt><tt>Z</tt> (optional) : T</dt>
-<dd>Tensor to be added to the output, must be the same shape and format as the output tensor.</dd>
+<dd>Optional residual/add tensor in the same channels-last layout and shape as the output tensor Y. For 2D convolution this is [N, out_H, out_W, M].</dd>
 </dl>
 
 #### Outputs
 
 <dl>
 <dt><tt>Y</tt> : T</dt>
-<dd></dd>
+<dd>Output tensor in channels-last layout. For 2D convolution this is [N, out_H, out_W, M], where M is the number of output channels.</dd>
 </dl>
 
 #### Type Constraints
 
 <dl>
-<dt><tt>T</tt> : tensor(float16)</dt>
+<dt><tt>T</tt> : tensor(float16), tensor(float)</dt>
 <dd>Constrain input and output types to float tensors</dd>
 </dl>
 
diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md
index 5cf92c6c53e4b..d163301189875 100644
--- a/docs/OperatorKernels.md
+++ b/docs/OperatorKernels.md
@@ -810,7 +810,8 @@ The **OpSet Version** column uses the following notation:
 |||[9, 12]|**T1** = tensor(double), tensor(float), tensor(float16)<br/> **T2** = tensor(bool)|
 |LRN|*in* X:**T**<br> *out* Y:**T**|13+|**T** = tensor(double), tensor(float), tensor(float16)|
 |||[1, 12]|**T** = tensor(double), tensor(float), tensor(float16)|
-|LSTM|*in* X:**T**<br> *in* W:**T**<br> *in* R:**T**<br> *in* B:**T**<br> *in* sequence_lens:**T1**<br> *in* initial_h:**T**<br> *in* initial_c:**T**<br> *in* P:**T**<br> *out* Y:**T**<br> *out* Y_h:**T**<br> *out* Y_c:**T**|14+|**T** = tensor(double), tensor(float), tensor(float16)<br/> **T1** = tensor(int32)|
+|LSTM|*in* X:**T**<br> *in* W:**T**<br> *in* R:**T**<br> *in* B:**T**<br> *in* sequence_lens:**T1**<br> *in* initial_h:**T**<br> *in* initial_c:**T**<br> *in* P:**T**<br> *out* Y:**T**<br> *out* Y_h:**T**<br> *out* Y_c:**T**|22+|**T** = tensor(double), tensor(float), tensor(float16)<br/> **T1** = tensor(int32)|
+|||[14, 21]|**T** = tensor(double), tensor(float), tensor(float16)<br/> **T1** = tensor(int32)|
 |||[7, 13]|**T** = tensor(double), tensor(float), tensor(float16)<br/> **T1** = tensor(int32)|
 |LayerNormalization|*in* X:**T**<br> *in* Scale:**T**<br> *in* B:**T**<br> *out* Y:**T**<br> *out* Mean:**U**<br> *out* InvStdDev:**U**<br><br>or<br><br>*in* X:**T**<br> *in* Scale:**V**<br> *in* B:**V**<br> *out* Y:**V**<br> *out* Mean:**U**<br> *out* InvStdDev:**U**|17+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)<br/> **U** = tensor(float)|
 |||[1, 16]|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)<br/> **U** = tensor(double), tensor(float)<br/> **V** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)|
diff --git a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
index 24557bb81bce3..9d61165927d8c 100644
--- a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
+++ b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h
@@ -124,6 +124,18 @@ static const char* const kOrtSessionOptionsMemoryOptimizerProbeConfig = "optimiz
 // Default is an empty string which means no optimizers are disabled.
 static const char* const kOrtSessionOptionsDisableSpecifiedOptimizers = "optimization.disable_specified_optimizers";
 
+// Maximum total output size in bytes that the constant folding optimizer is allowed to produce per node.
+// Prevents malicious models from causing excessive memory allocation during optimization.
+// If the estimated or actual output size of a constant-foldable node exceeds this limit, the node will
+// not be constant folded and will instead be executed at runtime.
+//
+// Option values:
+// - A positive integer (as string): Maximum allowed output size in bytes per constant-folded node.
+//   Default is "1073741824" (1 GB).
+// - "0": Disable the size limit (not recommended for untrusted models).
+static const char* const kOrtSessionOptionsConstantFoldingMaxOutputSizeInBytes =
+    "optimization.constant_folding_max_output_size_in_bytes";
+
 // It controls whether to run graph optimizations in loop or not.
 //
 // "0": disable. Graph Optimization Loop is disabled.
diff --git a/onnxruntime/contrib_ops/cpu/bert/attention_base.h b/onnxruntime/contrib_ops/cpu/bert/attention_base.h
index fad8d9275c555..b7590a2d6a547 100644
--- a/onnxruntime/contrib_ops/cpu/bert/attention_base.h
+++ b/onnxruntime/contrib_ops/cpu/bert/attention_base.h
@@ -6,6 +6,7 @@
 #include <array>
 #include <vector>
 #include "core/common/common.h"
+#include "core/common/narrow.h"
 #include "core/providers/cpu/mlas_backend_kernel_selector_config_utils.h"
 #ifndef SHARED_PROVIDER
 #include "core/framework/op_kernel.h"
@@ -57,11 +58,11 @@ class AttentionBase {
   AttentionBase(const KernelInfoType& info, bool require_same_hidden_size) {
     int64_t num_heads = 0;
     ORT_ENFORCE(info.GetAttr("num_heads", &num_heads).IsOK() && num_heads > 0);
-    num_heads_ = static_cast<int>(num_heads);
+    num_heads_ = narrow<int>(num_heads);
 
     is_unidirectional_ = info.template GetAttrOrDefault<int64_t>("unidirectional", 0) == 1;
     do_rotary_ = info.template GetAttrOrDefault<int64_t>("do_rotary", 0) == 1;
-    rotary_embedding_ = static_cast<int>(info.template GetAttrOrDefault<int64_t>("rotary_embedding_dim", 0));
+    rotary_embedding_ = narrow<int>(info.template GetAttrOrDefault<int64_t>("rotary_embedding_dim", 0));
     mask_filter_value_ = info.template GetAttrOrDefault<float>("mask_filter_value", -10000.0f);
     scale_ = info.template GetAttrOrDefault<float>("scale", 0.0f);
     if (!info.template GetAttrs<int64_t>("qkv_hidden_sizes", qkv_hidden_sizes_).IsOK()) {
@@ -222,6 +223,14 @@ inline Status AttentionBase::CheckInputs(const TensorShape& input_shape,
                            "Input 'bias' dimension 0 should have same length as dimension 1 of input 'weights'");
   }
 
+  // Q, K, V are packed along bias_dims[0]. When their hidden sizes are required to be equal,
+  // bias_dims[0] == 3 * hidden_size must be a multiple of 3.
+  if (require_same_hidden_size_ && bias_dims[0] % 3 != 0) {
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                           "Input 'bias' dimension 0 (", bias_dims[0],
+                           ") must be a multiple of 3 (Q, K, V are packed and have equal hidden sizes).");
+  }
+
   int64_t q_hidden_size = bias_dims[0] / static_cast<int64_t>(3);
   int64_t k_hidden_size = q_hidden_size;
   int64_t v_hidden_size = k_hidden_size;
@@ -241,6 +250,10 @@ inline Status AttentionBase::CheckInputs(const TensorShape& input_shape,
     q_hidden_size = qkv_hidden_sizes_[0];
     k_hidden_size = qkv_hidden_sizes_[1];
     v_hidden_size = qkv_hidden_sizes_[2];
+  } else if (q_hidden_size % num_heads_ != 0) {
+    // Match the error message produced by the qkv_hidden_sizes path above.
+    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                           "hidden_size should be divisible by num_heads:", q_hidden_size);
   }
 
   int64_t kv_sequence_length = sequence_length;
@@ -282,14 +295,14 @@ inline Status AttentionBase::CheckInputs(const TensorShape& input_shape,
                              "Inputs 'past' dimension 1 shall have same length as dimension 0 of input 0");
     }
 
-    if (static_cast<int>(past_dims[2]) != num_heads_) {
+    if (past_dims[2] != num_heads_) {
       return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
                              "Inputs 'past' dimension 2 shall have length of num_heads", num_heads_);
     }
 
-    if (static_cast<int>(past_dims[4]) != k_hidden_size / num_heads_) {
+    if (past_dims[4] != k_hidden_size / num_heads_) {
       return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
-                             "Inputs 'past' dimension 2 shall have length of ", k_hidden_size / num_heads_);
+                             "Inputs 'past' dimension 4 shall have length of ", k_hidden_size / num_heads_);
     }
 
     if (!past_present_share_buffer_) {
@@ -348,17 +361,17 @@ inline Status AttentionBase::CheckInputs(const TensorShape& input_shape,
 
   if (parameters != nullptr) {
     AttentionParameters* output_parameters = reinterpret_cast<AttentionParameters*>(parameters);
-    output_parameters->batch_size = static_cast<int>(batch_size);
-    output_parameters->sequence_length = static_cast<int>(sequence_length);
-    output_parameters->past_sequence_length = static_cast<int>(past_sequence_length);
-    output_parameters->kv_sequence_length = static_cast<int>(kv_sequence_length);
-    output_parameters->total_sequence_length = static_cast<int>(total_sequence_length);
-    output_parameters->max_sequence_length = static_cast<int>(max_sequence_length);
-    output_parameters->input_hidden_size = static_cast<int>(input_hidden_size);
-    output_parameters->hidden_size = static_cast<int>(q_hidden_size);
-    output_parameters->v_hidden_size = static_cast<int>(v_hidden_size);
-    output_parameters->head_size = static_cast<int>(q_hidden_size) / num_heads_;
-    output_parameters->v_head_size = static_cast<int>(v_hidden_size) / num_heads_;
+    output_parameters->batch_size = narrow<int>(batch_size);
+    output_parameters->sequence_length = narrow<int>(sequence_length);
+    output_parameters->past_sequence_length = narrow<int>(past_sequence_length);
+    output_parameters->kv_sequence_length = narrow<int>(kv_sequence_length);
+    output_parameters->total_sequence_length = narrow<int>(total_sequence_length);
+    output_parameters->max_sequence_length = narrow<int>(max_sequence_length);
+    output_parameters->input_hidden_size = narrow<int>(input_hidden_size);
+    output_parameters->hidden_size = narrow<int>(q_hidden_size);
+    output_parameters->v_hidden_size = narrow<int>(v_hidden_size);
+    output_parameters->head_size = narrow<int>(q_hidden_size) / num_heads_;
+    output_parameters->v_head_size = narrow<int>(v_hidden_size) / num_heads_;
     output_parameters->num_heads = num_heads_;
     output_parameters->is_unidirectional = is_unidirectional_;
     output_parameters->past_present_share_buffer = (past_present_share_buffer_ != 0 && past != nullptr);
@@ -398,7 +411,7 @@ inline Tensor* AttentionBase::GetPresent(TOpKernelContext* context,
                                          int head_size,
                                          int kv_sequence_length,
                                          int& past_sequence_length) const {
-  past_sequence_length = (nullptr != past) ? static_cast<int>(past->Shape().GetDims()[3]) : 0;
+  past_sequence_length = (nullptr != past) ? narrow<int>(past->Shape().GetDims()[3]) : 0;
   std::array<int64_t, 5> present_dims{2, batch_size, num_heads_,
                                       static_cast<int64_t>(kv_sequence_length) + past_sequence_length, head_size};
 
diff --git a/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc b/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc
index cc652ed52ee72..0749457f5a182 100644
--- a/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc
+++ b/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc
@@ -20,6 +20,9 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1,
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, EmbedLayerNormalization);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, ExpandDims);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, FusedConv);
+#ifdef USE_KLEIDIAI
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, NhwcFusedConv);
+#endif
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, FusedGemm);
 #if !defined(DISABLE_GENERATION_OPS)
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, GreedySearch);
@@ -313,6 +316,9 @@ Status RegisterCpuContribKernels(KernelRegistry& kernel_registry) {
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, EmbedLayerNormalization)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, ExpandDims)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, FusedConv)>,
+#ifdef USE_KLEIDIAI
+      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, NhwcFusedConv)>,
+#endif
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, FusedGemm)>,
 #if !defined(DISABLE_GENERATION_OPS)
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float, GreedySearch)>,
diff --git a/onnxruntime/contrib_ops/cpu/fused_conv.cc b/onnxruntime/contrib_ops/cpu/fused_conv.cc
index 5374222dbabcc..76451232421da 100644
--- a/onnxruntime/contrib_ops/cpu/fused_conv.cc
+++ b/onnxruntime/contrib_ops/cpu/fused_conv.cc
@@ -26,5 +26,18 @@ ONNX_CPU_OPERATOR_TYPED_MS_KERNEL(
         .TypeConstraint("T", DataTypeImpl::GetTensorType<float>()),
     FusedConvFloat);
 
+#ifdef USE_KLEIDIAI
+ONNX_CPU_OPERATOR_TYPED_MS_KERNEL(
+    NhwcFusedConv,
+    1,
+    float,
+    KernelDefBuilder()
+        // Allow the optional "sum" input (index 3) to be reused as the output buffer (index 0),
+        // consistent with the FusedConv kernel registration.
+        .MayInplace(3, 0)
+        .TypeConstraint("T", DataTypeImpl::GetTensorType<float>()),
+    FusedConvFloat);
+#endif
+
 }  // namespace contrib
 }  // namespace onnxruntime
diff --git a/onnxruntime/contrib_ops/cpu/quantization/attention_quant.cc b/onnxruntime/contrib_ops/cpu/quantization/attention_quant.cc
index 931677582d469..c07acbfcb0e47 100644
--- a/onnxruntime/contrib_ops/cpu/quantization/attention_quant.cc
+++ b/onnxruntime/contrib_ops/cpu/quantization/attention_quant.cc
@@ -7,6 +7,7 @@
 #include "core/util/math.h"
 #include "core/util/qmath.h"
 #include "core/util/math_cpuonly.h"
+#include "core/common/narrow.h"
 #include "core/common/safeint.h"
 #include "core/platform/threadpool.h"
 #include "core/mlas/inc/mlas.h"
@@ -71,8 +72,8 @@ Status QAttention<T>::PrePack(const Tensor& weights, int input_idx, AllocatorPtr
     return Status::OK();
   }
 
-  const size_t input_hidden_size = static_cast<size_t>(weights_dims[0]);
-  const size_t hidden_size_x3 = static_cast<size_t>(weights_dims[1]);
+  const size_t input_hidden_size = narrow<size_t>(weights_dims[0]);
+  const size_t hidden_size_x3 = narrow<size_t>(weights_dims[1]);
   const size_t hidden_size = hidden_size_x3 / 3;
   const size_t head_size = hidden_size / num_heads_;
 
@@ -89,8 +90,8 @@ Status QAttention<T>::PrePack(const Tensor& weights, int input_idx, AllocatorPtr
     return Status::OK();
   }
 
-  const size_t loop_len = 3 * static_cast<size_t>(num_heads_);
-  size_t packed_weights_data_size = packed_weights_size_ * loop_len;
+  const size_t loop_len = SafeInt<size_t>(3) * num_heads_;
+  size_t packed_weights_data_size = SafeInt<size_t>(packed_weights_size_) * loop_len;
 
   packed_weights_ = IAllocator::MakeUniquePtr<void>(alloc, packed_weights_data_size, true);
   std::byte* packed_weights_data = static_cast<std::byte*>(packed_weights_.get());
@@ -171,12 +172,6 @@ Status QAttention<T>::Compute(OpKernelContext* context) const {
   T input_scale = *(input_scale_tensor->Data<T>());
 
   bool is_weight_scale_per_column = !IsScalarOr1ElementVector(weight_scale_tensor);
-  const T* weight_scale_data = weight_scale_tensor->Data<T>();
-
-  std::vector<T> dequant_scales(weight_scale_data, weight_scale_data + weight_scale_tensor->Shape().Size());
-  std::for_each(dequant_scales.begin(), dequant_scales.end(), [&input_scale](float& dequant_scale) {
-    return dequant_scale *= input_scale;
-  });
 
   uint8_t input_zero_point = 0;
   if (i_zp_tensor != nullptr) {
@@ -194,14 +189,42 @@ Status QAttention<T>::Compute(OpKernelContext* context) const {
   }
 
   const auto& shape = input->Shape();
-  const int batch_size = static_cast<int>(shape[0]);
-  const int sequence_length = static_cast<int>(shape[1]);
-  const int input_hidden_size = static_cast<int>(shape[2]);
-
-  const auto hidden_size_x3 = weights_shape.GetDims()[1];
-  const int hidden_size = static_cast<int>(hidden_size_x3) / 3;
+  const int batch_size = narrow<int>(shape[0]);
+  const int sequence_length = narrow<int>(shape[1]);
+  const int input_hidden_size = narrow<int>(shape[2]);
+
+  const int hidden_size_x3 = narrow<int>(weights_shape.GetDims()[1]);
+  // AttentionBase::CheckInputs verifies that weights_dims[1] (== bias_dims[0]) is a multiple of 3
+  // and that hidden_size = bias_dims[0] / 3 is divisible by num_heads_.
+  const int hidden_size = hidden_size_x3 / 3;
   const int head_size = hidden_size / num_heads_;
 
+  // Validate per-column 'weight_scale' / 'weight_zero_point' shapes against the expected
+  // 3 * hidden_size. Without this check, a malicious or malformed model can supply a
+  // smaller per-column tensor and cause an out-of-bounds read in the GEMM loop below
+  // (which indexes scales/zero-points using offsets up to ~3 * hidden_size - head_size).
+  if (is_weight_scale_per_column) {
+    ORT_RETURN_IF_NOT(weight_scale_tensor->Shape().NumDimensions() == 1 &&
+                          weight_scale_tensor->Shape().Size() == hidden_size_x3,
+                      "Input 'weight_scale' must be a scalar or a 1D tensor of size 3 * hidden_size (= ",
+                      hidden_size_x3, "), got shape ", weight_scale_tensor->Shape().ToString());
+  }
+
+  if (is_weight_zp_per_column) {
+    ORT_RETURN_IF_NOT(w_zp_tensor->Shape().NumDimensions() == 1 &&
+                          w_zp_tensor->Shape().Size() == hidden_size_x3,
+                      "Input 'weight_zero_point' must be a scalar or a 1D tensor of size 3 * hidden_size (= ",
+                      hidden_size_x3, "), got shape ", w_zp_tensor->Shape().ToString());
+  }
+
+  // Build the dequantization scales after shape validation so that malformed
+  // inputs are rejected before any allocation/copy work.
+  const T* weight_scale_data = weight_scale_tensor->Data<T>();
+  std::vector<T> dequant_scales(weight_scale_data, weight_scale_data + weight_scale_tensor->Shape().Size());
+  std::for_each(dequant_scales.begin(), dequant_scales.end(), [&input_scale](T& dequant_scale) {
+    return dequant_scale *= input_scale;
+  });
+
   std::vector<int64_t> output_shape(3);
   output_shape[0] = shape[0];
   output_shape[1] = shape[1];
@@ -216,16 +239,17 @@ Status QAttention<T>::Compute(OpKernelContext* context) const {
   auto* tp = context->GetOperatorThreadPool();
   // STEP.1: gemm_data(BS, 3NH) = Scale(input(BS, D) x weights(D, 3NH)) + bias(3NH)
   // D is hidden dimension of input, where input_hidden_size (D) could be larger than hidden_size (NH) when model is pruned.
-  auto gemm_data = allocator->Alloc(SafeInt<size_t>(batch_size) * sequence_length * 3 * hidden_size * element_size);
+  const auto batch_size_x_sequence_length_x_hidden_size = SafeInt<size_t>(batch_size) * sequence_length * hidden_size;
+  void* gemm_data = allocator->Alloc(batch_size_x_sequence_length_x_hidden_size * 3 * element_size);
   BufferUniquePtr gemm_buffer(gemm_data, BufferDeleter(std::move(allocator)));
 
   auto Q = reinterpret_cast<T*>(gemm_data);
-  auto K = Q + static_cast<int64_t>(batch_size) * sequence_length * hidden_size;
-  auto V = K + static_cast<int64_t>(batch_size) * sequence_length * hidden_size;
+  auto K = Q + static_cast<size_t>(batch_size_x_sequence_length_x_hidden_size);
+  auto V = K + static_cast<size_t>(batch_size_x_sequence_length_x_hidden_size);
   T* QKV[3] = {Q, K, V};
 
   {
-    const int loop_len = 3 * batch_size * num_heads_;
+    const int loop_len = SafeInt<int>(3) * batch_size * num_heads_;
     const auto* input_data = input->Data<uint8_t>();
     const auto* bias_data = bias->Data<T>();
 
@@ -243,16 +267,17 @@ Status QAttention<T>::Compute(OpKernelContext* context) const {
     scale_bias_procs.reserve(loop_len);
 
     for (int i = 0; i < loop_len; i++) {
-      const int batch_index = static_cast<int>((i / 3) / num_heads_);
-      const int head_index = static_cast<int>((i / 3) % num_heads_);
-      const int qkv_index = static_cast<int>(i % 3);
+      const int batch_index = (i / 3) / num_heads_;
+      const int head_index = (i / 3) % num_heads_;
+      const int qkv_index = i % 3;
 
-      int input_offset = batch_index * sequence_length * input_hidden_size;
+      int input_offset = SafeInt<int>(batch_index) * sequence_length * input_hidden_size;
       int weights_offset = qkv_index * hidden_size + head_index * head_size;
       int weights_scale_offset = is_weight_scale_per_column ? weights_offset : 0;
       int weights_zp_offset = is_weight_zp_per_column ? weights_offset : 0;
       float* qkv_dest = QKV[qkv_index];
-      int qkv_offset = (batch_index * num_heads_ + head_index) * (sequence_length * head_size);
+      int qkv_offset = (SafeInt<int>(batch_index) * num_heads_ + head_index) *
+                       (SafeInt<int>(sequence_length) * head_size);
 
       //                   original           transposed            iteration
       // A: input          (BxSxD)            (B.)S x D             S x D
diff --git a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.h b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.h
index 6458bbea0d2e4..27fa56e333874 100644
--- a/onnxruntime/contrib_ops/webgpu/bert/flash_attention.h
+++ b/onnxruntime/contrib_ops/webgpu/bert/flash_attention.h
@@ -92,11 +92,12 @@ class FlashAttentionProgram final : public Program<FlashAttentionProgram> {
         q_BNSH_(q_BNSH),
         use_seqlen_k_(use_seqlen_k),
         has_head_sink_(has_head_sink) {
-    if (is_apple) {
-      // On Apple GPUs, use an optimized loop-based path with dynamic max_k_step.
-      // Compute max_k_step from shared memory budget: k_tile + v_tile = 2 * element_size * head_size * max_k_step
+    if (is_apple || is_nvidia) {
+      // On Apple and NVIDIA, use an optimized loop-based path with dynamic max_k_step.
+      // Compute max_k_step from workgroup shared memory budget: k_tile + v_tile = 2 * element_size * head_size * max_k_step
       const int element_size = is_fp16 ? 2 : 4;
-      int max_k_from_shm = 16384 / (2 * element_size * qkv_head_size);
+      constexpr int kMinWorkgroupStorageBudgetBytes = 16384;
+      int max_k_from_shm = kMinWorkgroupStorageBudgetBytes / (2 * element_size * qkv_head_size);
       if (max_k_from_shm >= 32) {
         max_k_step_ = 32;
       } else {
diff --git a/onnxruntime/core/framework/execution_frame.cc b/onnxruntime/core/framework/execution_frame.cc
index 8030690e7c92d..59efae597ceb2 100644
--- a/onnxruntime/core/framework/execution_frame.cc
+++ b/onnxruntime/core/framework/execution_frame.cc
@@ -154,19 +154,58 @@ Status IExecutionFrame::GetOrCreateNodeOutputMLValue(const int output_index, int
     p_ort_value = &all_values_[ort_value_idx];
 
     if (p_ort_value->IsAllocated()) {
-      // already allocated. verify shape matches if tensor.
+      // The OrtValue at this index is already allocated. This happens when the caller provides
+      // a pre-allocated OrtValue as an output for Run(). IExecutionFrame::Init() populates
+      // all_values_ with caller-provided outputs (fetches) before any kernel executes.
+      // Each NodeArg in an ONNX graph has exactly one producer, so at this point no kernel
+      // in the current run could have written here — the only source is a value placed
+      // during Init().
+      //
+      // When the shapes match, we reuse the caller's buffer (zero-cost for repeated runs
+      // with the same shapes).
+      //
+      // When they differ, it means the caller supplied an output OrtValue whose shape does
+      // not match what the kernel computed for this run. This typically happens when the
+      // caller reuses the output OrtValue array across Run() calls with different input
+      // shapes on a model with dynamic dimensions. The caller should either supply
+      // unallocated output OrtValues or ensure the pre-allocated shape matches.
+      //
+      // Pre-run validation (ValidateInputsOutputs in inference_session.cc) catches
+      // structural mismatches (element type, rank, fixed dimensions) before execution
+      // begins. Only dynamic dimension differences reach this point, since the actual
+      // shape is only known once the kernel computes it.
+      bool shape_matched = true;
+
       if (p_ort_value->IsTensor()) {
+        ORT_RETURN_IF_NOT(shape != nullptr, "shape must not be null for tensor output that is already allocated");
         const Tensor& tensor = p_ort_value->Get<Tensor>();
-        ORT_ENFORCE(shape && tensor.Shape() == *shape,
-                    "OrtValue shape verification failed. Current shape:", tensor.Shape(),
-                    " Requested shape:", shape ? shape->ToString() : "null");
+        shape_matched = (tensor.Shape() == *shape);
       } else if (p_ort_value->IsSparseTensor()) {
 #if !defined(DISABLE_SPARSE_TENSORS)
+        ORT_RETURN_IF_NOT(shape != nullptr, "shape must not be null for sparse tensor output that is already allocated");
         const SparseTensor& sp_tensor = p_ort_value->Get<SparseTensor>();
-        ORT_ENFORCE(shape && sp_tensor.DenseShape() == *shape,
-                    "OrtValue shape verification failed. Current shape:", sp_tensor.DenseShape(),
-                    " Requested shape:", shape ? shape->ToString() : "null");
+        shape_matched = (sp_tensor.DenseShape() == *shape);
+#endif
+      }
+
+      if (!shape_matched) {
+        const TensorShape& existing_shape = p_ort_value->IsTensor()
+                                                ? p_ort_value->Get<Tensor>().Shape()
+#if !defined(DISABLE_SPARSE_TENSORS)
+                                                : p_ort_value->Get<SparseTensor>().DenseShape();
+#else
+                                                : *shape;  // unreachable, but satisfies compiler
 #endif
+        return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                               "The output OrtValue provided for output '",
+                               node.OutputDefs()[output_index]->Name(),
+                               "' of node '", node.Name(),
+                               "' (", node.OpType(), ") has shape ", existing_shape,
+                               " but the computed output shape for this run is ", *shape,
+                               ". When calling Run() with pre-allocated output OrtValues on a model "
+                               "with dynamic output shapes, either supply unallocated output OrtValues "
+                               "or ensure the pre-allocated shapes match the expected output shapes "
+                               "for each run.");
       }
     } else {
       // shape is nullptr for traditional ML output values
diff --git a/onnxruntime/core/framework/kernel_type_str_resolver_utils.cc b/onnxruntime/core/framework/kernel_type_str_resolver_utils.cc
index 65ad60f478d78..27312a2f0ed37 100644
--- a/onnxruntime/core/framework/kernel_type_str_resolver_utils.cc
+++ b/onnxruntime/core/framework/kernel_type_str_resolver_utils.cc
@@ -9,7 +9,6 @@
 
 #include "core/common/common.h"
 #include "core/flatbuffers/schema/ort.fbs.h"
-#include "core/optimizer/layout_transformation/layout_transformation_potentially_added_ops.h"
 
 namespace onnxruntime::kernel_type_str_resolver_utils {
 
@@ -17,10 +16,6 @@ static constexpr auto* kStandaloneKernelTypeStrResolverFileIdentifier = "ktsr";
 
 #if !defined(ORT_MINIMAL_BUILD)
 
-gsl::span<const OpIdentifierWithStringViews> GetLayoutTransformationRequiredOpIdentifiers() {
-  return kLayoutTransformationPotentiallyAddedOps;
-}
-
 Status SaveKernelTypeStrResolverToBuffer(const KernelTypeStrResolver& kernel_type_str_resolver,
                                          flatbuffers::DetachedBuffer& buffer, gsl::span<const uint8_t>& buffer_span) {
   flatbuffers::FlatBufferBuilder builder;
@@ -53,368 +48,377 @@ Status AddLayoutTransformationRequiredOpsToKernelTypeStrResolver(KernelTypeStrRe
   // clang-format off
   constexpr uint8_t kLayoutTransformationRequiredOpsKernelTypeStrResolverBytes[] = {
       0x10, 0x00, 0x00, 0x00, 0x6b, 0x74, 0x73, 0x72, 0x00, 0x00, 0x06, 0x00, 0x08, 0x00, 0x04, 0x00,
-      0x06, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x32, 0x00, 0x00, 0x00, 0x7c, 0x15, 0x00, 0x00,
-      0xf8, 0x0a, 0x00, 0x00, 0x64, 0x04, 0x00, 0x00, 0x44, 0x14, 0x00, 0x00, 0xc0, 0x09, 0x00, 0x00,
-      0xbc, 0x14, 0x00, 0x00, 0x78, 0x0b, 0x00, 0x00, 0x08, 0x0c, 0x00, 0x00, 0xbc, 0x13, 0x00, 0x00,
-      0x44, 0x08, 0x00, 0x00, 0x94, 0x0d, 0x00, 0x00, 0xd8, 0x0d, 0x00, 0x00, 0xa0, 0x07, 0x00, 0x00,
-      0xe8, 0x06, 0x00, 0x00, 0x2c, 0x0a, 0x00, 0x00, 0x34, 0x0d, 0x00, 0x00, 0xdc, 0x07, 0x00, 0x00,
-      0x70, 0x12, 0x00, 0x00, 0x88, 0x06, 0x00, 0x00, 0x50, 0x0e, 0x00, 0x00, 0x4c, 0x01, 0x00, 0x00,
-      0xa0, 0x0c, 0x00, 0x00, 0x20, 0x11, 0x00, 0x00, 0x8c, 0x10, 0x00, 0x00, 0x38, 0x02, 0x00, 0x00,
-      0x88, 0x0f, 0x00, 0x00, 0x3c, 0x0f, 0x00, 0x00, 0x64, 0x05, 0x00, 0x00, 0x84, 0x11, 0x00, 0x00,
-      0xf4, 0x06, 0x00, 0x00, 0xf0, 0x08, 0x00, 0x00, 0x10, 0x0c, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00,
-      0x0c, 0x13, 0x00, 0x00, 0xc8, 0x0d, 0x00, 0x00, 0x44, 0x08, 0x00, 0x00, 0x20, 0x0a, 0x00, 0x00,
-      0xd4, 0x11, 0x00, 0x00, 0x84, 0x08, 0x00, 0x00, 0xe8, 0x05, 0x00, 0x00, 0x8c, 0x15, 0x00, 0x00,
-      0xd8, 0x0f, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x80, 0x01, 0x00, 0x00, 0x68, 0x05, 0x00, 0x00,
-      0x84, 0x0e, 0x00, 0x00, 0x8c, 0x04, 0x00, 0x00, 0x30, 0x04, 0x00, 0x00, 0x68, 0x02, 0x00, 0x00,
-      0x44, 0x12, 0x00, 0x00, 0x74, 0xea, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
-      0x02, 0x00, 0x00, 0x00, 0x38, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00,
-      0x3a, 0x55, 0x6e, 0x73, 0x71, 0x75, 0x65, 0x65, 0x7a, 0x65, 0x3a, 0x32, 0x31, 0x00, 0x00, 0x00,
-      0xa0, 0xea, 0xff, 0xff, 0x54, 0x11, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
-      0x04, 0x00, 0x00, 0x00, 0xf0, 0xea, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00, 0xbc, 0xea, 0xff, 0xff,
-      0x54, 0x15, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00,
-      0x04, 0x00, 0x00, 0x00, 0xaa, 0xea, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0xa4, 0xea, 0xff, 0xff,
-      0xe0, 0xea, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
-      0x18, 0x00, 0x00, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x3a, 0x53, 0x71, 0x75,
-      0x65, 0x65, 0x7a, 0x65, 0x3a, 0x32, 0x35, 0x00, 0x08, 0xeb, 0xff, 0xff, 0x08, 0x15, 0x00, 0x00,
+      0x06, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x33, 0x00, 0x00, 0x00, 0x3c, 0x02, 0x00, 0x00,
+      0x88, 0x0a, 0x00, 0x00, 0x60, 0x15, 0x00, 0x00, 0x6c, 0x13, 0x00, 0x00, 0x64, 0x11, 0x00, 0x00,
+      0x7c, 0x04, 0x00, 0x00, 0xe4, 0x0e, 0x00, 0x00, 0xdc, 0x13, 0x00, 0x00, 0xb0, 0x02, 0x00, 0x00,
+      0x10, 0x0b, 0x00, 0x00, 0xf8, 0x14, 0x00, 0x00, 0x40, 0x06, 0x00, 0x00, 0xa0, 0x08, 0x00, 0x00,
+      0xa0, 0x10, 0x00, 0x00, 0x08, 0x0a, 0x00, 0x00, 0xb4, 0x01, 0x00, 0x00, 0xe4, 0x04, 0x00, 0x00,
+      0xb0, 0x0b, 0x00, 0x00, 0x40, 0x10, 0x00, 0x00, 0x14, 0x0e, 0x00, 0x00, 0xb0, 0x03, 0x00, 0x00,
+      0xe4, 0x02, 0x00, 0x00, 0x34, 0x0c, 0x00, 0x00, 0x18, 0x12, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00,
+      0x2c, 0x0f, 0x00, 0x00, 0x08, 0x05, 0x00, 0x00, 0x60, 0x00, 0x00, 0x00, 0x48, 0x14, 0x00, 0x00,
+      0x44, 0x05, 0x00, 0x00, 0x70, 0x15, 0x00, 0x00, 0xa4, 0x0f, 0x00, 0x00, 0xe8, 0x07, 0x00, 0x00,
+      0x70, 0x09, 0x00, 0x00, 0x1c, 0x01, 0x00, 0x00, 0x9c, 0x10, 0x00, 0x00, 0xb0, 0x0b, 0x00, 0x00,
+      0xd8, 0x13, 0x00, 0x00, 0x1c, 0x03, 0x00, 0x00, 0x84, 0x05, 0x00, 0x00, 0x08, 0x09, 0x00, 0x00,
+      0x50, 0x0d, 0x00, 0x00, 0x10, 0x06, 0x00, 0x00, 0x60, 0x12, 0x00, 0x00, 0x58, 0x11, 0x00, 0x00,
+      0xd4, 0x0c, 0x00, 0x00, 0x64, 0x08, 0x00, 0x00, 0x4c, 0x0c, 0x00, 0x00, 0xdc, 0x0a, 0x00, 0x00,
+      0x60, 0x06, 0x00, 0x00, 0x9c, 0x15, 0x00, 0x00, 0xfc, 0xe9, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00,
+      0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00,
+      0x3a, 0x53, 0x71, 0x75, 0x65, 0x65, 0x7a, 0x65, 0x3a, 0x31, 0x31, 0x00, 0x20, 0xea, 0xff, 0xff,
+      0x34, 0x15, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00,
+      0x04, 0x00, 0x00, 0x00, 0x0e, 0xea, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0x48, 0xea, 0xff, 0xff,
+      0x44, 0xea, 0xff, 0xff, 0x18, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
+      0x40, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x50, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00,
+      0x3a, 0x51, 0x75, 0x61, 0x6e, 0x74, 0x69, 0x7a, 0x65, 0x4c, 0x69, 0x6e, 0x65, 0x61, 0x72, 0x3a,
+      0x32, 0x34, 0x00, 0x00, 0x78, 0xea, 0xff, 0xff, 0xa4, 0x15, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+      0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x54, 0xea, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00,
+      0x94, 0xea, 0xff, 0xff, 0x50, 0x15, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+      0x04, 0x00, 0x00, 0x00, 0xb0, 0xea, 0xff, 0xff, 0xac, 0xea, 0xff, 0xff, 0x40, 0x11, 0x00, 0x00,
       0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
-      0xf6, 0xea, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0xf0, 0xea, 0xff, 0xff, 0x2c, 0xeb, 0xff, 0xff,
-      0xc8, 0x10, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
-      0x7c, 0xeb, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00, 0x48, 0xeb, 0xff, 0xff, 0x18, 0x00, 0x00, 0x00,
-      0x04, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x00,
-      0x34, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x3a, 0x51, 0x75, 0x61, 0x6e, 0x74, 0x69, 0x7a,
-      0x65, 0x4c, 0x69, 0x6e, 0x65, 0x61, 0x72, 0x3a, 0x31, 0x33, 0x00, 0x00, 0x7c, 0xeb, 0xff, 0xff,
-      0x64, 0x13, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
-      0x58, 0xeb, 0xff, 0xff, 0x94, 0xeb, 0xff, 0xff, 0xf0, 0x0c, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
-      0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xe4, 0xeb, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00,
-      0xb0, 0xeb, 0xff, 0xff, 0x0c, 0x13, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
-      0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x9e, 0xeb, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01,
-      0x0c, 0xec, 0xff, 0xff, 0x02, 0x00, 0x00, 0x00, 0xd8, 0xeb, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00,
-      0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x38, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00,
-      0x0d, 0x00, 0x00, 0x00, 0x3a, 0x55, 0x6e, 0x73, 0x71, 0x75, 0x65, 0x65, 0x7a, 0x65, 0x3a, 0x32,
-      0x33, 0x00, 0x00, 0x00, 0x04, 0xec, 0xff, 0xff, 0xf0, 0x0f, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
-      0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x54, 0xec, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00,
-      0x20, 0xec, 0xff, 0xff, 0xf0, 0x13, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
-      0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x0e, 0xec, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01,
-      0x08, 0xec, 0xff, 0xff, 0x44, 0xec, 0xff, 0xff, 0x18, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
-      0x03, 0x00, 0x00, 0x00, 0x4c, 0x00, 0x00, 0x00, 0x60, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00,
-      0x12, 0x00, 0x00, 0x00, 0x3a, 0x51, 0x75, 0x61, 0x6e, 0x74, 0x69, 0x7a, 0x65, 0x4c, 0x69, 0x6e,
-      0x65, 0x61, 0x72, 0x3a, 0x32, 0x34, 0x00, 0x00, 0x78, 0xec, 0xff, 0xff, 0x94, 0x12, 0x00, 0x00,
+      0x9a, 0xea, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0x94, 0xea, 0xff, 0xff, 0x02, 0x00, 0x00, 0x00,
+      0xd4, 0xea, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+      0x18, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00, 0x3a, 0x54, 0x72, 0x61, 0x6e, 0x73, 0x70, 0x6f,
+      0x73, 0x65, 0x3a, 0x31, 0x33, 0x00, 0x00, 0x00, 0xfc, 0xea, 0xff, 0xff, 0x58, 0x14, 0x00, 0x00,
       0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
-      0x66, 0xec, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0xd4, 0xec, 0xff, 0xff, 0x02, 0x00, 0x00, 0x00,
-      0xa0, 0xec, 0xff, 0xff, 0x40, 0x12, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
-      0x04, 0x00, 0x00, 0x00, 0x7c, 0xec, 0xff, 0xff, 0xb8, 0xec, 0xff, 0xff, 0x04, 0x12, 0x00, 0x00,
-      0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x08, 0xed, 0xff, 0xff,
-      0x01, 0x00, 0x00, 0x00, 0xd4, 0xec, 0xff, 0xff, 0x28, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
-      0x07, 0x00, 0x00, 0x00, 0x88, 0x00, 0x00, 0x00, 0x38, 0x00, 0x00, 0x00, 0x04, 0x01, 0x00, 0x00,
-      0x58, 0x00, 0x00, 0x00, 0xb8, 0x00, 0x00, 0x00, 0x98, 0x00, 0x00, 0x00, 0xd8, 0x00, 0x00, 0x00,
-      0x1b, 0x00, 0x00, 0x00, 0x63, 0x6f, 0x6d, 0x2e, 0x6d, 0x69, 0x63, 0x72, 0x6f, 0x73, 0x6f, 0x66,
-      0x74, 0x3a, 0x51, 0x4c, 0x69, 0x6e, 0x65, 0x61, 0x72, 0x43, 0x6f, 0x6e, 0x76, 0x3a, 0x31, 0x00,
-      0x20, 0xed, 0xff, 0xff, 0x9c, 0x11, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
-      0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x74, 0xed, 0xff, 0xff, 0x05, 0x00, 0x00, 0x00,
-      0x7c, 0xed, 0xff, 0xff, 0x03, 0x00, 0x00, 0x00, 0x48, 0xed, 0xff, 0xff, 0x08, 0x00, 0x00, 0x00,
-      0x0c, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x54, 0x34, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
-      0x04, 0x00, 0x00, 0x00, 0xa0, 0xed, 0xff, 0xff, 0x08, 0x00, 0x00, 0x00, 0x6c, 0xed, 0xff, 0xff,
-      0x74, 0x11, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00,
-      0x04, 0x00, 0x00, 0x00, 0xc0, 0xed, 0xff, 0xff, 0x02, 0x00, 0x00, 0x00, 0x54, 0xed, 0xff, 0xff,
-      0x90, 0xed, 0xff, 0xff, 0xd8, 0x11, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
-      0x04, 0x00, 0x00, 0x00, 0xe0, 0xed, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00, 0xac, 0xed, 0xff, 0xff,
-      0x08, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x77, 0x5f, 0x73, 0x63,
-      0x61, 0x6c, 0x65, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x08, 0xee, 0xff, 0xff,
-      0x04, 0x00, 0x00, 0x00, 0xd4, 0xed, 0xff, 0xff, 0xb0, 0x0a, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
-      0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x24, 0xee, 0xff, 0xff, 0x06, 0x00, 0x00, 0x00,
-      0xf0, 0xed, 0xff, 0xff, 0x1c, 0x11, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
-      0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xde, 0xed, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01,
-      0x4c, 0xee, 0xff, 0xff, 0x07, 0x00, 0x00, 0x00, 0x18, 0xee, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00,
-      0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x44, 0x00, 0x00, 0x00,
-      0x14, 0x00, 0x00, 0x00, 0x3a, 0x44, 0x65, 0x71, 0x75, 0x61, 0x6e, 0x74, 0x69, 0x7a, 0x65, 0x4c,
-      0x69, 0x6e, 0x65, 0x61, 0x72, 0x3a, 0x31, 0x39, 0x00, 0x00, 0x00, 0x00, 0x4c, 0xee, 0xff, 0xff,
-      0x94, 0x10, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00,
-      0x04, 0x00, 0x00, 0x00, 0xa0, 0xee, 0xff, 0xff, 0x02, 0x00, 0x00, 0x00, 0x34, 0xee, 0xff, 0xff,
-      0x70, 0xee, 0xff, 0xff, 0x4c, 0x10, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
-      0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x5e, 0xee, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01,
-      0xcc, 0xee, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00, 0x98, 0xee, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00,
-      0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x1b, 0x00, 0x00, 0x00,
-      0x63, 0x6f, 0x6d, 0x2e, 0x6d, 0x69, 0x63, 0x72, 0x6f, 0x73, 0x6f, 0x66, 0x74, 0x3a, 0x4e, 0x68,
-      0x77, 0x63, 0x4d, 0x61, 0x78, 0x50, 0x6f, 0x6f, 0x6c, 0x3a, 0x31, 0x00, 0xcc, 0xee, 0xff, 0xff,
-      0x44, 0x11, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00,
-      0x04, 0x00, 0x00, 0x00, 0xba, 0xee, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0xb4, 0xee, 0xff, 0xff,
-      0xf0, 0xee, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
-      0x30, 0x00, 0x00, 0x00, 0x50, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x63, 0x6f, 0x6d, 0x2e,
-      0x6d, 0x69, 0x63, 0x72, 0x6f, 0x73, 0x6f, 0x66, 0x74, 0x3a, 0x44, 0x65, 0x71, 0x75, 0x61, 0x6e,
-      0x74, 0x69, 0x7a, 0x65, 0x4c, 0x69, 0x6e, 0x65, 0x61, 0x72, 0x3a, 0x31, 0x00, 0x00, 0x00, 0x00,
-      0x30, 0xef, 0xff, 0xff, 0xb0, 0x0f, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
-      0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x84, 0xef, 0xff, 0xff, 0x02, 0x00, 0x00, 0x00,
-      0x18, 0xef, 0xff, 0xff, 0x54, 0xef, 0xff, 0xff, 0x68, 0x0f, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
-      0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x42, 0xef, 0xff, 0xff,
-      0x00, 0x00, 0x00, 0x01, 0xb0, 0xef, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00, 0x7c, 0xef, 0xff, 0xff,
+      0xea, 0xea, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0x24, 0xeb, 0xff, 0xff, 0x20, 0xeb, 0xff, 0xff,
+      0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00,
+      0x0c, 0x00, 0x00, 0x00, 0x3a, 0x49, 0x64, 0x65, 0x6e, 0x74, 0x69, 0x74, 0x79, 0x3a, 0x32, 0x31,
+      0x00, 0x00, 0x00, 0x00, 0x48, 0xeb, 0xff, 0xff, 0xf8, 0x0e, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+      0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x36, 0xeb, 0xff, 0xff,
+      0x00, 0x00, 0x00, 0x01, 0x70, 0xeb, 0xff, 0xff, 0x6c, 0xeb, 0xff, 0xff, 0x18, 0x00, 0x00, 0x00,
+      0x04, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x60, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00,
+      0x20, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x3a, 0x44, 0x65, 0x71, 0x75, 0x61, 0x6e, 0x74,
+      0x69, 0x7a, 0x65, 0x4c, 0x69, 0x6e, 0x65, 0x61, 0x72, 0x3a, 0x31, 0x30, 0x00, 0x00, 0x00, 0x00,
+      0xa4, 0xeb, 0xff, 0xff, 0x60, 0x08, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+      0x04, 0x00, 0x00, 0x00, 0x8e, 0xeb, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0xc0, 0xeb, 0xff, 0xff,
+      0x8c, 0x08, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+      0x9c, 0xeb, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00, 0xdc, 0xeb, 0xff, 0xff, 0x78, 0x13, 0x00, 0x00,
+      0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+      0xbc, 0xeb, 0xff, 0xff, 0x02, 0x00, 0x00, 0x00, 0x04, 0xec, 0xff, 0xff, 0x00, 0xec, 0xff, 0xff,
+      0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x34, 0x00, 0x00, 0x00,
+      0x14, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x3a, 0x47, 0x61, 0x74, 0x68, 0x65, 0x72, 0x3a,
+      0x31, 0x31, 0x00, 0x00, 0x28, 0xec, 0xff, 0xff, 0x38, 0x11, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+      0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x04, 0xec, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00,
+      0x44, 0xec, 0xff, 0xff, 0x10, 0x13, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
+      0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x32, 0xec, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01,
+      0x6c, 0xec, 0xff, 0xff, 0x68, 0xec, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+      0x02, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00,
+      0x3a, 0x51, 0x75, 0x61, 0x6e, 0x74, 0x69, 0x7a, 0x65, 0x4c, 0x69, 0x6e, 0x65, 0x61, 0x72, 0x3a,
+      0x31, 0x39, 0x00, 0x00, 0x98, 0xec, 0xff, 0xff, 0x84, 0x13, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+      0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x86, 0xec, 0xff, 0xff,
+      0x00, 0x00, 0x00, 0x01, 0x80, 0xec, 0xff, 0xff, 0x02, 0x00, 0x00, 0x00, 0xc0, 0xec, 0xff, 0xff,
+      0x24, 0x13, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00,
+      0x04, 0x00, 0x00, 0x00, 0xa0, 0xec, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00, 0xe8, 0xec, 0xff, 0xff,
+      0xe4, 0xec, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+      0x18, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00, 0x3a, 0x54, 0x72, 0x61, 0x6e, 0x73, 0x70, 0x6f,
+      0x73, 0x65, 0x3a, 0x32, 0x35, 0x00, 0x00, 0x00, 0x0c, 0xed, 0xff, 0xff, 0x48, 0x12, 0x00, 0x00,
+      0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+      0xfa, 0xec, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0x34, 0xed, 0xff, 0xff, 0x30, 0xed, 0xff, 0xff,
+      0x18, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x68, 0x00, 0x00, 0x00,
+      0x3c, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x3a, 0x51, 0x75, 0x61,
+      0x6e, 0x74, 0x69, 0x7a, 0x65, 0x4c, 0x69, 0x6e, 0x65, 0x61, 0x72, 0x3a, 0x31, 0x33, 0x00, 0x00,
+      0x64, 0xed, 0xff, 0xff, 0xb0, 0x0a, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+      0x04, 0x00, 0x00, 0x00, 0x40, 0xed, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00, 0x80, 0xed, 0xff, 0xff,
+      0x9c, 0x12, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00,
+      0x04, 0x00, 0x00, 0x00, 0x6e, 0xed, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0x68, 0xed, 0xff, 0xff,
+      0x02, 0x00, 0x00, 0x00, 0xa8, 0xed, 0xff, 0xff, 0x3c, 0x12, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+      0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xc4, 0xed, 0xff, 0xff, 0xc0, 0xed, 0xff, 0xff,
+      0x18, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x44, 0x00, 0x00, 0x00,
+      0x64, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x3a, 0x44, 0x65, 0x71,
+      0x75, 0x61, 0x6e, 0x74, 0x69, 0x7a, 0x65, 0x4c, 0x69, 0x6e, 0x65, 0x61, 0x72, 0x3a, 0x32, 0x34,
+      0x00, 0x00, 0x00, 0x00, 0xf8, 0xed, 0xff, 0xff, 0xf4, 0x0d, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+      0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xe2, 0xed, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01,
+      0x14, 0xee, 0xff, 0xff, 0xd0, 0x11, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
+      0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xf4, 0xed, 0xff, 0xff, 0x02, 0x00, 0x00, 0x00,
+      0x3c, 0xee, 0xff, 0xff, 0x38, 0xee, 0xff, 0xff, 0xe4, 0x11, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+      0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x14, 0xee, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00,
+      0x54, 0xee, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+      0x18, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x3a, 0x49, 0x64, 0x65, 0x6e, 0x74, 0x69, 0x74,
+      0x79, 0x3a, 0x32, 0x33, 0x00, 0x00, 0x00, 0x00, 0x7c, 0xee, 0xff, 0xff, 0xc4, 0x0b, 0x00, 0x00,
+      0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+      0x6a, 0xee, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0xa4, 0xee, 0xff, 0xff, 0xa0, 0xee, 0xff, 0xff,
       0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00,
-      0x0b, 0x00, 0x00, 0x00, 0x3a, 0x53, 0x71, 0x75, 0x65, 0x65, 0x7a, 0x65, 0x3a, 0x31, 0x31, 0x00,
-      0xa0, 0xef, 0xff, 0xff, 0x70, 0x10, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
-      0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x8e, 0xef, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01,
-      0x88, 0xef, 0xff, 0xff, 0xc4, 0xef, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
-      0x02, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00,
-      0x3a, 0x55, 0x6e, 0x73, 0x71, 0x75, 0x65, 0x65, 0x7a, 0x65, 0x3a, 0x32, 0x34, 0x00, 0x00, 0x00,
-      0xf0, 0xef, 0xff, 0xff, 0x20, 0x10, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
-      0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xde, 0xef, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01,
-      0xd8, 0xef, 0xff, 0xff, 0x14, 0xf0, 0xff, 0xff, 0xe0, 0x0b, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
-      0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x64, 0xf0, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00,
-      0x30, 0xf0, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
-      0x18, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x3a, 0x55, 0x6e, 0x73, 0x71, 0x75, 0x65, 0x65,
-      0x7a, 0x65, 0x3a, 0x31, 0x00, 0x00, 0x00, 0x00, 0x58, 0xf0, 0xff, 0xff, 0xb8, 0x0f, 0x00, 0x00,
+      0x0a, 0x00, 0x00, 0x00, 0x3a, 0x53, 0x71, 0x75, 0x65, 0x65, 0x7a, 0x65, 0x3a, 0x31, 0x00, 0x00,
+      0xc4, 0xee, 0xff, 0xff, 0x90, 0x10, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
+      0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xb2, 0xee, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01,
+      0xec, 0xee, 0xff, 0xff, 0xe8, 0xee, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+      0x02, 0x00, 0x00, 0x00, 0x34, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00,
+      0x3a, 0x53, 0x71, 0x75, 0x65, 0x65, 0x7a, 0x65, 0x3a, 0x32, 0x31, 0x00, 0x10, 0xef, 0xff, 0xff,
+      0x6c, 0x10, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+      0xec, 0xee, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00, 0x2c, 0xef, 0xff, 0xff, 0x28, 0x10, 0x00, 0x00,
       0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
-      0x46, 0xf0, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0x40, 0xf0, 0xff, 0xff, 0x7c, 0xf0, 0xff, 0xff,
-      0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00,
-      0x0c, 0x00, 0x00, 0x00, 0x3a, 0x49, 0x64, 0x65, 0x6e, 0x74, 0x69, 0x74, 0x79, 0x3a, 0x32, 0x35,
-      0x00, 0x00, 0x00, 0x00, 0xa4, 0xf0, 0xff, 0xff, 0xec, 0x0b, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
-      0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x92, 0xf0, 0xff, 0xff,
-      0x00, 0x00, 0x00, 0x01, 0x8c, 0xf0, 0xff, 0xff, 0xc8, 0xf0, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00,
-      0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00,
-      0x3a, 0x49, 0x64, 0x65, 0x6e, 0x74, 0x69, 0x74, 0x79, 0x3a, 0x31, 0x36, 0x00, 0x00, 0x00, 0x00,
-      0xf0, 0xf0, 0xff, 0xff, 0xa0, 0x0b, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
-      0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xde, 0xf0, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01,
-      0xd8, 0xf0, 0xff, 0xff, 0x14, 0xf1, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
-      0x02, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00,
-      0x3a, 0x53, 0x71, 0x75, 0x65, 0x65, 0x7a, 0x65, 0x3a, 0x32, 0x31, 0x00, 0x3c, 0xf1, 0xff, 0xff,
-      0xd4, 0x0e, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00,
-      0x04, 0x00, 0x00, 0x00, 0x2a, 0xf1, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0x24, 0xf1, 0xff, 0xff,
-      0x60, 0xf1, 0xff, 0xff, 0x94, 0x0a, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
-      0x04, 0x00, 0x00, 0x00, 0xb0, 0xf1, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00, 0x7c, 0xf1, 0xff, 0xff,
+      0x1a, 0xef, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0x54, 0xef, 0xff, 0xff, 0x50, 0xef, 0xff, 0xff,
       0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00,
-      0x0c, 0x00, 0x00, 0x00, 0x3a, 0x49, 0x64, 0x65, 0x6e, 0x74, 0x69, 0x74, 0x79, 0x3a, 0x31, 0x34,
-      0x00, 0x00, 0x00, 0x00, 0xa4, 0xf1, 0xff, 0xff, 0xec, 0x0a, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
-      0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x92, 0xf1, 0xff, 0xff,
-      0x00, 0x00, 0x00, 0x01, 0x8c, 0xf1, 0xff, 0xff, 0xc8, 0xf1, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00,
+      0x0c, 0x00, 0x00, 0x00, 0x3a, 0x55, 0x6e, 0x73, 0x71, 0x75, 0x65, 0x65, 0x7a, 0x65, 0x3a, 0x31,
+      0x00, 0x00, 0x00, 0x00, 0x78, 0xef, 0xff, 0xff, 0xdc, 0x0f, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+      0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x66, 0xef, 0xff, 0xff,
+      0x00, 0x00, 0x00, 0x01, 0xa0, 0xef, 0xff, 0xff, 0x9c, 0xef, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00,
       0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00,
-      0x3a, 0x49, 0x64, 0x65, 0x6e, 0x74, 0x69, 0x74, 0x79, 0x3a, 0x32, 0x33, 0x00, 0x00, 0x00, 0x00,
-      0xf0, 0xf1, 0xff, 0xff, 0xa0, 0x0a, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
-      0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xde, 0xf1, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01,
-      0xd8, 0xf1, 0xff, 0xff, 0x14, 0xf2, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
-      0x02, 0x00, 0x00, 0x00, 0x34, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00,
-      0x3a, 0x47, 0x61, 0x74, 0x68, 0x65, 0x72, 0x3a, 0x31, 0x33, 0x00, 0x00, 0x3c, 0xf2, 0xff, 0xff,
-      0xa0, 0x0b, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
-      0x8c, 0xf2, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00, 0x58, 0xf2, 0xff, 0xff, 0xb8, 0x0d, 0x00, 0x00,
+      0x3a, 0x49, 0x64, 0x65, 0x6e, 0x74, 0x69, 0x74, 0x79, 0x3a, 0x31, 0x33, 0x00, 0x00, 0x00, 0x00,
+      0xc4, 0xef, 0xff, 0xff, 0x90, 0x0f, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
+      0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xb2, 0xef, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01,
+      0xec, 0xef, 0xff, 0xff, 0xe8, 0xef, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+      0x02, 0x00, 0x00, 0x00, 0x38, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00,
+      0x3a, 0x55, 0x6e, 0x73, 0x71, 0x75, 0x65, 0x65, 0x7a, 0x65, 0x3a, 0x32, 0x31, 0x00, 0x00, 0x00,
+      0x14, 0xf0, 0xff, 0xff, 0x68, 0x0f, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+      0x04, 0x00, 0x00, 0x00, 0xf0, 0xef, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00, 0x30, 0xf0, 0xff, 0xff,
+      0x24, 0x0f, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00,
+      0x04, 0x00, 0x00, 0x00, 0x1e, 0xf0, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0x58, 0xf0, 0xff, 0xff,
+      0x54, 0xf0, 0xff, 0xff, 0x28, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00,
+      0x98, 0x00, 0x00, 0x00, 0xb8, 0x00, 0x00, 0x00, 0x04, 0x01, 0x00, 0x00, 0x30, 0x00, 0x00, 0x00,
+      0xd4, 0x00, 0x00, 0x00, 0x68, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0x1b, 0x00, 0x00, 0x00,
+      0x63, 0x6f, 0x6d, 0x2e, 0x6d, 0x69, 0x63, 0x72, 0x6f, 0x73, 0x6f, 0x66, 0x74, 0x3a, 0x51, 0x4c,
+      0x69, 0x6e, 0x65, 0x61, 0x72, 0x43, 0x6f, 0x6e, 0x76, 0x3a, 0x31, 0x00, 0xa0, 0xf0, 0xff, 0xff,
+      0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00,
+      0x02, 0x00, 0x00, 0x00, 0x54, 0x34, 0x00, 0x00, 0x84, 0xf0, 0xff, 0xff, 0x08, 0x00, 0x00, 0x00,
+      0xc4, 0xf0, 0xff, 0xff, 0x50, 0x07, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+      0x04, 0x00, 0x00, 0x00, 0xa0, 0xf0, 0xff, 0xff, 0x06, 0x00, 0x00, 0x00, 0xe0, 0xf0, 0xff, 0xff,
+      0x6c, 0x03, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+      0xbc, 0xf0, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00, 0xfc, 0xf0, 0xff, 0xff, 0xe8, 0x0e, 0x00, 0x00,
       0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
-      0x46, 0xf2, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0x40, 0xf2, 0xff, 0xff, 0x7c, 0xf2, 0xff, 0xff,
-      0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00,
-      0x0d, 0x00, 0x00, 0x00, 0x3a, 0x54, 0x72, 0x61, 0x6e, 0x73, 0x70, 0x6f, 0x73, 0x65, 0x3a, 0x32,
-      0x31, 0x00, 0x00, 0x00, 0xa4, 0xf2, 0xff, 0xff, 0x6c, 0x0d, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
-      0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x92, 0xf2, 0xff, 0xff,
-      0x00, 0x00, 0x00, 0x01, 0x8c, 0xf2, 0xff, 0xff, 0xc8, 0xf2, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00,
-      0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00,
-      0x3a, 0x54, 0x72, 0x61, 0x6e, 0x73, 0x70, 0x6f, 0x73, 0x65, 0x3a, 0x32, 0x35, 0x00, 0x00, 0x00,
-      0xf0, 0xf2, 0xff, 0xff, 0x20, 0x0d, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
-      0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xde, 0xf2, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01,
-      0xd8, 0xf2, 0xff, 0xff, 0x14, 0xf3, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+      0xdc, 0xf0, 0xff, 0xff, 0x02, 0x00, 0x00, 0x00, 0x24, 0xf1, 0xff, 0xff, 0x20, 0xf1, 0xff, 0xff,
+      0xfc, 0x0e, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00,
+      0x04, 0x00, 0x00, 0x00, 0x00, 0xf1, 0xff, 0xff, 0x05, 0x00, 0x00, 0x00, 0x08, 0xf1, 0xff, 0xff,
+      0x03, 0x00, 0x00, 0x00, 0x48, 0xf1, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+      0x01, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x77, 0x5f, 0x73, 0x63,
+      0x61, 0x6c, 0x65, 0x00, 0x30, 0xf1, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00, 0x70, 0xf1, 0xff, 0xff,
+      0x7c, 0x0a, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00,
+      0x04, 0x00, 0x00, 0x00, 0x5e, 0xf1, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0x58, 0xf1, 0xff, 0xff,
+      0x07, 0x00, 0x00, 0x00, 0x98, 0xf1, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
       0x02, 0x00, 0x00, 0x00, 0x34, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00,
-      0x3a, 0x53, 0x71, 0x75, 0x65, 0x65, 0x7a, 0x65, 0x3a, 0x32, 0x33, 0x00, 0x3c, 0xf3, 0xff, 0xff,
-      0xb8, 0x08, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
-      0x8c, 0xf3, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00, 0x58, 0xf3, 0xff, 0xff, 0xb8, 0x0c, 0x00, 0x00,
+      0x3a, 0x53, 0x71, 0x75, 0x65, 0x65, 0x7a, 0x65, 0x3a, 0x32, 0x35, 0x00, 0xc0, 0xf1, 0xff, 0xff,
+      0xbc, 0x0d, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+      0x9c, 0xf1, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00, 0xdc, 0xf1, 0xff, 0xff, 0x78, 0x0d, 0x00, 0x00,
       0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
-      0x46, 0xf3, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0x40, 0xf3, 0xff, 0xff, 0x7c, 0xf3, 0xff, 0xff,
-      0x18, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x44, 0x00, 0x00, 0x00,
-      0x64, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x3a, 0x44, 0x65, 0x71,
-      0x75, 0x61, 0x6e, 0x74, 0x69, 0x7a, 0x65, 0x4c, 0x69, 0x6e, 0x65, 0x61, 0x72, 0x3a, 0x32, 0x33,
-      0x00, 0x00, 0x00, 0x00, 0xb4, 0xf3, 0xff, 0xff, 0x58, 0x0b, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
-      0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x9e, 0xf3, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01,
-      0xd0, 0xf3, 0xff, 0xff, 0x10, 0x0b, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
-      0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x24, 0xf4, 0xff, 0xff, 0x02, 0x00, 0x00, 0x00,
-      0xb8, 0xf3, 0xff, 0xff, 0xf4, 0xf3, 0xff, 0xff, 0xc8, 0x0a, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
-      0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x44, 0xf4, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00,
-      0x10, 0xf4, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
-      0x18, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x3a, 0x49, 0x64, 0x65, 0x6e, 0x74, 0x69, 0x74,
-      0x79, 0x3a, 0x31, 0x39, 0x00, 0x00, 0x00, 0x00, 0x38, 0xf4, 0xff, 0xff, 0x58, 0x08, 0x00, 0x00,
+      0xca, 0xf1, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0x04, 0xf2, 0xff, 0xff, 0x00, 0xf2, 0xff, 0xff,
+      0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00,
+      0x0c, 0x00, 0x00, 0x00, 0x3a, 0x49, 0x64, 0x65, 0x6e, 0x74, 0x69, 0x74, 0x79, 0x3a, 0x31, 0x34,
+      0x00, 0x00, 0x00, 0x00, 0x28, 0xf2, 0xff, 0xff, 0x18, 0x08, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+      0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x16, 0xf2, 0xff, 0xff,
+      0x00, 0x00, 0x00, 0x01, 0x50, 0xf2, 0xff, 0xff, 0x4c, 0xf2, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00,
+      0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x58, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00,
+      0x20, 0x00, 0x00, 0x00, 0x63, 0x6f, 0x6d, 0x2e, 0x6d, 0x69, 0x63, 0x72, 0x6f, 0x73, 0x6f, 0x66,
+      0x74, 0x3a, 0x44, 0x65, 0x71, 0x75, 0x61, 0x6e, 0x74, 0x69, 0x7a, 0x65, 0x4c, 0x69, 0x6e, 0x65,
+      0x61, 0x72, 0x3a, 0x31, 0x00, 0x00, 0x00, 0x00, 0x8c, 0xf2, 0xff, 0xff, 0x90, 0x0d, 0x00, 0x00,
       0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
-      0x26, 0xf4, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0x20, 0xf4, 0xff, 0xff, 0x5c, 0xf4, 0xff, 0xff,
+      0x7a, 0xf2, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0x74, 0xf2, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00,
+      0xb4, 0xf2, 0xff, 0xff, 0x30, 0x0d, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
+      0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x94, 0xf2, 0xff, 0xff, 0x02, 0x00, 0x00, 0x00,
+      0xdc, 0xf2, 0xff, 0xff, 0xd8, 0xf2, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+      0x01, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00, 0x3a, 0x55, 0x6e, 0x73,
+      0x71, 0x75, 0x65, 0x65, 0x7a, 0x65, 0x3a, 0x31, 0x31, 0x00, 0x00, 0x00, 0x00, 0xf3, 0xff, 0xff,
+      0x54, 0x0c, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00,
+      0x04, 0x00, 0x00, 0x00, 0xee, 0xf2, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0x28, 0xf3, 0xff, 0xff,
+      0x24, 0xf3, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+      0x18, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x3a, 0x54, 0x72, 0x61, 0x6e, 0x73, 0x70, 0x6f,
+      0x73, 0x65, 0x3a, 0x31, 0x00, 0x00, 0x00, 0x00, 0x4c, 0xf3, 0xff, 0xff, 0x08, 0x0c, 0x00, 0x00,
+      0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+      0x3a, 0xf3, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0x74, 0xf3, 0xff, 0xff, 0x70, 0xf3, 0xff, 0xff,
       0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00,
-      0x0d, 0x00, 0x00, 0x00, 0x3a, 0x54, 0x72, 0x61, 0x6e, 0x73, 0x70, 0x6f, 0x73, 0x65, 0x3a, 0x32,
-      0x33, 0x00, 0x00, 0x00, 0x84, 0xf4, 0xff, 0xff, 0x8c, 0x0b, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
-      0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x72, 0xf4, 0xff, 0xff,
-      0x00, 0x00, 0x00, 0x01, 0x6c, 0xf4, 0xff, 0xff, 0xa8, 0xf4, 0xff, 0xff, 0x18, 0x00, 0x00, 0x00,
-      0x04, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x44, 0x00, 0x00, 0x00, 0x64, 0x00, 0x00, 0x00,
+      0x0c, 0x00, 0x00, 0x00, 0x3a, 0x49, 0x64, 0x65, 0x6e, 0x74, 0x69, 0x74, 0x79, 0x3a, 0x31, 0x39,
+      0x00, 0x00, 0x00, 0x00, 0x98, 0xf3, 0xff, 0xff, 0xa8, 0x06, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+      0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x86, 0xf3, 0xff, 0xff,
+      0x00, 0x00, 0x00, 0x01, 0xc0, 0xf3, 0xff, 0xff, 0xbc, 0xf3, 0xff, 0xff, 0x18, 0x00, 0x00, 0x00,
+      0x04, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x4c, 0x00, 0x00, 0x00, 0x6c, 0x00, 0x00, 0x00,
       0x20, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x3a, 0x44, 0x65, 0x71, 0x75, 0x61, 0x6e, 0x74,
       0x69, 0x7a, 0x65, 0x4c, 0x69, 0x6e, 0x65, 0x61, 0x72, 0x3a, 0x31, 0x33, 0x00, 0x00, 0x00, 0x00,
-      0xe0, 0xf4, 0xff, 0xff, 0xb0, 0x0a, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
-      0x04, 0x00, 0x00, 0x00, 0xca, 0xf4, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0xfc, 0xf4, 0xff, 0xff,
-      0x14, 0x0b, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00,
-      0x04, 0x00, 0x00, 0x00, 0x50, 0xf5, 0xff, 0xff, 0x02, 0x00, 0x00, 0x00, 0xe4, 0xf4, 0xff, 0xff,
-      0x20, 0xf5, 0xff, 0xff, 0x48, 0x0a, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
-      0x04, 0x00, 0x00, 0x00, 0x70, 0xf5, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00, 0x3c, 0xf5, 0xff, 0xff,
-      0x18, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00,
-      0x64, 0x00, 0x00, 0x00, 0x44, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x3a, 0x44, 0x65, 0x71,
-      0x75, 0x61, 0x6e, 0x74, 0x69, 0x7a, 0x65, 0x4c, 0x69, 0x6e, 0x65, 0x61, 0x72, 0x3a, 0x32, 0x35,
-      0x00, 0x00, 0x00, 0x00, 0x74, 0xf5, 0xff, 0xff, 0x6c, 0x09, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
-      0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xc8, 0xf5, 0xff, 0xff,
-      0x02, 0x00, 0x00, 0x00, 0x5c, 0xf5, 0xff, 0xff, 0x98, 0xf5, 0xff, 0xff, 0x74, 0x09, 0x00, 0x00,
-      0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x82, 0xf5, 0xff, 0xff,
-      0x00, 0x00, 0x00, 0x01, 0xb4, 0xf5, 0xff, 0xff, 0x08, 0x09, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
-      0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x04, 0xf6, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00,
-      0xd0, 0xf5, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
-      0x34, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x3a, 0x47, 0x61, 0x74,
-      0x68, 0x65, 0x72, 0x3a, 0x31, 0x00, 0x00, 0x00, 0xf8, 0xf5, 0xff, 0xff, 0xe4, 0x07, 0x00, 0x00,
-      0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x48, 0xf6, 0xff, 0xff,
-      0x01, 0x00, 0x00, 0x00, 0x14, 0xf6, 0xff, 0xff, 0xfc, 0x09, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
-      0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0xf6, 0xff, 0xff,
-      0x00, 0x00, 0x00, 0x01, 0xfc, 0xf5, 0xff, 0xff, 0x38, 0xf6, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00,
-      0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x34, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00,
-      0x0b, 0x00, 0x00, 0x00, 0x3a, 0x53, 0x71, 0x75, 0x65, 0x65, 0x7a, 0x65, 0x3a, 0x32, 0x34, 0x00,
-      0x60, 0xf6, 0xff, 0xff, 0x94, 0x05, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
-      0x04, 0x00, 0x00, 0x00, 0xb0, 0xf6, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00, 0x7c, 0xf6, 0xff, 0xff,
-      0x94, 0x09, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00,
-      0x04, 0x00, 0x00, 0x00, 0x6a, 0xf6, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0x64, 0xf6, 0xff, 0xff,
-      0xa0, 0xf6, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
-      0x20, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x3a, 0x51, 0x75, 0x61,
-      0x6e, 0x74, 0x69, 0x7a, 0x65, 0x4c, 0x69, 0x6e, 0x65, 0x61, 0x72, 0x3a, 0x31, 0x39, 0x00, 0x00,
-      0xd0, 0xf6, 0xff, 0xff, 0x10, 0x08, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
-      0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x24, 0xf7, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00,
-      0xb8, 0xf6, 0xff, 0xff, 0xf4, 0xf6, 0xff, 0xff, 0xc8, 0x07, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
-      0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xe2, 0xf6, 0xff, 0xff,
-      0x00, 0x00, 0x00, 0x01, 0x50, 0xf7, 0xff, 0xff, 0x02, 0x00, 0x00, 0x00, 0x1c, 0xf7, 0xff, 0xff,
-      0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00,
-      0x0c, 0x00, 0x00, 0x00, 0x3a, 0x49, 0x64, 0x65, 0x6e, 0x74, 0x69, 0x74, 0x79, 0x3a, 0x32, 0x31,
-      0x00, 0x00, 0x00, 0x00, 0x44, 0xf7, 0xff, 0xff, 0x4c, 0x05, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
-      0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x32, 0xf7, 0xff, 0xff,
-      0x00, 0x00, 0x00, 0x01, 0x2c, 0xf7, 0xff, 0xff, 0x68, 0xf7, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00,
-      0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00,
-      0x3a, 0x49, 0x64, 0x65, 0x6e, 0x74, 0x69, 0x74, 0x79, 0x3a, 0x31, 0x00, 0x8c, 0xf7, 0xff, 0xff,
-      0x84, 0x08, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00,
-      0x04, 0x00, 0x00, 0x00, 0x7a, 0xf7, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0x74, 0xf7, 0xff, 0xff,
-      0xb0, 0xf7, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+      0xf4, 0xf3, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+      0x0c, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x79, 0x00, 0x00, 0x00, 0xe6, 0xf3, 0xff, 0xff,
+      0x00, 0x00, 0x00, 0x01, 0x18, 0xf4, 0xff, 0xff, 0x3c, 0x0b, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+      0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xf8, 0xf3, 0xff, 0xff,
+      0x02, 0x00, 0x00, 0x00, 0x40, 0xf4, 0xff, 0xff, 0x3c, 0xf4, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00,
+      0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00,
+      0x78, 0x5f, 0x73, 0x63, 0x61, 0x6c, 0x65, 0x00, 0x24, 0xf4, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00,
+      0x64, 0xf4, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
+      0x34, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x3a, 0x47, 0x61, 0x74,
+      0x68, 0x65, 0x72, 0x3a, 0x31, 0x33, 0x00, 0x00, 0x8c, 0xf4, 0xff, 0xff, 0xd4, 0x08, 0x00, 0x00,
+      0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x68, 0xf4, 0xff, 0xff,
+      0x01, 0x00, 0x00, 0x00, 0xa8, 0xf4, 0xff, 0xff, 0xac, 0x0a, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+      0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x96, 0xf4, 0xff, 0xff,
+      0x00, 0x00, 0x00, 0x01, 0xd0, 0xf4, 0xff, 0xff, 0xcc, 0xf4, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00,
+      0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x1b, 0x00, 0x00, 0x00,
+      0x63, 0x6f, 0x6d, 0x2e, 0x6d, 0x69, 0x63, 0x72, 0x6f, 0x73, 0x6f, 0x66, 0x74, 0x3a, 0x4e, 0x68,
+      0x77, 0x63, 0x4d, 0x61, 0x78, 0x50, 0x6f, 0x6f, 0x6c, 0x3a, 0x31, 0x00, 0x00, 0xf5, 0xff, 0xff,
+      0x54, 0x0a, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00,
+      0x04, 0x00, 0x00, 0x00, 0xee, 0xf4, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0x28, 0xf5, 0xff, 0xff,
+      0x24, 0xf5, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
       0x18, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x3a, 0x49, 0x64, 0x65, 0x6e, 0x74, 0x69, 0x74,
-      0x79, 0x3a, 0x31, 0x33, 0x00, 0x00, 0x00, 0x00, 0xd8, 0xf7, 0xff, 0xff, 0x38, 0x08, 0x00, 0x00,
+      0x79, 0x3a, 0x32, 0x34, 0x00, 0x00, 0x00, 0x00, 0x4c, 0xf5, 0xff, 0xff, 0xf4, 0x04, 0x00, 0x00,
       0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
-      0xc6, 0xf7, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0xc0, 0xf7, 0xff, 0xff, 0xfc, 0xf7, 0xff, 0xff,
+      0x3a, 0xf5, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0x74, 0xf5, 0xff, 0xff, 0x70, 0xf5, 0xff, 0xff,
       0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00,
-      0x0d, 0x00, 0x00, 0x00, 0x3a, 0x54, 0x72, 0x61, 0x6e, 0x73, 0x70, 0x6f, 0x73, 0x65, 0x3a, 0x31,
-      0x33, 0x00, 0x00, 0x00, 0x24, 0xf8, 0xff, 0xff, 0xec, 0x07, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
-      0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x12, 0xf8, 0xff, 0xff,
-      0x00, 0x00, 0x00, 0x01, 0x0c, 0xf8, 0xff, 0xff, 0x48, 0xf8, 0xff, 0xff, 0x18, 0x00, 0x00, 0x00,
-      0x04, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x74, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00,
-      0x1c, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x3a, 0x51, 0x75, 0x61, 0x6e, 0x74, 0x69, 0x7a,
-      0x65, 0x4c, 0x69, 0x6e, 0x65, 0x61, 0x72, 0x3a, 0x31, 0x30, 0x00, 0x00, 0x7c, 0xf8, 0xff, 0xff,
-      0x08, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x79, 0x5f, 0x73, 0x63,
-      0x61, 0x6c, 0x65, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xd8, 0xf8, 0xff, 0xff,
-      0x01, 0x00, 0x00, 0x00, 0xa4, 0xf8, 0xff, 0xff, 0x18, 0x06, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
-      0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x92, 0xf8, 0xff, 0xff,
-      0x00, 0x00, 0x00, 0x01, 0x00, 0xf9, 0xff, 0xff, 0x02, 0x00, 0x00, 0x00, 0xcc, 0xf8, 0xff, 0xff,
-      0x14, 0x06, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
-      0xa8, 0xf8, 0xff, 0xff, 0xe4, 0xf8, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
-      0x02, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00,
-      0x3a, 0x55, 0x6e, 0x73, 0x71, 0x75, 0x65, 0x65, 0x7a, 0x65, 0x3a, 0x32, 0x35, 0x00, 0x00, 0x00,
-      0x10, 0xf9, 0xff, 0xff, 0x00, 0x07, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
-      0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xfe, 0xf8, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01,
-      0xf8, 0xf8, 0xff, 0xff, 0x34, 0xf9, 0xff, 0xff, 0xc0, 0x02, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
-      0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x84, 0xf9, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00,
-      0x50, 0xf9, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
-      0x14, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x3a, 0x53, 0x71, 0x75, 0x65, 0x65, 0x7a, 0x65,
-      0x3a, 0x31, 0x00, 0x00, 0x74, 0xf9, 0xff, 0xff, 0x9c, 0x06, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
-      0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x62, 0xf9, 0xff, 0xff,
-      0x00, 0x00, 0x00, 0x01, 0x5c, 0xf9, 0xff, 0xff, 0x98, 0xf9, 0xff, 0xff, 0x18, 0x00, 0x00, 0x00,
-      0x04, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x60, 0x00, 0x00, 0x00,
-      0x34, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x3a, 0x51, 0x75, 0x61, 0x6e, 0x74, 0x69, 0x7a,
-      0x65, 0x4c, 0x69, 0x6e, 0x65, 0x61, 0x72, 0x3a, 0x32, 0x35, 0x00, 0x00, 0xcc, 0xf9, 0xff, 0xff,
-      0x14, 0x05, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
-      0xa8, 0xf9, 0xff, 0xff, 0xe4, 0xf9, 0xff, 0xff, 0x28, 0x05, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
-      0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xd2, 0xf9, 0xff, 0xff,
-      0x00, 0x00, 0x00, 0x01, 0x40, 0xfa, 0xff, 0xff, 0x02, 0x00, 0x00, 0x00, 0x0c, 0xfa, 0xff, 0xff,
-      0xb0, 0x04, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
-      0x5c, 0xfa, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00, 0x28, 0xfa, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00,
-      0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00,
-      0x0d, 0x00, 0x00, 0x00, 0x3a, 0x55, 0x6e, 0x73, 0x71, 0x75, 0x65, 0x65, 0x7a, 0x65, 0x3a, 0x31,
-      0x33, 0x00, 0x00, 0x00, 0x54, 0xfa, 0xff, 0xff, 0xbc, 0x05, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
-      0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x42, 0xfa, 0xff, 0xff,
-      0x00, 0x00, 0x00, 0x01, 0x3c, 0xfa, 0xff, 0xff, 0x78, 0xfa, 0xff, 0xff, 0x7c, 0x01, 0x00, 0x00,
-      0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xc8, 0xfa, 0xff, 0xff,
-      0x01, 0x00, 0x00, 0x00, 0x94, 0xfa, 0xff, 0xff, 0x18, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
-      0x03, 0x00, 0x00, 0x00, 0x68, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x38, 0x00, 0x00, 0x00,
+      0x0d, 0x00, 0x00, 0x00, 0x3a, 0x54, 0x72, 0x61, 0x6e, 0x73, 0x70, 0x6f, 0x73, 0x65, 0x3a, 0x32,
+      0x33, 0x00, 0x00, 0x00, 0x98, 0xf5, 0xff, 0xff, 0xbc, 0x09, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+      0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x86, 0xf5, 0xff, 0xff,
+      0x00, 0x00, 0x00, 0x01, 0xc0, 0xf5, 0xff, 0xff, 0xbc, 0xf5, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00,
+      0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00,
       0x12, 0x00, 0x00, 0x00, 0x3a, 0x51, 0x75, 0x61, 0x6e, 0x74, 0x69, 0x7a, 0x65, 0x4c, 0x69, 0x6e,
-      0x65, 0x61, 0x72, 0x3a, 0x32, 0x33, 0x00, 0x00, 0xc8, 0xfa, 0xff, 0xff, 0xf4, 0x03, 0x00, 0x00,
-      0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x18, 0xfb, 0xff, 0xff,
-      0x01, 0x00, 0x00, 0x00, 0xe4, 0xfa, 0xff, 0xff, 0x28, 0x04, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
-      0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xd2, 0xfa, 0xff, 0xff,
-      0x00, 0x00, 0x00, 0x01, 0x40, 0xfb, 0xff, 0xff, 0x02, 0x00, 0x00, 0x00, 0x0c, 0xfb, 0xff, 0xff,
-      0xd4, 0x03, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
-      0xe8, 0xfa, 0xff, 0xff, 0x24, 0xfb, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
-      0x02, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00,
-      0x3a, 0x51, 0x75, 0x61, 0x6e, 0x74, 0x69, 0x7a, 0x65, 0x4c, 0x69, 0x6e, 0x65, 0x61, 0x72, 0x3a,
-      0x32, 0x31, 0x00, 0x00, 0x54, 0xfb, 0xff, 0xff, 0x68, 0x03, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
-      0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x42, 0xfb, 0xff, 0xff,
-      0x00, 0x00, 0x00, 0x01, 0xb0, 0xfb, 0xff, 0xff, 0x02, 0x00, 0x00, 0x00, 0x7c, 0xfb, 0xff, 0xff,
-      0x64, 0x03, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00,
-      0x04, 0x00, 0x00, 0x00, 0xd0, 0xfb, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00, 0x64, 0xfb, 0xff, 0xff,
-      0xa0, 0xfb, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
-      0x18, 0x00, 0x00, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x3a, 0x53, 0x71, 0x75,
-      0x65, 0x65, 0x7a, 0x65, 0x3a, 0x31, 0x33, 0x00, 0xc8, 0xfb, 0xff, 0xff, 0x48, 0x04, 0x00, 0x00,
+      0x65, 0x61, 0x72, 0x3a, 0x32, 0x31, 0x00, 0x00, 0xec, 0xf5, 0xff, 0xff, 0xf8, 0x09, 0x00, 0x00,
+      0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+      0xcc, 0xf5, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00, 0x14, 0xf6, 0xff, 0xff, 0x10, 0xf6, 0xff, 0xff,
+      0x0c, 0x0a, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00,
+      0x04, 0x00, 0x00, 0x00, 0xfe, 0xf5, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0xf8, 0xf5, 0xff, 0xff,
+      0x02, 0x00, 0x00, 0x00, 0x38, 0xf6, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+      0x01, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x00, 0x00, 0x63, 0x6f, 0x6d, 0x2e,
+      0x6d, 0x69, 0x63, 0x72, 0x6f, 0x73, 0x6f, 0x66, 0x74, 0x3a, 0x4e, 0x68, 0x77, 0x63, 0x46, 0x75,
+      0x73, 0x65, 0x64, 0x43, 0x6f, 0x6e, 0x76, 0x3a, 0x31, 0x00, 0x00, 0x00, 0x70, 0xf6, 0xff, 0xff,
+      0xe4, 0x08, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x34, 0x00, 0x00, 0x00,
+      0x28, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+      0x6a, 0xf6, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0x64, 0xf6, 0xff, 0xff, 0x03, 0x00, 0x00, 0x00,
+      0x6c, 0xf6, 0xff, 0xff, 0x02, 0x00, 0x00, 0x00, 0x74, 0xf6, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00,
+      0xbc, 0xf6, 0xff, 0xff, 0xb8, 0xf6, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+      0x02, 0x00, 0x00, 0x00, 0x38, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00,
+      0x3a, 0x55, 0x6e, 0x73, 0x71, 0x75, 0x65, 0x65, 0x7a, 0x65, 0x3a, 0x32, 0x35, 0x00, 0x00, 0x00,
+      0xe4, 0xf6, 0xff, 0xff, 0x98, 0x08, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+      0x04, 0x00, 0x00, 0x00, 0xc0, 0xf6, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00, 0x00, 0xf7, 0xff, 0xff,
+      0x54, 0x08, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00,
+      0x04, 0x00, 0x00, 0x00, 0xee, 0xf6, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0x28, 0xf7, 0xff, 0xff,
+      0x24, 0xf7, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
+      0x38, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00, 0x3a, 0x55, 0x6e, 0x73,
+      0x71, 0x75, 0x65, 0x65, 0x7a, 0x65, 0x3a, 0x31, 0x33, 0x00, 0x00, 0x00, 0x50, 0xf7, 0xff, 0xff,
+      0x2c, 0x08, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+      0x2c, 0xf7, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00, 0x6c, 0xf7, 0xff, 0xff, 0xe8, 0x07, 0x00, 0x00,
+      0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+      0x5a, 0xf7, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0x94, 0xf7, 0xff, 0xff, 0x90, 0xf7, 0xff, 0xff,
+      0x18, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x4c, 0x00, 0x00, 0x00,
+      0x20, 0x00, 0x00, 0x00, 0x5c, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x3a, 0x51, 0x75, 0x61,
+      0x6e, 0x74, 0x69, 0x7a, 0x65, 0x4c, 0x69, 0x6e, 0x65, 0x61, 0x72, 0x3a, 0x31, 0x30, 0x00, 0x00,
+      0xc4, 0xf7, 0xff, 0xff, 0x58, 0x08, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
+      0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xb2, 0xf7, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01,
+      0xac, 0xf7, 0xff, 0xff, 0x02, 0x00, 0x00, 0x00, 0xec, 0xf7, 0xff, 0xff, 0xf8, 0x07, 0x00, 0x00,
+      0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x08, 0xf8, 0xff, 0xff,
+      0x04, 0xf8, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+      0x10, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x79, 0x5f, 0x73, 0x63, 0x61, 0x6c, 0x65, 0x00,
+      0xec, 0xf7, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00, 0x2c, 0xf8, 0xff, 0xff, 0x18, 0x00, 0x00, 0x00,
+      0x04, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x60, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00,
+      0x3c, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x3a, 0x44, 0x65, 0x71, 0x75, 0x61, 0x6e, 0x74,
+      0x69, 0x7a, 0x65, 0x4c, 0x69, 0x6e, 0x65, 0x61, 0x72, 0x3a, 0x32, 0x35, 0x00, 0x00, 0x00, 0x00,
+      0x64, 0xf8, 0xff, 0xff, 0xb8, 0x07, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+      0x04, 0x00, 0x00, 0x00, 0x40, 0xf8, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00, 0x80, 0xf8, 0xff, 0xff,
+      0x6c, 0x03, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+      0x6a, 0xf8, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0x9c, 0xf8, 0xff, 0xff, 0x48, 0x07, 0x00, 0x00,
       0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
-      0xb6, 0xfb, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0xb0, 0xfb, 0xff, 0xff, 0xec, 0xfb, 0xff, 0xff,
-      0x08, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x61, 0x78, 0x65, 0x73,
-      0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x48, 0xfc, 0xff, 0xff,
-      0x01, 0x00, 0x00, 0x00, 0x14, 0xfc, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+      0x7c, 0xf8, 0xff, 0xff, 0x02, 0x00, 0x00, 0x00, 0xc4, 0xf8, 0xff, 0xff, 0xc0, 0xf8, 0xff, 0xff,
+      0x18, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x4c, 0x00, 0x00, 0x00,
+      0x60, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00, 0x3a, 0x51, 0x75, 0x61,
+      0x6e, 0x74, 0x69, 0x7a, 0x65, 0x4c, 0x69, 0x6e, 0x65, 0x61, 0x72, 0x3a, 0x32, 0x35, 0x00, 0x00,
+      0xf4, 0xf8, 0xff, 0xff, 0xf8, 0x02, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
+      0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xe2, 0xf8, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01,
+      0xdc, 0xf8, 0xff, 0xff, 0x02, 0x00, 0x00, 0x00, 0x1c, 0xf9, 0xff, 0xff, 0xc8, 0x06, 0x00, 0x00,
+      0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x38, 0xf9, 0xff, 0xff,
+      0x34, 0xf9, 0xff, 0xff, 0xe8, 0x06, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+      0x04, 0x00, 0x00, 0x00, 0x10, 0xf9, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00, 0x50, 0xf9, 0xff, 0xff,
+      0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x34, 0x00, 0x00, 0x00,
+      0x14, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x3a, 0x53, 0x71, 0x75, 0x65, 0x65, 0x7a, 0x65,
+      0x3a, 0x32, 0x34, 0x00, 0x78, 0xf9, 0xff, 0xff, 0x04, 0x06, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+      0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x54, 0xf9, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00,
+      0x94, 0xf9, 0xff, 0xff, 0xc0, 0x05, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
+      0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x82, 0xf9, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01,
+      0xbc, 0xf9, 0xff, 0xff, 0xb8, 0xf9, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+      0x01, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x3a, 0x49, 0x64, 0x65,
+      0x6e, 0x74, 0x69, 0x74, 0x79, 0x3a, 0x32, 0x35, 0x00, 0x00, 0x00, 0x00, 0xe0, 0xf9, 0xff, 0xff,
+      0x60, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00,
+      0x04, 0x00, 0x00, 0x00, 0xce, 0xf9, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0x08, 0xfa, 0xff, 0xff,
+      0x04, 0xfa, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+      0x18, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x3a, 0x49, 0x64, 0x65, 0x6e, 0x74, 0x69, 0x74,
+      0x79, 0x3a, 0x31, 0x36, 0x00, 0x00, 0x00, 0x00, 0x2c, 0xfa, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00,
+      0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00,
+      0x01, 0x00, 0x00, 0x00, 0x56, 0x00, 0x00, 0x00, 0x22, 0xfa, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01,
+      0x5c, 0xfa, 0xff, 0xff, 0x58, 0xfa, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
       0x01, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00, 0x3a, 0x54, 0x72, 0x61,
-      0x6e, 0x73, 0x70, 0x6f, 0x73, 0x65, 0x3a, 0x32, 0x34, 0x00, 0x00, 0x00, 0x3c, 0xfc, 0xff, 0xff,
+      0x6e, 0x73, 0x70, 0x6f, 0x73, 0x65, 0x3a, 0x32, 0x31, 0x00, 0x00, 0x00, 0x80, 0xfa, 0xff, 0xff,
+      0xd4, 0x04, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00,
+      0x04, 0x00, 0x00, 0x00, 0x6e, 0xfa, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0xa8, 0xfa, 0xff, 0xff,
+      0xa4, 0xfa, 0xff, 0xff, 0x18, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
+      0x44, 0x00, 0x00, 0x00, 0x64, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00,
+      0x3a, 0x44, 0x65, 0x71, 0x75, 0x61, 0x6e, 0x74, 0x69, 0x7a, 0x65, 0x4c, 0x69, 0x6e, 0x65, 0x61,
+      0x72, 0x3a, 0x32, 0x33, 0x00, 0x00, 0x00, 0x00, 0xdc, 0xfa, 0xff, 0xff, 0x10, 0x01, 0x00, 0x00,
+      0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xc6, 0xfa, 0xff, 0xff,
+      0x00, 0x00, 0x00, 0x01, 0xf8, 0xfa, 0xff, 0xff, 0xec, 0x04, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+      0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xd8, 0xfa, 0xff, 0xff,
+      0x02, 0x00, 0x00, 0x00, 0x20, 0xfb, 0xff, 0xff, 0x1c, 0xfb, 0xff, 0xff, 0x00, 0x05, 0x00, 0x00,
+      0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xf8, 0xfa, 0xff, 0xff,
+      0x01, 0x00, 0x00, 0x00, 0x38, 0xfb, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+      0x02, 0x00, 0x00, 0x00, 0x38, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00,
+      0x3a, 0x55, 0x6e, 0x73, 0x71, 0x75, 0x65, 0x65, 0x7a, 0x65, 0x3a, 0x32, 0x34, 0x00, 0x00, 0x00,
+      0x64, 0xfb, 0xff, 0xff, 0x18, 0x04, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+      0x04, 0x00, 0x00, 0x00, 0x40, 0xfb, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00, 0x80, 0xfb, 0xff, 0xff,
       0xd4, 0x03, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00,
-      0x04, 0x00, 0x00, 0x00, 0x2a, 0xfc, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0x24, 0xfc, 0xff, 0xff,
-      0x60, 0xfc, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
-      0x18, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x3a, 0x49, 0x64, 0x65, 0x6e, 0x74, 0x69, 0x74,
-      0x79, 0x3a, 0x32, 0x34, 0x00, 0x00, 0x00, 0x00, 0x88, 0xfc, 0xff, 0xff, 0x08, 0x00, 0x00, 0x00,
-      0x0c, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x56, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
-      0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x7e, 0xfc, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01,
-      0x78, 0xfc, 0xff, 0xff, 0xb4, 0xfc, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
-      0x02, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00, 0x4c, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00,
-      0x63, 0x6f, 0x6d, 0x2e, 0x6d, 0x69, 0x63, 0x72, 0x6f, 0x73, 0x6f, 0x66, 0x74, 0x3a, 0x51, 0x75,
-      0x61, 0x6e, 0x74, 0x69, 0x7a, 0x65, 0x4c, 0x69, 0x6e, 0x65, 0x61, 0x72, 0x3a, 0x31, 0x00, 0x00,
-      0xf0, 0xfc, 0xff, 0xff, 0xf0, 0x01, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
-      0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x44, 0xfd, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00,
-      0xd8, 0xfc, 0xff, 0xff, 0x14, 0xfd, 0xff, 0xff, 0xa8, 0x01, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
-      0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0xfd, 0xff, 0xff,
-      0x00, 0x00, 0x00, 0x01, 0x70, 0xfd, 0xff, 0xff, 0x02, 0x00, 0x00, 0x00, 0x3c, 0xfd, 0xff, 0xff,
-      0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00,
-      0x0c, 0x00, 0x00, 0x00, 0x3a, 0x54, 0x72, 0x61, 0x6e, 0x73, 0x70, 0x6f, 0x73, 0x65, 0x3a, 0x31,
-      0x00, 0x00, 0x00, 0x00, 0x64, 0xfd, 0xff, 0xff, 0xac, 0x02, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
-      0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x52, 0xfd, 0xff, 0xff,
-      0x00, 0x00, 0x00, 0x01, 0x4c, 0xfd, 0xff, 0xff, 0x88, 0xfd, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00,
-      0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x38, 0x00, 0x00, 0x00,
-      0x0a, 0x00, 0x00, 0x00, 0x3a, 0x47, 0x61, 0x74, 0x68, 0x65, 0x72, 0x3a, 0x31, 0x31, 0x00, 0x00,
-      0xb0, 0xfd, 0xff, 0xff, 0x60, 0x02, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
-      0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x9e, 0xfd, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01,
-      0x98, 0xfd, 0xff, 0xff, 0xd4, 0xfd, 0xff, 0xff, 0x08, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00,
-      0x04, 0x00, 0x00, 0x00, 0x54, 0x69, 0x6e, 0x64, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
-      0x04, 0x00, 0x00, 0x00, 0x30, 0xfe, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00, 0xfc, 0xfd, 0xff, 0xff,
-      0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x4c, 0x00, 0x00, 0x00,
-      0x20, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x3a, 0x44, 0x65, 0x71, 0x75, 0x61, 0x6e, 0x74,
-      0x69, 0x7a, 0x65, 0x4c, 0x69, 0x6e, 0x65, 0x61, 0x72, 0x3a, 0x32, 0x31, 0x00, 0x00, 0x00, 0x00,
-      0x30, 0xfe, 0xff, 0xff, 0x8c, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
-      0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x1e, 0xfe, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01,
-      0x8c, 0xfe, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00, 0x58, 0xfe, 0xff, 0xff, 0x88, 0x00, 0x00, 0x00,
+      0x04, 0x00, 0x00, 0x00, 0x6e, 0xfb, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0xa8, 0xfb, 0xff, 0xff,
+      0xa4, 0xfb, 0xff, 0xff, 0x18, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,
+      0x70, 0x00, 0x00, 0x00, 0x50, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x12, 0x00, 0x00, 0x00,
+      0x3a, 0x51, 0x75, 0x61, 0x6e, 0x74, 0x69, 0x7a, 0x65, 0x4c, 0x69, 0x6e, 0x65, 0x61, 0x72, 0x3a,
+      0x32, 0x33, 0x00, 0x00, 0xd8, 0xfb, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+      0x02, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
+      0x54, 0x33, 0x00, 0x00, 0xce, 0xfb, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0xc8, 0xfb, 0xff, 0xff,
+      0x02, 0x00, 0x00, 0x00, 0x08, 0xfc, 0xff, 0xff, 0x14, 0x04, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+      0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xe4, 0xfb, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00,
+      0x24, 0xfc, 0xff, 0xff, 0xc0, 0x03, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+      0x04, 0x00, 0x00, 0x00, 0x40, 0xfc, 0xff, 0xff, 0x3c, 0xfc, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00,
+      0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x38, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00,
+      0x0d, 0x00, 0x00, 0x00, 0x3a, 0x55, 0x6e, 0x73, 0x71, 0x75, 0x65, 0x65, 0x7a, 0x65, 0x3a, 0x32,
+      0x33, 0x00, 0x00, 0x00, 0x68, 0xfc, 0xff, 0xff, 0x14, 0x03, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+      0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x44, 0xfc, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00,
+      0x84, 0xfc, 0xff, 0xff, 0xd0, 0x02, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
+      0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x72, 0xfc, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01,
+      0xac, 0xfc, 0xff, 0xff, 0xa8, 0xfc, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+      0x02, 0x00, 0x00, 0x00, 0x4c, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00,
+      0x3a, 0x44, 0x65, 0x71, 0x75, 0x61, 0x6e, 0x74, 0x69, 0x7a, 0x65, 0x4c, 0x69, 0x6e, 0x65, 0x61,
+      0x72, 0x3a, 0x32, 0x31, 0x00, 0x00, 0x00, 0x00, 0xdc, 0xfc, 0xff, 0xff, 0x40, 0x03, 0x00, 0x00,
       0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
-      0xac, 0xfe, 0xff, 0xff, 0x02, 0x00, 0x00, 0x00, 0x40, 0xfe, 0xff, 0xff, 0x7c, 0xfe, 0xff, 0xff,
-      0x18, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x4c, 0x00, 0x00, 0x00,
-      0x24, 0x00, 0x00, 0x00, 0x70, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x3a, 0x44, 0x65, 0x71,
-      0x75, 0x61, 0x6e, 0x74, 0x69, 0x7a, 0x65, 0x4c, 0x69, 0x6e, 0x65, 0x61, 0x72, 0x3a, 0x32, 0x34,
-      0x00, 0x00, 0x00, 0x00, 0xb4, 0xfe, 0xff, 0xff, 0x08, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00,
-      0x02, 0x00, 0x00, 0x00, 0x54, 0x32, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
-      0x0c, 0xff, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00, 0xd8, 0xfe, 0xff, 0xff, 0x08, 0x00, 0x00, 0x00,
-      0x0c, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x54, 0x31, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
-      0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x34, 0xff, 0xff, 0xff, 0x02, 0x00, 0x00, 0x00,
-      0xc8, 0xfe, 0xff, 0xff, 0x04, 0xff, 0xff, 0xff, 0x08, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00,
-      0x02, 0x00, 0x00, 0x00, 0x54, 0x33, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
-      0xf6, 0xfe, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0x28, 0xff, 0xff, 0xff, 0x18, 0x00, 0x00, 0x00,
-      0x04, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x74, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00,
-      0x48, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x3a, 0x44, 0x65, 0x71, 0x75, 0x61, 0x6e, 0x74,
-      0x69, 0x7a, 0x65, 0x4c, 0x69, 0x6e, 0x65, 0x61, 0x72, 0x3a, 0x31, 0x30, 0x00, 0x00, 0x00, 0x00,
-      0x60, 0xff, 0xff, 0xff, 0x08, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00,
-      0x78, 0x5f, 0x73, 0x63, 0x61, 0x6c, 0x65, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
-      0xbc, 0xff, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00, 0x88, 0xff, 0xff, 0xff, 0x08, 0x00, 0x00, 0x00,
-      0x0c, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x79, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
-      0x04, 0x00, 0x00, 0x00, 0x7a, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0xac, 0xff, 0xff, 0xff,
-      0x64, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00,
-      0x0c, 0x00, 0x00, 0x00, 0x08, 0x00, 0x08, 0x00, 0x00, 0x00, 0x04, 0x00, 0x08, 0x00, 0x00, 0x00,
-      0x02, 0x00, 0x00, 0x00, 0x9c, 0xff, 0xff, 0xff, 0xd8, 0xff, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00,
-      0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00,
-      0x3a, 0x55, 0x6e, 0x73, 0x71, 0x75, 0x65, 0x65, 0x7a, 0x65, 0x3a, 0x31, 0x31, 0x00, 0x00, 0x00,
-      0x08, 0x00, 0x0c, 0x00, 0x04, 0x00, 0x08, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,
-      0x0c, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
-      0x1c, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, 0x00, 0x08, 0x00, 0x07, 0x00,
-      0x06, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x04, 0x00, 0x04, 0x00, 0x04, 0x00, 0x00, 0x00,
+      0xca, 0xfc, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0xc4, 0xfc, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00,
+      0x04, 0xfd, 0xff, 0xff, 0xe0, 0x02, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
+      0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xe4, 0xfc, 0xff, 0xff, 0x02, 0x00, 0x00, 0x00,
+      0x2c, 0xfd, 0xff, 0xff, 0x28, 0xfd, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+      0x02, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00,
+      0x3a, 0x47, 0x61, 0x74, 0x68, 0x65, 0x72, 0x3a, 0x31, 0x00, 0x00, 0x00, 0x50, 0xfd, 0xff, 0xff,
+      0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00,
+      0x04, 0x00, 0x00, 0x00, 0x54, 0x69, 0x6e, 0x64, 0x00, 0x00, 0x00, 0x00, 0x38, 0xfd, 0xff, 0xff,
+      0x01, 0x00, 0x00, 0x00, 0x78, 0xfd, 0xff, 0xff, 0xdc, 0x01, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+      0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x66, 0xfd, 0xff, 0xff,
+      0x00, 0x00, 0x00, 0x01, 0xa0, 0xfd, 0xff, 0xff, 0x9c, 0xfd, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00,
+      0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00,
+      0x3a, 0x54, 0x72, 0x61, 0x6e, 0x73, 0x70, 0x6f, 0x73, 0x65, 0x3a, 0x32, 0x34, 0x00, 0x00, 0x00,
+      0xc4, 0xfd, 0xff, 0xff, 0x90, 0x01, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
+      0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xb2, 0xfd, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01,
+      0xec, 0xfd, 0xff, 0xff, 0xe8, 0xfd, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+      0x02, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x38, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00,
+      0x3a, 0x53, 0x71, 0x75, 0x65, 0x65, 0x7a, 0x65, 0x3a, 0x31, 0x33, 0x00, 0x10, 0xfe, 0xff, 0xff,
+      0x44, 0x01, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00,
+      0x04, 0x00, 0x00, 0x00, 0xfe, 0xfd, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0x38, 0xfe, 0xff, 0xff,
+      0x34, 0xfe, 0xff, 0xff, 0x48, 0x01, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
+      0x04, 0x00, 0x00, 0x00, 0x10, 0xfe, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00, 0x50, 0xfe, 0xff, 0xff,
+      0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00,
+      0x0b, 0x00, 0x00, 0x00, 0x3a, 0x49, 0x64, 0x65, 0x6e, 0x74, 0x69, 0x74, 0x79, 0x3a, 0x31, 0x00,
+      0x74, 0xfe, 0xff, 0xff, 0xe0, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
+      0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x62, 0xfe, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01,
+      0x9c, 0xfe, 0xff, 0xff, 0x98, 0xfe, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+      0x02, 0x00, 0x00, 0x00, 0x4c, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00,
+      0x3a, 0x44, 0x65, 0x71, 0x75, 0x61, 0x6e, 0x74, 0x69, 0x7a, 0x65, 0x4c, 0x69, 0x6e, 0x65, 0x61,
+      0x72, 0x3a, 0x31, 0x39, 0x00, 0x00, 0x00, 0x00, 0xcc, 0xfe, 0xff, 0xff, 0x50, 0x01, 0x00, 0x00,
+      0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+      0xba, 0xfe, 0xff, 0xff, 0x00, 0x00, 0x00, 0x01, 0xb4, 0xfe, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00,
+      0xf4, 0xfe, 0xff, 0xff, 0xf0, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
+      0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xd4, 0xfe, 0xff, 0xff, 0x02, 0x00, 0x00, 0x00,
+      0x1c, 0xff, 0xff, 0xff, 0x18, 0xff, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+      0x02, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00,
+      0x3a, 0x53, 0x71, 0x75, 0x65, 0x65, 0x7a, 0x65, 0x3a, 0x32, 0x33, 0x00, 0x40, 0xff, 0xff, 0xff,
+      0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00,
+      0x0c, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x00, 0x36, 0xff, 0xff, 0xff,
+      0x00, 0x00, 0x00, 0x01, 0x70, 0xff, 0xff, 0xff, 0x6c, 0xff, 0xff, 0xff, 0x10, 0x00, 0x00, 0x00,
+      0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+      0x61, 0x78, 0x65, 0x73, 0x00, 0x00, 0x00, 0x00, 0x54, 0xff, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00,
+      0x94, 0xff, 0xff, 0xff, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
+      0x2c, 0x00, 0x00, 0x00, 0x60, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00, 0x63, 0x6f, 0x6d, 0x2e,
+      0x6d, 0x69, 0x63, 0x72, 0x6f, 0x73, 0x6f, 0x66, 0x74, 0x3a, 0x51, 0x75, 0x61, 0x6e, 0x74, 0x69,
+      0x7a, 0x65, 0x4c, 0x69, 0x6e, 0x65, 0x61, 0x72, 0x3a, 0x31, 0x00, 0x00, 0xd0, 0xff, 0xff, 0xff,
+      0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00,
+      0x0c, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x54, 0x31, 0x00, 0x00, 0xb8, 0xff, 0xff, 0xff,
+      0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x04, 0x00, 0x04, 0x00, 0x00, 0x00, 0x08, 0x00, 0x0c, 0x00,
+      0x04, 0x00, 0x08, 0x00, 0x08, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,
+      0x02, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,
+      0x54, 0x32, 0x00, 0x00, 0x00, 0x00, 0x06, 0x00, 0x08, 0x00, 0x07, 0x00, 0x06, 0x00, 0x00, 0x00,
+      0x00, 0x00, 0x00, 0x01, 0x08, 0x00, 0x08, 0x00, 0x00, 0x00, 0x04, 0x00, 0x08, 0x00, 0x00, 0x00,
+      0x02, 0x00, 0x00, 0x00,
   };
   // clang-format on
 
diff --git a/onnxruntime/core/framework/kernel_type_str_resolver_utils.h b/onnxruntime/core/framework/kernel_type_str_resolver_utils.h
index 5daab7c1159be..488970890a1a0 100644
--- a/onnxruntime/core/framework/kernel_type_str_resolver_utils.h
+++ b/onnxruntime/core/framework/kernel_type_str_resolver_utils.h
@@ -8,8 +8,6 @@
 #include <gsl/gsl>
 #include "core/common/status.h"
 #include "core/framework/kernel_type_str_resolver.h"
-#include "core/graph/op_identifier.h"
-
 namespace flatbuffers {
 class DetachedBuffer;
 }
@@ -18,11 +16,6 @@ namespace onnxruntime::kernel_type_str_resolver_utils {
 
 #if !defined(ORT_MINIMAL_BUILD)
 
-/**
- * Gets the ops that the layout transformation may potentially add.
- */
-gsl::span<const OpIdentifierWithStringViews> GetLayoutTransformationRequiredOpIdentifiers();
-
 /**
  * Saves `kernel_type_str_resolver` to a byte buffer owned by `buffer` and referenced by `buffer_span`.
  */
diff --git a/onnxruntime/core/framework/op_kernel.cc b/onnxruntime/core/framework/op_kernel.cc
index 212ce9c5069ea..287127a2a2ec9 100644
--- a/onnxruntime/core/framework/op_kernel.cc
+++ b/onnxruntime/core/framework/op_kernel.cc
@@ -80,7 +80,7 @@ OrtValue* OpKernelContext::OutputMLValue(int index, const TensorShape& shape) {
 
   OrtValue* p_ml_value = nullptr;
   Status status = execution_frame_->GetOrCreateNodeOutputMLValue(index, GetOutputArgIndex(index), &shape, p_ml_value, kernel_->Node());
-  ORT_ENFORCE(status.IsOK(), status.ErrorMessage());
+  ORT_THROW_IF_ERROR(status);
   return p_ml_value;
 }
 
@@ -126,7 +126,7 @@ OrtValue* OpKernelContext::GetOrCreateOutputMLValue(int index) {
   auto output_arg_index = GetOutputArgIndex(index);
   OrtValue* value = nullptr;
   auto status = execution_frame_->GetOrCreateNodeOutputMLValue(index, output_arg_index, nullptr, value, kernel_->Node());
-  ORT_ENFORCE(status.IsOK(), status.ErrorMessage());
+  ORT_THROW_IF_ERROR(status);
   return value;
 }
 
diff --git a/onnxruntime/core/framework/sequential_executor.cc b/onnxruntime/core/framework/sequential_executor.cc
index 2103183f8f452..c95d2d0d5ab8e 100644
--- a/onnxruntime/core/framework/sequential_executor.cc
+++ b/onnxruntime/core/framework/sequential_executor.cc
@@ -594,6 +594,11 @@ onnxruntime::Status ExecuteKernel(StreamExecutionContext& ctx,
 #endif
 #endif
     }
+    ORT_CATCH(const OnnxRuntimeException& ort_ex) {
+      ORT_HANDLE_EXCEPTION([&]() {
+        status = Status(ort_ex.Category(), ort_ex.Code(), ort_ex.what());
+      });
+    }
     ORT_CATCH(const std::exception& ex) {
       ORT_HANDLE_EXCEPTION([&]() {
         status = ORT_MAKE_STATUS(ONNXRUNTIME, RUNTIME_EXCEPTION, ex.what());
diff --git a/onnxruntime/core/framework/tensorprotoutils.cc b/onnxruntime/core/framework/tensorprotoutils.cc
index ed130686f302a..6f73456742160 100644
--- a/onnxruntime/core/framework/tensorprotoutils.cc
+++ b/onnxruntime/core/framework/tensorprotoutils.cc
@@ -380,11 +380,11 @@ Status TensorProtoWithExternalDataToTensorProto(
   return Status::OK();
 }
 
-// Wraps std::filesystem::weakly_canonical with error_code handling.
+// Wraps Env::GetWeaklyCanonicalPath for std::filesystem::path.
 static Status WeaklyCanonicalPath(const std::filesystem::path& path, std::filesystem::path& result) {
-  std::error_code ec;
-  result = std::filesystem::weakly_canonical(path, ec);
-  ORT_RETURN_IF(ec, "Failed to get the weakly canonical path: ", path, " - ", ec.message());
+  PathString canonical_str;
+  ORT_RETURN_IF_ERROR(Env::Default().GetWeaklyCanonicalPath(path.native(), canonical_str));
+  result = std::filesystem::path(std::move(canonical_str));
   return Status::OK();
 }
 
diff --git a/onnxruntime/core/graph/contrib_ops/nhwc_schema_defs.cc b/onnxruntime/core/graph/contrib_ops/nhwc_schema_defs.cc
index 8fe3a4d5f3b6f..cb015a3a3c500 100644
--- a/onnxruntime/core/graph/contrib_ops/nhwc_schema_defs.cc
+++ b/onnxruntime/core/graph/contrib_ops/nhwc_schema_defs.cc
@@ -388,7 +388,6 @@ ONNX_MS_OPERATOR_SET_SCHEMA(NhwcFusedConv, 1,
                             OpSchema()
                                 .SetDoc(R"DOC(
 NhwcFusedConv is a Conv operator with optional activation and add operators fused in.
-Only has fp16 implementation as of 2023/04/15.
 )DOC")
                                 .Attr("auto_pad", "", AttributeProto::STRING, std::string("NOTSET"))
                                 .Attr("kernel_shape", "", AttributeProto::INTS, OPTIONAL_VALUE)
@@ -398,12 +397,27 @@ Only has fp16 implementation as of 2023/04/15.
                                 .Attr("group", "", AttributeProto::INT, static_cast<int64_t>(1))
                                 .Attr("activation", "", AttributeProto::STRING, OPTIONAL_VALUE)
                                 .Attr("activation_params", "", AttributeProto::FLOATS, OPTIONAL_VALUE)
-                                .Input(0, "X", "", "T")
-                                .Input(1, "W", "", "T")
-                                .Input(2, "B", "", "T", OpSchema::Optional)
-                                .Input(3, "Z", "Tensor to be added to the output, must be the same shape and format as the output tensor.", "T", OpSchema::Optional)
-                                .Output(0, "Y", "", "T")
-                                .TypeConstraint("T", {"tensor(float16)"}, "Constrain input and output types to float tensors")
+                                .Input(0, "X",
+                                       "Input activation tensor in channels-last layout. For 2D convolution this is "
+                                       "[N, H, W, C], where N is batch size, H/W are spatial dimensions, and C is "
+                                       "the number of input channels.",
+                                       "T")
+                                .Input(1, "W",
+                                       "Convolution weight tensor in the standard ONNX Conv filter layout "
+                                       "[M, C/group, kH, kW], where M is the number of output channels.",
+                                       "T")
+                                .Input(2, "B",
+                                       "Optional 1D bias tensor of shape [M].",
+                                       "T", OpSchema::Optional)
+                                .Input(3, "Z",
+                                       "Optional residual/add tensor in the same channels-last layout and shape as "
+                                       "the output tensor Y. For 2D convolution this is [N, out_H, out_W, M].",
+                                       "T", OpSchema::Optional)
+                                .Output(0, "Y",
+                                        "Output tensor in channels-last layout. For 2D convolution this is "
+                                        "[N, out_H, out_W, M], where M is the number of output channels.",
+                                        "T")
+                                .TypeConstraint("T", {"tensor(float16)", "tensor(float)"}, "Constrain input and output types to float tensors")
                                 .TypeAndShapeInferenceFunction([](InferenceContext& ctx) {
                                   ONNX_NAMESPACE::propagateElemTypeFromInputToOutput(ctx, 0, 0);
                                   convPoolShapeInferenceNhwc(ctx, true, false, 0, 1);
diff --git a/onnxruntime/core/mlas/inc/mlas.h b/onnxruntime/core/mlas/inc/mlas.h
index 218aaef0c8f4b..ddb9daa5e244b 100644
--- a/onnxruntime/core/mlas/inc/mlas.h
+++ b/onnxruntime/core/mlas/inc/mlas.h
@@ -892,6 +892,7 @@ struct MLAS_CONV_PARAMETERS {
     size_t BatchCount;
     size_t GroupCount;
     size_t InputChannels;
+    bool ChannelsLast;
     size_t InputShape[3];
     size_t KernelShape[3];
     size_t DilationShape[3];
@@ -906,6 +907,9 @@ struct MLAS_CONV_PARAMETERS {
     MLAS_CONV_ALGORITHM Algorithm;
     ptrdiff_t ThreadCount;
     const MLAS_BACKEND_KERNEL_SELECTOR_CONFIG* BackendKernelSelectorConfig = nullptr;
+    const void* PackedFilter = nullptr;
+    size_t PackedFilterGroupStride = 0;
+    bool FilterIsPacked = false;
     union {
         struct {
             CBLAS_TRANSPOSE TransB;
@@ -932,9 +936,24 @@ MlasConvPrepare(MLAS_CONV_PARAMETERS* Parameters,
                 size_t FilterCount,
                 const MLAS_ACTIVATION* Activation,
                 size_t* WorkingBufferSize,
+                bool ChannelsLast,
                 float Beta,
                 MLAS_THREADPOOL* ThreadPool);
 
+bool
+MLASCALL
+MlasConvSupportsSymmetricChannelsLast2DFloatKernel(
+    size_t Dimensions,
+    size_t BatchCount,
+    size_t GroupCount,
+    const size_t* InputShape,
+    const size_t* KernelShape,
+    const size_t* DilationShape,
+    const size_t* Padding,
+    const size_t* StrideShape,
+    size_t FilterCount,
+    float Beta);
+
 void
 MLASCALL
 MlasConv(
diff --git a/onnxruntime/core/mlas/lib/convolve.cpp b/onnxruntime/core/mlas/lib/convolve.cpp
index ba2d98f46e960..696314f20267e 100644
--- a/onnxruntime/core/mlas/lib/convolve.cpp
+++ b/onnxruntime/core/mlas/lib/convolve.cpp
@@ -1324,6 +1324,85 @@ Return Value:
 // Chance of arithmetic overflow could be reduced
 #pragma warning(disable : 26451)
 #endif
+
+namespace {
+
+#if defined(USE_KLEIDIAI) && defined(MLAS_TARGET_ARM64)
+static constexpr size_t ComputeChannelsLastDilatedKernelSize(size_t dilation, size_t kernel) {
+    return (dilation * kernel) - (dilation - 1);
+}
+
+static constexpr size_t ComputeChannelsLastConvOutSize(size_t input, size_t kernel, size_t padding, size_t stride) {
+    if (stride > 0 && (input + 2 * padding) >= kernel) {
+        return (((input - kernel) + (2 * padding)) / stride) + 1;
+    }
+
+    return 0;
+}
+#endif
+
+}  // namespace
+
+bool
+MLASCALL
+MlasConvSupportsSymmetricChannelsLast2DFloatKernel(
+    size_t Dimensions,
+    size_t BatchCount,
+    size_t GroupCount,
+    const size_t* InputShape,
+    const size_t* KernelShape,
+    const size_t* DilationShape,
+    const size_t* Padding,
+    const size_t* StrideShape,
+    size_t FilterCount,
+    float Beta)
+{
+#if !defined(USE_KLEIDIAI) || !defined(MLAS_TARGET_ARM64)
+    MLAS_UNREFERENCED_PARAMETER(Dimensions);
+    MLAS_UNREFERENCED_PARAMETER(BatchCount);
+    MLAS_UNREFERENCED_PARAMETER(GroupCount);
+    MLAS_UNREFERENCED_PARAMETER(InputShape);
+    MLAS_UNREFERENCED_PARAMETER(KernelShape);
+    MLAS_UNREFERENCED_PARAMETER(DilationShape);
+    MLAS_UNREFERENCED_PARAMETER(Padding);
+    MLAS_UNREFERENCED_PARAMETER(StrideShape);
+    MLAS_UNREFERENCED_PARAMETER(FilterCount);
+    MLAS_UNREFERENCED_PARAMETER(Beta);
+    return false;
+#else
+    // Channels-last float convolution is only implemented by the KleidiAI
+    // override. The generic MLAS convolution path assumes NCHW layout.
+    if (GetMlasPlatform().MlasConvPrepareOverride == nullptr ||
+        GetMlasPlatform().MlasConvOverride == nullptr) {
+        return false;
+    }
+
+    if (Dimensions != 2 || BatchCount != 1 || GroupCount != 1 || Beta != 0.0f) {
+        return false;
+    }
+
+    if (Padding[0] != Padding[2] || Padding[1] != Padding[3]) {
+        return false;
+    }
+
+    const size_t output_h =
+        ComputeChannelsLastConvOutSize(InputShape[0], ComputeChannelsLastDilatedKernelSize(DilationShape[0], KernelShape[0]),
+                                       Padding[0], StrideShape[0]);
+    const size_t output_w =
+        ComputeChannelsLastConvOutSize(InputShape[1], ComputeChannelsLastDilatedKernelSize(DilationShape[1], KernelShape[1]),
+                                       Padding[1], StrideShape[1]);
+    if (output_h == 0 || output_w == 0) {
+        return false;
+    }
+
+    if (FilterCount <= 1 || KernelShape[0] < 3 || KernelShape[1] < 3) {
+        return false;
+    }
+
+    return true;
+#endif
+}
+
 void
 MLASCALL
 MlasConvPrepare(
@@ -1341,6 +1420,7 @@ MlasConvPrepare(
     size_t FilterCount,
     const MLAS_ACTIVATION* Activation,
     size_t* WorkingBufferSize,
+    bool ChannelsLast,
     float Beta,
     MLAS_THREADPOOL* ThreadPool
     )
@@ -1399,7 +1479,7 @@ Return Value:
     if (GetMlasPlatform().MlasConvPrepareOverride != nullptr &&
         GetMlasPlatform().MlasConvPrepareOverride(Parameters, Dimensions, BatchCount, GroupCount, InputChannels,
         InputShape,KernelShape,DilationShape, Padding, StrideShape, OutputShape, FilterCount,
-        Activation, WorkingBufferSize, Beta, ThreadPool)){
+        Activation, WorkingBufferSize, ChannelsLast, Beta, ThreadPool)){
         return;
     }
     //
@@ -1410,6 +1490,7 @@ Return Value:
     Parameters->BatchCount = BatchCount;
     Parameters->GroupCount = GroupCount;
     Parameters->InputChannels = InputChannels;
+    Parameters->ChannelsLast = ChannelsLast;
     Parameters->FilterCount = FilterCount;
     Parameters->Beta = Beta;
 
diff --git a/onnxruntime/core/mlas/lib/kleidiai/convolve_kleidiai.cpp b/onnxruntime/core/mlas/lib/kleidiai/convolve_kleidiai.cpp
index a37dcb37a17ac..cca4f5a19c417 100644
--- a/onnxruntime/core/mlas/lib/kleidiai/convolve_kleidiai.cpp
+++ b/onnxruntime/core/mlas/lib/kleidiai/convolve_kleidiai.cpp
@@ -24,20 +24,6 @@
 const KaiF32IMatmulKernel& imatmul_conv = GetKleidiAIF32IMatmulUKernel();
 
 
-// Right-hand-side (weights) cache key
-struct RhsCacheKey {
-    size_t co, ci, kh, kw, dilationh, dilationw;
-    size_t weights_hash;
-
-    bool operator==(const RhsCacheKey& other) const {
-        return co == other.co && ci == other.ci &&
-               kh == other.kh && kw == other.kw &&
-               dilationh == other.dilationh && dilationw == other.dilationw &&
-               weights_hash == other.weights_hash;
-    }
-};
-
-
 // Left-hand-side (input indirection) cache key
 struct LhsCacheKey {
     size_t ci, ih, iw;
@@ -53,90 +39,29 @@ struct LhsCacheKey {
     }
 };
 
-// Derived from 2^32 * (sqrt(5) - 1) / 2 ≈ 0.6180339887 (reciprocal of the golden ratio)
-// Based on Knuth's multiplicative hashing method
-constexpr size_t HASH_GOLDEN_RATIO_CONST = 0x9e3779b9;
-
-size_t HashWeights(const float* data, size_t count = 16) {
-    size_t h = 0;
-    for (size_t i = 0; i < count; ++i) {
-        h ^= std::hash<float>()(data[i]) + HASH_GOLDEN_RATIO_CONST + (h << 6) + (h >> 2);
-    }
-    return h;
-}
-
 namespace std {
     // Specialize hash type for cache keys and do it within namespace std.
     // Doing this allows standard containers like std::unordered_map to find
     // the appropriate hash function via template specialization, as ADL
     // (argument-dependent lookup) does not apply to std::hash.
-    template<>
-    struct hash<RhsCacheKey> {
-        size_t operator()(const RhsCacheKey& k) const {
-            return k.weights_hash ^
-                (std::hash<size_t>()(k.co) << 1) ^
-                (std::hash<size_t>()(k.ci) << 2) ^
-                (std::hash<size_t>()(k.kh) << 3) ^
-                (std::hash<size_t>()(k.kw) << 4) ^
-                (std::hash<size_t>()(k.dilationh) << 5) ^
-                (std::hash<size_t>()(k.dilationw) << 6);
-        }
-    };
-
     template<>
     struct hash<LhsCacheKey> {
         size_t operator()(const LhsCacheKey& k) const {
-            return (std::hash<size_t>()(k.ci) << 1) ^
-                (std::hash<size_t>()(k.ih) << 2) ^
-                (std::hash<size_t>()(k.iw) << 3) ^
-                (std::hash<size_t>()(k.padding) << 4) ^
-                (std::hash<size_t>()(k.sh) << 5) ^
-                (std::hash<size_t>()(k.sw) << 6) ^
-                (std::hash<size_t>()(k.kh) << 7) ^
-                (std::hash<size_t>()(k.kw) << 8) ^
-                (std::hash<size_t>()(k.dilationh) << 9) ^
-                (std::hash<size_t>()(k.dilationw) << 10);
+            return std::hash<size_t>()(k.ci) ^
+                (std::hash<size_t>()(k.ih) << 1) ^
+                (std::hash<size_t>()(k.iw) << 2) ^
+                (std::hash<size_t>()(k.padding) << 3) ^
+                (std::hash<size_t>()(k.sh) << 4) ^
+                (std::hash<size_t>()(k.sw) << 5) ^
+                (std::hash<size_t>()(k.kh) << 6) ^
+                (std::hash<size_t>()(k.kw) << 7) ^
+                (std::hash<size_t>()(k.dilationh) << 8) ^
+                (std::hash<size_t>()(k.dilationw) << 9);
         }
     };
 
 }
 
-namespace {
-
-using LhsPtrsCache = std::unordered_map<LhsCacheKey, std::shared_ptr<const void*[]>>;
-
-thread_local std::unordered_map<const float*, LhsPtrsCache> lhs_ptrs_cache_by_pad;
-thread_local const float* last_pad_ptr = nullptr;
-
-size_t LhsPtrsCacheEntryCount() {
-    size_t count = 0;
-    for (const auto& cache_group : lhs_ptrs_cache_by_pad) {
-        count += cache_group.second.size();
-    }
-    return count;
-}
-
-void ClearLhsPtrsCache() {
-    lhs_ptrs_cache_by_pad.clear();
-    last_pad_ptr = nullptr;
-}
-
-}  // namespace
-
-#if defined(MLAS_ENABLE_TEST_HOOKS)
-size_t
-MLASCALL
-ArmKleidiAI::MlasConvLhsCacheEntryCountForTest() {
-    return LhsPtrsCacheEntryCount();
-}
-
-void
-MLASCALL
-ArmKleidiAI::MlasConvClearLhsCacheForTest() {
-    ClearLhsPtrsCache();
-}
-#endif
-
 
 static constexpr size_t ComputeKernelSize(const size_t D, const size_t K) {
     // D - dilation size
@@ -184,29 +109,21 @@ static size_t ComputeMlasWorkingBufferSize(const size_t co,
 }
 
 static bool CheckCapabilitiesSme(const MLAS_CONV_PARAMETERS* Parameters) {
-
-    //functional checks - logically can the conv be performed
-    if ((Parameters->Dimensions != 2) ||
-        (Parameters->BatchCount != 1) ||
-        (Parameters->Beta != 0.f) ||
-        (Parameters->Padding[0] != Parameters->Padding[1]) ||
-        (Parameters->Padding[0] != Parameters->Padding[2]) ||
-        (Parameters->Padding[0] != Parameters->Padding[3]) ||
-        (ComputeConvOutSize(Parameters->InputShape[0],
-                            ComputeKernelSize(Parameters->DilationShape[0],Parameters->KernelShape[0]),
-                            Parameters->Padding[0], Parameters->StrideShape[0]) *
-         ComputeConvOutSize(Parameters->InputShape[1],
-                            ComputeKernelSize(Parameters->DilationShape[1],Parameters->KernelShape[1]),
-                            Parameters->Padding[1], Parameters->StrideShape[1]) == 0)) {
-        KLEIDIAI_DEBUG_LOG("CheckCapabilitiesSme returning false on functional checks.");
+    if (!MlasConvSupportsSymmetricChannelsLast2DFloatKernel(
+            Parameters->Dimensions,
+            Parameters->BatchCount,
+            Parameters->GroupCount,
+            Parameters->InputShape,
+            Parameters->KernelShape,
+            Parameters->DilationShape,
+            Parameters->Padding,
+            Parameters->StrideShape,
+            Parameters->FilterCount,
+            Parameters->Beta)) {
+        KLEIDIAI_DEBUG_LOG("CheckCapabilitiesSme returning false on shared capability checks.");
         return false;
     }
 
-    auto N = Parameters->FilterCount;
-    if (N == 1 || Parameters->KernelShape[0] < 3 || Parameters->KernelShape[1] < 3) {
-        KLEIDIAI_DEBUG_LOG("CheckCapabilitiesSme returning false on optimization checks.");
-        return false;
-    }
     return true;
 }
 
@@ -365,52 +282,65 @@ static void MultiThreadedLHSPackSme(MLAS_THREADPOOL* ThreadPool, const size_t ci
     });
 }
 
-static std::shared_ptr<std::byte[]> RhsPackWeightsBiasSme(const size_t co, const size_t ci,
-                                                          const size_t kh, const size_t kw,
-                                                          const size_t dilationh, const size_t dilationw,
-                                                          const float* weights, const float* bias,
-                                                          MLAS_THREADPOOL* ThreadPool)
-{
-    // Cache of prepacked kai rhs weights and biases. thread_local to prevent interference from parallel sessions.
-    thread_local std::unordered_map<RhsCacheKey, std::shared_ptr<std::byte[]>> rhs_cache;
+size_t
+MLASCALL
+ArmKleidiAI::MlasConvSymmetricChannelsLast2DFloatPackWSize(
+    size_t FilterCount,
+    size_t InputChannels,
+    const int64_t* KernelShape,
+    const int64_t* DilationShape) {
+    const auto d_kh = ComputeKernelSize(static_cast<size_t>(DilationShape[0]), static_cast<size_t>(KernelShape[0]));
+    const auto d_kw = ComputeKernelSize(static_cast<size_t>(DilationShape[1]), static_cast<size_t>(KernelShape[1]));
+    return kai_get_rhs_packed_size_rhs_imatmul_pack_kxn_x32p2vlx1b_x32_x32_sme(FilterCount, d_kh * d_kw,
+                                                                                InputChannels);
+}
+
+void
+MLASCALL
+ArmKleidiAI::MlasConvSymmetricChannelsLast2DFloatPackW(
+    size_t FilterCount,
+    size_t InputChannels,
+    const int64_t* KernelShape,
+    const int64_t* DilationShape,
+    size_t GroupCount,
+    const float* Filter,
+    const float* Bias,
+    void* PackedFilter,
+    size_t PackedFilterGroupStride,
+    MLAS_THREADPOOL* ThreadPool) {
+    const size_t kh = static_cast<size_t>(KernelShape[0]);
+    const size_t kw = static_cast<size_t>(KernelShape[1]);
+    const size_t dilationh = static_cast<size_t>(DilationShape[0]);
+    const size_t dilationw = static_cast<size_t>(DilationShape[1]);
+    const auto d_kh = ComputeKernelSize(dilationh, kh);
+    const auto d_kw = ComputeKernelSize(dilationw, kw);
 
-    RhsCacheKey key = { co, ci, kh, kw, dilationh, dilationw, HashWeights(weights) };
+    for (size_t group_idx = 0; group_idx < GroupCount; ++group_idx) {
+        const float* weights = Filter + group_idx * FilterCount * InputChannels * kh * kw;
+        const float* bias = Bias ? Bias + group_idx * FilterCount : nullptr;
+        auto* packed_group = reinterpret_cast<std::byte*>(PackedFilter) + group_idx * PackedFilterGroupStride;
 
-    auto found = rhs_cache.find(key);
-    if (found != rhs_cache.end()) {
-        return found->second;
-    } else {
         // prepare mlas filter weights for kai rhs packing
         // dilated nhwc format
-        auto nhwc = NChwToNhwc(co, ci, kh, kw, weights, dilationh, dilationw, true, ThreadPool);
-
-
-        //dilation, axis swap (n x k -> k x n) where n == co, k == d_kh x d_kw x ci
-        const auto d_kh = ComputeKernelSize(dilationh,kh);
-        const auto d_kw = ComputeKernelSize(dilationw,kw);
+        auto nhwc = NChwToNhwc(FilterCount, InputChannels, kh, kw, weights, dilationh, dilationw, true, ThreadPool);
 
         //t_weights[d_kh][d_kw][ci][co] = nhwc[co][d_kh][d_kw][ci]
-        auto t_weights = Transpose4D({co,d_kh,d_kw,ci},&nhwc[0],{1,2,3,0});
-
-        const auto packed_size = kai_get_rhs_packed_size_rhs_imatmul_pack_kxn_x32p2vlx1b_x32_x32_sme(co,d_kh*d_kw,ci);
-        auto packed = std::shared_ptr<std::byte[]>(new std::byte[packed_size], std::default_delete<std::byte[]>());
-
-        rhs_cache[key] = packed;
+        auto t_weights = Transpose4D({FilterCount, d_kh, d_kw, InputChannels}, &nhwc[0], {1, 2, 3, 0});
 
         std::vector<float> bias_copy;
         if (bias) {
-            bias_copy.assign(bias, bias + co);
+            bias_copy.assign(bias, bias + FilterCount);
         } else {
-            bias_copy.resize(co, 0.0f);
+            bias_copy.resize(FilterCount, 0.0f);
         }
 
         KLEIDIAI_KERNEL_LOG("kai_run_rhs_imatmul_pack_kxn_x32p2vlx1b_x32_x32_sme"
-                            << " N=" << co << " k_chunk_count=" << (d_kh*d_kw) << " k_chunk_length=" << ci << " rhs_stride_row=" << (co * sizeof(float)));
+                            << " N=" << FilterCount << " k_chunk_count=" << (d_kh * d_kw)
+                            << " k_chunk_length=" << InputChannels
+                            << " rhs_stride_row=" << (FilterCount * sizeof(float)));
         kai_run_rhs_imatmul_pack_kxn_x32p2vlx1b_x32_x32_sme(
-            co, d_kh*d_kw, ci, co * sizeof(float), &t_weights[0], bias_copy.data(), packed.get()
-        );
-
-        return packed;
+            FilterCount, d_kh * d_kw, InputChannels, FilterCount * sizeof(float), &t_weights[0], bias_copy.data(),
+            packed_group);
     }
 }
 
@@ -491,6 +421,7 @@ static std::shared_ptr<const void*[]> LhsPtrFill(const size_t ci, const size_t i
 static std::unique_ptr<std::byte[]> LhsPackImageDataSme(const size_t ci, const size_t ih, const size_t iw,
                                                         const size_t kh, const size_t kw, const size_t sh,
                                                         const size_t sw, const size_t padding, const float* in,
+                                                        bool input_is_channels_last,
                                                         MLAS_THREADPOOL* ThreadPool)
 {
     size_t padsize = 256;
@@ -517,21 +448,31 @@ static std::unique_ptr<std::byte[]> LhsPackImageDataSme(const size_t ci, const s
     const auto lhs_size = kai_get_lhs_packed_size_lhs_imatmul_pack_x32p2vlx1_x32p_sme(m,kh*kw,ci);
     auto lhs = std::make_unique<std::byte[]>(lhs_size);
 
-    auto nhwc = NChwToNhwc(1, ci, ih, iw, in, 1, 1, false, ThreadPool);
+    std::unique_ptr<float[]> nhwc_holder;
+    const float* activation_src = nullptr;
+    if (input_is_channels_last) {
+        activation_src = in;
+    } else {
+        nhwc_holder = NChwToNhwc(1, ci, ih, iw, in, 1, 1, false, ThreadPool);
+        activation_src = nhwc_holder.get();
+    }
 
     // Cache of computed lhs ptr offsets. thread_local to prevent interference from parallel sessions.
     //
     // Entries include pointers to the pad buffer for out-of-bounds pixels, so we must not reuse entries after the
     // pad buffer is reallocated. To avoid clearing the entire cache, we group caches by pad buffer identity and
     // invalidate only the old group when the pad buffer moves.
+    using LhsPtrsCache = std::unordered_map<LhsCacheKey, std::shared_ptr<const void*[]>>;
+    thread_local std::unordered_map<const float*, LhsPtrsCache> lhs_ptrs_cache_by_pad;
+
     // If pad_ptr moved (vector reallocation), drop only the old group to avoid accumulating unreachable entries.
+    thread_local const float* last_pad_ptr = nullptr;
     const float* cur_pad_ptr = pad_ptr.data();
     if (last_pad_ptr != nullptr && last_pad_ptr != cur_pad_ptr) {
         lhs_ptrs_cache_by_pad.erase(last_pad_ptr);
     }
     last_pad_ptr = cur_pad_ptr;
 
-    // LhsPtrFill stores geometry offsets only; the current input base is supplied when packing.
     LhsCacheKey key = {
         ci, ih, iw,
         padding, sh, sw,
@@ -549,7 +490,7 @@ static std::unique_ptr<std::byte[]> LhsPackImageDataSme(const size_t ci, const s
         lhs_ptrs_cache[key] = lhs_ptrs;
     }
 
-    MultiThreadedLHSPackSme(ThreadPool, ci, m, kh, kw, &lhs_ptrs[0], &lhs[0], &nhwc[0], &pad_ptr[0]);
+    MultiThreadedLHSPackSme(ThreadPool, ci, m, kh, kw, &lhs_ptrs[0], &lhs[0], activation_src, &pad_ptr[0]);
 
     return lhs;
 }
@@ -568,9 +509,12 @@ static void ConvolveSme(const size_t co, //channels out
                         const size_t groups,       //number of filter groups
                         const float* weights,      //kernel weights [co,ci,ih,iw]
                         const float* bias,         //kernel biases
+                        const std::byte* packed_rhs,
+                        const size_t packed_rhs_group_stride,
                         const float* in,           //in image data
                         float* out,                //out image data
                         float* tmp_mlas_aligned,   //intermediate buffer if we need to perform a transpose
+                        bool input_is_channels_last,
                         MLAS_THREADPOOL* ThreadPool) {
 
     //RhsPackWeightsBiasSme() - to perform dilation increases kernel size and masks unused weights
@@ -608,18 +552,27 @@ static void ConvolveSme(const size_t co, //channels out
 
     for (size_t g = 0; g < groups; ++g) {
 
-        auto result{out};
-        //do we require a post matmul transpose ?
-        //output is m x n or image_data x co or hw x co
-        //MLAS require it as n x m (or co x hw), transpose required
-        if (co > 1) {
-            //intermediate buffer required, pre-transpose
-            //Note: because we are calling MlasTranspose() need to ensure we use a MLAS aligned buffer
+        auto result = out;
+        const bool need_transpose = (!input_is_channels_last) && (co > 1);
+        if (need_transpose) {
             result = tmp_mlas_aligned;
         }
 
-        auto lhs = LhsPackImageDataSme(ci, ih, iw, d_kh, d_kw, sh, sw, padding, in, ThreadPool);
-        auto rhs = RhsPackWeightsBiasSme(co, ci, kh, kw, dilationh, dilationw, weights, bias, ThreadPool);
+        auto lhs = LhsPackImageDataSme(ci, ih, iw, d_kh, d_kw, sh, sw, padding, in, input_is_channels_last, ThreadPool);
+        const std::byte* rhs_data = packed_rhs ? packed_rhs + g * packed_rhs_group_stride : nullptr;
+        std::unique_ptr<std::byte[]> rhs_storage;
+        if (rhs_data == nullptr) {
+            const std::array<int64_t, 2> kernel_shape{static_cast<int64_t>(kh), static_cast<int64_t>(kw)};
+            const std::array<int64_t, 2> dilation_shape{static_cast<int64_t>(dilationh), static_cast<int64_t>(dilationw)};
+            const size_t packed_size =
+                ArmKleidiAI::MlasConvSymmetricChannelsLast2DFloatPackWSize(co, ci, kernel_shape.data(),
+                                                                           dilation_shape.data());
+            rhs_storage = std::make_unique<std::byte[]>(packed_size);
+            ArmKleidiAI::MlasConvSymmetricChannelsLast2DFloatPackW(co, ci, kernel_shape.data(), dilation_shape.data(), 1,
+                                                                   weights, bias, rhs_storage.get(), packed_size,
+                                                                   ThreadPool);
+            rhs_data = rhs_storage.get();
+        }
 
         MlasTrySimpleParallel(ThreadPool, static_cast<ptrdiff_t>(dim[0] * dim[1] * dim[2]), [&](ptrdiff_t tid) {
             //compute B,M,N index from iteration index
@@ -632,7 +585,7 @@ static void ConvolveSme(const size_t co, //channels out
                 imatmul_conv.ukernel.get_rhs_packed_offset(NIdx * n_step, d_kh * d_kw, ci);
 
             auto BTile = reinterpret_cast<const void*>(
-                reinterpret_cast<const std::byte*>(rhs.get()) + rhs_packed_offset
+                rhs_data + rhs_packed_offset
             );
 
             // Get lhs tile, A
@@ -660,7 +613,7 @@ static void ConvolveSme(const size_t co, //channels out
             );
         });
 
-        if (result == tmp_mlas_aligned) {
+        if (need_transpose) {
             //Note: this could be absorbed into post conv activation
             MlasTranspose(tmp_mlas_aligned, out, m, co, ThreadPool);
         }
@@ -689,6 +642,7 @@ ArmKleidiAI::MlasConvPrepare(MLAS_CONV_PARAMETERS* Parameters,
                 size_t FilterCount,
                 const MLAS_ACTIVATION* Activation,
                 size_t* WorkingBufferSize,
+                bool ChannelsLast,
                 float Beta,
                 MLAS_THREADPOOL* ThreadPool)
 {
@@ -708,6 +662,7 @@ ArmKleidiAI::MlasConvPrepare(MLAS_CONV_PARAMETERS* Parameters,
     Parameters->BatchCount = BatchCount;
     Parameters->GroupCount = GroupCount;
     Parameters->InputChannels = InputChannels;
+    Parameters->ChannelsLast = ChannelsLast;
     Parameters->FilterCount = FilterCount;
     Parameters->Beta = Beta;
 
@@ -779,7 +734,10 @@ ArmKleidiAI::MlasConv(
                 Parameters->DilationShape[0], Parameters->DilationShape[1],  // kernel dilation
                 Parameters->Padding[0],                                      // image padding
                 Parameters->GroupCount,                                      // filter groups
-                Filter, Bias, Input, Output, WorkingBuffer, ThreadPool);
+                Filter, Bias,
+                reinterpret_cast<const std::byte*>(Parameters->PackedFilter),
+                Parameters->PackedFilterGroupStride,
+                Input, Output, WorkingBuffer, Parameters->ChannelsLast, ThreadPool);
 
     MlasActivation(Parameters->Activation, Output, nullptr, Parameters->FilterCount, Parameters->OutputSize,
                    Parameters->OutputSize);
diff --git a/onnxruntime/core/mlas/lib/kleidiai/mlasi_kleidiai.h b/onnxruntime/core/mlas/lib/kleidiai/mlasi_kleidiai.h
index 1d620388bb5f0..3c9f398ece887 100644
--- a/onnxruntime/core/mlas/lib/kleidiai/mlasi_kleidiai.h
+++ b/onnxruntime/core/mlas/lib/kleidiai/mlasi_kleidiai.h
@@ -185,6 +185,7 @@ MlasConvPrepare(MLAS_CONV_PARAMETERS* Parameters,
                 size_t FilterCount,
                 const MLAS_ACTIVATION* Activation,
                 size_t* WorkingBufferSize,
+                bool ChannelsLast,
                 float Beta,
                 MLAS_THREADPOOL* ThreadPool);
 
@@ -200,15 +201,29 @@ MlasConv(
     MLAS_THREADPOOL* ThreadPool
     );
 
-#if defined(MLAS_ENABLE_TEST_HOOKS)
 size_t
 MLASCALL
-MlasConvLhsCacheEntryCountForTest();
+MlasConvSymmetricChannelsLast2DFloatPackWSize(
+    size_t FilterCount,
+    size_t InputChannels,
+    const int64_t* KernelShape,
+    const int64_t* DilationShape
+    );
 
 void
 MLASCALL
-MlasConvClearLhsCacheForTest();
-#endif
+MlasConvSymmetricChannelsLast2DFloatPackW(
+    size_t FilterCount,
+    size_t InputChannels,
+    const int64_t* KernelShape,
+    const int64_t* DilationShape,
+    size_t GroupCount,
+    const float* Filter,
+    const float* Bias,
+    void* PackedFilter,
+    size_t PackedFilterGroupStride,
+    MLAS_THREADPOOL* ThreadPool
+    );
 }
 
 /*++
diff --git a/onnxruntime/core/mlas/lib/mlasi.h b/onnxruntime/core/mlas/lib/mlasi.h
index 592733f0dd1f9..1193c1c5bbe27 100644
--- a/onnxruntime/core/mlas/lib/mlasi.h
+++ b/onnxruntime/core/mlas/lib/mlasi.h
@@ -853,6 +853,7 @@ void
     size_t FilterCount,
     const MLAS_ACTIVATION* Activation,
     size_t* WorkingBufferSize,
+    bool ChannelsLast,
     float Beta,
     MLAS_THREADPOOL* ThreadPool
     );
@@ -873,6 +874,7 @@ bool
     size_t FilterCount,
     const MLAS_ACTIVATION* Activation,
     size_t* WorkingBufferSize,
+    bool ChannelsLast,
     float Beta,
     MLAS_THREADPOOL* ThreadPool
     );
diff --git a/onnxruntime/core/mlas/lib/platform.cpp b/onnxruntime/core/mlas/lib/platform.cpp
index 6bd116a3a417d..de521de0c3785 100644
--- a/onnxruntime/core/mlas/lib/platform.cpp
+++ b/onnxruntime/core/mlas/lib/platform.cpp
@@ -40,9 +40,13 @@ Module Name:
 #if defined(__linux__)
 #include <sys/auxv.h>
 #elif defined(_AIX)
+#include <sys/systemcfg.h>
+#if !defined(POWER_10)
 #define POWER_10       0x40000
+#endif
+#if !defined(POWER_10_ANDUP)
 #define POWER_10_ANDUP (POWER_10)
-#include <sys/systemcfg.h>
+#endif
 #define __power_10_andup() (_system_configuration.implementation & POWER_10_ANDUP)
 #elif defined(__FreeBSD__)
 #include <machine/cpu.h>
diff --git a/onnxruntime/core/mlas/lib/sve/mlasi_sve.h b/onnxruntime/core/mlas/lib/sve/mlasi_sve.h
index 922945c702119..3660bc108165c 100644
--- a/onnxruntime/core/mlas/lib/sve/mlasi_sve.h
+++ b/onnxruntime/core/mlas/lib/sve/mlasi_sve.h
@@ -20,6 +20,7 @@ Module Name:
 #ifndef __clang__
 #pragma GCC push_options
 #pragma GCC target("arch=armv8.2-a+sve")
+#endif
 
 // Use Clang-specific per-function attribute
 #ifdef __clang__
@@ -676,5 +677,4 @@ MlasSveCompareGreaterThan(svbool_t Pred, MLAS_SVFLOAT32 A, MLAS_SVFLOAT32 B)
 #pragma GCC pop_options
 #endif
 
-#endif
 
diff --git a/onnxruntime/core/optimizer/constant_folding.cc b/onnxruntime/core/optimizer/constant_folding.cc
index cb6d65342bc54..1b8bb57d6a74e 100644
--- a/onnxruntime/core/optimizer/constant_folding.cc
+++ b/onnxruntime/core/optimizer/constant_folding.cc
@@ -2,15 +2,18 @@
 // Licensed under the MIT License.
 
 #include <limits>
+#include <string>
 
 #include "core/optimizer/constant_folding.h"
 #include "core/optimizer/initializer.h"
 #include "core/optimizer/utils.h"
 #include "core/graph/graph_utils.h"
 #include "core/optimizer/optimizer_execution_frame.h"
-#include "core/optimizer/utils.h"
 #include "core/framework/op_kernel.h"
 #include "core/framework/tensorprotoutils.h"
+#include "core/session/onnxruntime_session_options_config_keys.h"
+#include "core/common/safeint.h"
+#include "core/common/parse_string.h"
 
 using namespace onnxruntime::common;
 
@@ -140,6 +143,154 @@ static Status ConstantFoldIfNode(Graph& graph, Node& if_node, const logging::Log
   return status;
 }
 
+// Default maximum output size per constant-folded node: 1 GB.
+// This prevents malicious models from causing excessive memory allocation during optimization.
+static constexpr int64_t kDefaultConstantFoldingMaxOutputSizeInBytes = 1024 * 1024 * 1024;
+
+static size_t GetElementSizeForConstantFolding(ONNX_NAMESPACE::TensorProto_DataType elem_type) {
+  const size_t element_size = utils::GetElementSizeOfTensor(elem_type);
+  if (element_size != 0) {
+    return element_size;
+  }
+
+  // String tensors allocate storage for std::string slots even though the payload size is variable.
+  return elem_type == ONNX_NAMESPACE::TensorProto_DataType_STRING ? sizeof(std::string) : 0;
+}
+
+static int64_t EstimateTensorElementCount(const ONNX_NAMESPACE::TensorShapeProto& shape) {
+  SafeInt<int64_t> num_elements = 1;
+  for (int i = 0; i < shape.dim_size(); ++i) {
+    const auto& dim = shape.dim(i);
+    if (!utils::HasDimValue(dim)) {
+      return -1;  // Symbolic dimension
+    }
+    int64_t dim_value = dim.dim_value();
+    if (dim_value < 0) {
+      return -1;  // Invalid dimension
+    }
+    num_elements *= dim_value;
+  }
+
+  return num_elements;
+}
+
+static int64_t EstimateTensorSizeInBytes(const NodeArg& node_arg) {
+  const auto* type_proto = node_arg.TypeAsProto();
+  if (type_proto == nullptr || !utils::HasTensorType(*type_proto)) {
+    return -1;  // Cannot estimate non-tensor or unknown types
+  }
+
+  const auto* shape = node_arg.Shape();
+  if (shape == nullptr) {
+    return -1;  // Unknown shape
+  }
+
+  auto elem_type = static_cast<ONNX_NAMESPACE::TensorProto_DataType>(
+      type_proto->tensor_type().elem_type());
+  size_t element_size = GetElementSizeForConstantFolding(elem_type);
+  if (element_size == 0) {
+    return -1;  // Unknown element type
+  }
+
+  int64_t num_elements = EstimateTensorElementCount(*shape);
+  if (num_elements < 0) {
+    return -1;
+  }
+
+  return SafeInt<int64_t>(num_elements) * static_cast<int64_t>(element_size);
+}
+
+static int64_t EstimateUniqueOutputSizeInBytes(const Node& node) {
+  const auto& input_defs = node.InputDefs();
+  if (input_defs.empty() || input_defs[0] == nullptr) {
+    return -1;
+  }
+
+  const int64_t input_num_elements = input_defs[0]->Shape() != nullptr
+                                         ? EstimateTensorElementCount(*input_defs[0]->Shape())
+                                         : -1;
+  if (input_num_elements < 0) {
+    return -1;
+  }
+
+  const auto* input_type_proto = input_defs[0]->TypeAsProto();
+  if (input_type_proto == nullptr || !utils::HasTensorType(*input_type_proto)) {
+    return -1;
+  }
+
+  auto input_elem_type = static_cast<ONNX_NAMESPACE::TensorProto_DataType>(
+      input_type_proto->tensor_type().elem_type());
+  const size_t input_element_size = GetElementSizeForConstantFolding(input_elem_type);
+  if (input_element_size == 0) {
+    return -1;
+  }
+
+  SafeInt<int64_t> total_size = 0;
+  const auto& output_defs = node.OutputDefs();
+  for (size_t output_idx = 0; output_idx < output_defs.size(); ++output_idx) {
+    const auto* output_def = output_defs[output_idx];
+    if (!output_def->Exists()) {
+      continue;
+    }
+
+    const size_t element_size = output_idx == 0 ? input_element_size : sizeof(int64_t);
+    total_size += SafeInt<int64_t>(input_num_elements) * static_cast<int64_t>(element_size);
+  }
+
+  return total_size;
+}
+
+static int64_t EstimateIdentityOutputSizeInBytes(const Node& node) {
+  const auto& input_defs = node.InputDefs();
+  if (input_defs.empty() || input_defs[0] == nullptr) {
+    return -1;
+  }
+
+  return EstimateTensorSizeInBytes(*input_defs[0]);
+}
+
+// Estimate the total output size in bytes for a node using shape inference results.
+// Returns -1 if the output size cannot be estimated (e.g., unknown shapes or types).
+static int64_t EstimateNodeOutputSizeInBytes(const Node& node) {
+  if (node.OpType() == "Identity" && node.Domain().empty()) {
+    return EstimateIdentityOutputSizeInBytes(node);
+  }
+
+  if (node.OpType() == "Unique" && node.Domain().empty()) {
+    return EstimateUniqueOutputSizeInBytes(node);
+  }
+
+  SafeInt<int64_t> total_size = 0;
+  for (const auto* output_def : node.OutputDefs()) {
+    if (!output_def->Exists()) {
+      continue;
+    }
+
+    const int64_t output_size = EstimateTensorSizeInBytes(*output_def);
+    if (output_size < 0) {
+      return -1;
+    }
+
+    total_size += output_size;
+  }
+
+  return total_size;
+}
+
+// Get the configured max output size from session options, or use the default.
+static int64_t GetConstantFoldingMaxOutputSize(const ConfigOptions& config_options) {
+  std::string max_size_str = config_options.GetConfigOrDefault(
+      kOrtSessionOptionsConstantFoldingMaxOutputSizeInBytes,
+      std::to_string(kDefaultConstantFoldingMaxOutputSizeInBytes));
+
+  int64_t max_size = 0;
+  if (!TryParseStringWithClassicLocale(max_size_str, max_size) || max_size < 0) {
+    max_size = kDefaultConstantFoldingMaxOutputSizeInBytes;
+  }
+
+  return max_size;
+}
+
 Status ConstantFolding::ApplyImpl(Graph& graph, bool& modified, int graph_level, const logging::Logger& logger) const {
   bool have_updated_nodes = false;
   GraphViewer graph_viewer(graph);
@@ -151,6 +302,8 @@ Status ConstantFolding::ApplyImpl(Graph& graph, bool& modified, int graph_level,
   };
 #endif
 
+  const int64_t max_output_size = GetConstantFoldingMaxOutputSize(config_options_);
+
   for (NodeIndex i : order) {
     auto* node = graph.GetNode(i);
     if (!node || !AllowConstantFolding(*node)) {
@@ -233,6 +386,35 @@ Status ConstantFolding::ApplyImpl(Graph& graph, bool& modified, int graph_level,
         }
       }
 
+      // Check if the estimated output size exceeds the configured limit.
+      // This prevents malicious models from causing excessive memory allocation during constant folding.
+      if (max_output_size > 0) {
+        int64_t estimated_size = -1;
+        try {
+          estimated_size = EstimateNodeOutputSizeInBytes(*node);
+        } catch (const std::exception&) {
+          // SafeInt overflow means the size is astronomically large - definitely skip
+          LOGS(logger, WARNING) << "Integer overflow while estimating output size of "
+                                << node->OpType() << " node '" << node->Name()
+                                << "'. Skipping constant folding for this node.";
+          continue;
+        }
+
+        if (estimated_size > max_output_size) {
+          LOGS(logger, WARNING) << "Skipping constant folding for " << node->OpType()
+                                << " node '" << node->Name()
+                                << "' because estimated output size (" << estimated_size
+                                << " bytes) exceeds the limit (" << max_output_size << " bytes).";
+          continue;
+        }
+        if (estimated_size < 0) {
+          LOGS(logger, INFO) << "Skipping constant folding for " << node->OpType()
+                             << " node '" << node->Name()
+                             << "' because output size could not be estimated before execution.";
+          continue;
+        }
+      }
+
 #if !defined(DISABLE_SPARSE_TENSORS)
       // Create execution frame for executing constant nodes.
       OptimizerExecutionFrame::Info info({node}, constant_inputs, graph.ModelPath(), execution_provider_,
@@ -312,7 +494,25 @@ Status ConstantFolding::ApplyImpl(Graph& graph, bool& modified, int graph_level,
 #pragma warning(disable : 6387)
 #endif
       OpKernelContext op_kernel_context(&frame, kernel.get(), /*stream*/ nullptr, nullptr, logger);
-      ORT_RETURN_IF_ERROR(kernel->Compute(&op_kernel_context));
+
+      // Skip the current node if Compute fails so one bad constant-fold candidate does not abort
+      // the entire constant folding pass.
+      Status compute_status = Status::OK();
+      try {
+        compute_status = kernel->Compute(&op_kernel_context);
+      } catch (const std::exception& ex) {
+        LOGS(logger, WARNING) << "Exception during constant folding of " << node->OpType()
+                              << " node '" << node->Name() << "': " << ex.what()
+                              << ". Skipping constant folding for this node.";
+        continue;
+      }
+
+      if (!compute_status.IsOK()) {
+        LOGS(logger, WARNING) << "Failure during constant folding of " << node->OpType()
+                              << " node '" << node->Name() << "': " << compute_status.ErrorMessage()
+                              << ". Skipping constant folding for this node.";
+        continue;
+      }
 #ifdef _WIN32
 #pragma warning(pop)
 #endif
@@ -320,6 +520,33 @@ Status ConstantFolding::ApplyImpl(Graph& graph, bool& modified, int graph_level,
       std::vector<OrtValue> fetches;
       ORT_RETURN_IF_ERROR(frame.GetOutputs(fetches));
 
+      // Post-execution size check: verify actual output sizes don't exceed the limit.
+      // This catches cases where pre-execution shape inference couldn't determine the output size.
+      if (max_output_size > 0) {
+        SafeInt<int64_t> actual_total_size = 0;
+        bool size_exceeded = false;
+        try {
+          for (size_t fetch_idx = 0; fetch_idx < fetches.size(); ++fetch_idx) {
+            if (fetches[fetch_idx].IsAllocated() && fetches[fetch_idx].IsTensor()) {
+              const auto& tensor = fetches[fetch_idx].Get<Tensor>();
+              actual_total_size += tensor.SizeInBytes();
+            }
+          }
+          size_exceeded = actual_total_size > max_output_size;
+        } catch (const std::exception&) {
+          // SafeInt overflow means total size is astronomically large
+          size_exceeded = true;
+        }
+
+        if (size_exceeded) {
+          LOGS(logger, WARNING) << "Skipping constant folding for " << node->OpType()
+                                << " node '" << node->Name()
+                                << "' because actual output size exceeds the limit ("
+                                << max_output_size << " bytes).";
+          continue;
+        }
+      }
+
       // Go over all output node args and substitute them with the newly computed tensors, which will be
       // added to the graph as initializers.
       ORT_ENFORCE(fetches.size() == fetch_to_output_idx.size());
diff --git a/onnxruntime/core/optimizer/conv_activation_fusion.cc b/onnxruntime/core/optimizer/conv_activation_fusion.cc
index 44a20428a09e0..35f3683b77764 100644
--- a/onnxruntime/core/optimizer/conv_activation_fusion.cc
+++ b/onnxruntime/core/optimizer/conv_activation_fusion.cc
@@ -140,9 +140,12 @@ class FuseConvActivationAction : public ReplaceWithNew {
         return "FusedConv";
       }
     } else if (domain == kMSDomain) {
-      if (op_type == "NhwcConv") {
+      if (op_type == "NhwcConv" || op_type == "NhwcFusedConv") {
         return "NhwcFusedConv";
       }
+      if (op_type == "FusedConv") {
+        return "FusedConv";
+      }
     } else if (domain == kMSInternalNHWCDomain) {
       if (op_type == "Conv") {
         return "Conv";
diff --git a/onnxruntime/core/optimizer/conv_add_act_fusion.cc b/onnxruntime/core/optimizer/conv_add_act_fusion.cc
index 45441d20a4112..d51ea894725fa 100644
--- a/onnxruntime/core/optimizer/conv_add_act_fusion.cc
+++ b/onnxruntime/core/optimizer/conv_add_act_fusion.cc
@@ -211,7 +211,22 @@ class FuseConvAddActivationAction : public ReplaceWithNew {
 
  private:
   std::string OpType(const RuntimeState& runtimeState) const override {
-    return (runtimeState.selected_nodes.Target().OpType() == "Conv") ? "FusedConv" : "NhwcFusedConv";
+    const auto& target = runtimeState.selected_nodes.Target();
+    const auto* channels_last_attr = graph_utils::GetNodeAttribute(target, "channels_last");
+    const bool channels_last = channels_last_attr != nullptr && channels_last_attr->i() != 0;
+    const std::string& op_type = target.OpType();
+
+    // If channels_last is set, use NHWC fused convolution regardless of original op type.
+    if (channels_last) {
+      return "NhwcFusedConv";
+    }
+
+    // Without channels_last, convert Conv to FusedConv, and leave other op types unchanged.
+    if (op_type == "Conv") {
+      return "FusedConv";
+    }
+
+    return op_type;
   }
 
   std::string Domain(const RuntimeState&) const override { return kMSDomain; }
diff --git a/onnxruntime/core/optimizer/graph_transformer_utils.cc b/onnxruntime/core/optimizer/graph_transformer_utils.cc
index 282261cab0e58..702f20a96dccf 100644
--- a/onnxruntime/core/optimizer/graph_transformer_utils.cc
+++ b/onnxruntime/core/optimizer/graph_transformer_utils.cc
@@ -463,7 +463,7 @@ InlinedVector<std::unique_ptr<GraphTransformer>> GenerateTransformers(
 
       auto cpu_registry = cpu_execution_provider.GetKernelRegistry();
       auto nhwc_transformer = std::make_unique<NhwcTransformer>(std::move(cpu_allocator), std::move(cpu_registry),
-                                                                logger);
+                                                                logger, session_options.config_options);
       if (nhwc_transformer->IsActive()) {
         transformers.emplace_back(std::move(nhwc_transformer));
       }
@@ -566,7 +566,7 @@ InlinedVector<std::unique_ptr<GraphTransformer>> GenerateTransformersForMinimalB
         AllocatorPtr cpu_allocator = CPUAllocator::DefaultInstance();
         auto cpu_registry = cpu_execution_provider.GetKernelRegistry();
         auto nhwc_transformer = std::make_unique<NhwcTransformer>(std::move(cpu_allocator), std::move(cpu_registry),
-                                                                  logger);
+                                                                  logger, session_options.config_options);
         if (nhwc_transformer->IsActive()) {
           transformers.emplace_back(std::move(nhwc_transformer));
         }
diff --git a/onnxruntime/core/optimizer/layout_transformation/layout_transformation_potentially_added_ops.h b/onnxruntime/core/optimizer/layout_transformation/layout_transformation_potentially_added_ops.h
index 81eb9f59eeada..3e3a010728086 100644
--- a/onnxruntime/core/optimizer/layout_transformation/layout_transformation_potentially_added_ops.h
+++ b/onnxruntime/core/optimizer/layout_transformation/layout_transformation_potentially_added_ops.h
@@ -70,6 +70,7 @@ inline constexpr std::array kLayoutTransformationPotentiallyAddedOps = {
 #if !defined(DISABLE_CONTRIB_OPS)
     // kMSDomain ops
     OpIdentifierWithStringViews{kMSDomain, "DequantizeLinear", 1},
+    OpIdentifierWithStringViews{kMSDomain, "NhwcFusedConv", 1},
     OpIdentifierWithStringViews{kMSDomain, "NhwcMaxPool", 1},
     OpIdentifierWithStringViews{kMSDomain, "QLinearConv", 1},
     OpIdentifierWithStringViews{kMSDomain, "QuantizeLinear", 1},
diff --git a/onnxruntime/core/optimizer/nchwc_transformer.cc b/onnxruntime/core/optimizer/nchwc_transformer.cc
index b9366ff0abae8..a971a058f43b7 100644
--- a/onnxruntime/core/optimizer/nchwc_transformer.cc
+++ b/onnxruntime/core/optimizer/nchwc_transformer.cc
@@ -324,6 +324,18 @@ void NchwcTransformerImpl::ConvPoolShapeInference(const Node& node,
 void NchwcTransformerImpl::TransformConv(Node& node) {
   auto& input_defs = node.MutableInputDefs();
   auto& output_defs = node.MutableOutputDefs();
+  NchwcArgument* nchwc_sum_input = nullptr;
+
+  // The internal NCHWc Conv kernel can consume an optional fused Sum input, but it expects
+  // that tensor to already be in NCHWc layout. Only allow transforming a pre-existing
+  // FusedConv(X, W, B, Sum) when the Sum input already has a tracked NCHWc variant that
+  // can be wired through directly.
+  if (node.OpType() == "FusedConv" && input_defs.size() >= 4 && input_defs[3] != nullptr && input_defs[3]->Exists()) {
+    nchwc_sum_input = LookupNchwcArgument(input_defs[3]);
+    if (nchwc_sum_input == nullptr) {
+      return;
+    }
+  }
 
   // Require that the weights tensor be static.
   const ONNX_NAMESPACE::TensorProto* conv_W_tensor_proto = nullptr;
@@ -490,6 +502,11 @@ void NchwcTransformerImpl::TransformConv(Node& node) {
     nchwc_node.MutableInputDefs()[2] = nchwc_conv_B_arg;
   }
 
+  if (nchwc_sum_input != nullptr) {
+    nchwc_node.MutableInputDefs()[3] = nchwc_sum_input->nchwc_arg_;
+    nchwc_sum_input->remaining_original_uses_--;
+  }
+
   NchwcArgument::Shape output_shape(output_defs[0]);
 
   if (do_reorder_input) {
diff --git a/onnxruntime/core/optimizer/nhwc_transformer.cc b/onnxruntime/core/optimizer/nhwc_transformer.cc
index cd654991c92d5..6c0717865b135 100644
--- a/onnxruntime/core/optimizer/nhwc_transformer.cc
+++ b/onnxruntime/core/optimizer/nhwc_transformer.cc
@@ -2,7 +2,12 @@
 // SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
 // Licensed under the MIT License.
 
+#include <array>
+#include <cstdint>
 #include <deque>
+#include <vector>
+#include "core/common/cpuid_info.h"
+#include "core/graph/constants.h"
 #include "core/mlas/inc/mlas.h"
 #include "core/graph/graph_utils.h"
 #include "core/optimizer/initializer.h"
@@ -11,6 +16,8 @@
 #include "core/optimizer/layout_transformation/layout_transformation.h"
 #include "core/optimizer/transpose_optimization/ort_optimizer_utils.h"
 #include "core/optimizer/transpose_optimization/ort_transpose_optimization.h"
+#include "core/providers/common.h"
+#include "core/providers/cpu/mlas_backend_kernel_selector_config_utils.h"
 
 using namespace ONNX_NAMESPACE;
 using namespace ::onnxruntime::common;
@@ -21,6 +28,221 @@ namespace onnxruntime {
 
 using namespace layout_transformation;
 
+#ifdef USE_KLEIDIAI
+namespace {
+
+bool TryGetDimValueAsSizeT(const ONNX_NAMESPACE::TensorShapeProto& shape, int index, size_t& value) {
+  if (shape.dim_size() <= index || !shape.dim(index).has_dim_value()) {
+    return false;
+  }
+
+  const int64_t dim_value = shape.dim(index).dim_value();
+  if (dim_value < 0) {
+    return false;
+  }
+
+  value = narrow<size_t>(dim_value);
+  return true;
+}
+
+bool TryReadPositiveInts(const std::vector<int64_t>& values, std::array<size_t, 2>& out) {
+  if (values.size() != out.size()) {
+    return false;
+  }
+
+  for (size_t i = 0; i < out.size(); ++i) {
+    if (values[i] <= 0) {
+      return false;
+    }
+
+    out[i] = narrow<size_t>(values[i]);
+  }
+
+  return true;
+}
+
+bool TryReadPositiveOrZeroInts(const std::vector<int64_t>& values, std::array<size_t, 4>& out) {
+  if (values.size() != out.size()) {
+    return false;
+  }
+
+  for (size_t i = 0; i < out.size(); ++i) {
+    if (values[i] < 0) {
+      return false;
+    }
+
+    out[i] = narrow<size_t>(values[i]);
+  }
+
+  return true;
+}
+
+bool TryParseAutoPadType(std::string_view value, AutoPadType& auto_pad_type) {
+  if (value.empty() || value == "NOTSET") {
+    auto_pad_type = AutoPadType::NOTSET;
+    return true;
+  }
+
+  if (value == "VALID") {
+    auto_pad_type = AutoPadType::VALID;
+    return true;
+  }
+
+  if (value == "SAME_UPPER") {
+    auto_pad_type = AutoPadType::SAME_UPPER;
+    return true;
+  }
+
+  if (value == "SAME_LOWER") {
+    auto_pad_type = AutoPadType::SAME_LOWER;
+    return true;
+  }
+
+  return false;
+}
+
+bool TryComputeFloatNhwcPads(const api::NodeRef& node,
+                             const std::array<size_t, 2>& input_shape,
+                             const std::array<size_t, 2>& kernel_shape,
+                             const std::array<size_t, 2>& strides,
+                             const std::array<size_t, 2>& dilations,
+                             std::array<size_t, 4>& pads) {
+  for (size_t i = 0; i < 2; ++i) {
+    if (kernel_shape[i] == 0 || strides[i] == 0 || dilations[i] == 0) {
+      return false;
+    }
+  }
+
+  const auto auto_pad_value = node.GetAttributeString("auto_pad");
+  AutoPadType auto_pad = AutoPadType::NOTSET;
+  if (!TryParseAutoPadType(auto_pad_value.value_or("NOTSET"), auto_pad)) {
+    return false;
+  }
+
+  if (auto_pad == AutoPadType::NOTSET) {
+    const auto pads_opt = node.GetAttributeInts("pads");
+    if (!pads_opt.has_value()) {
+      pads.fill(0);
+      return true;
+    }
+
+    return TryReadPositiveOrZeroInts(*pads_opt, pads);
+  }
+
+  std::array<int64_t, 4> pads_int64{};
+  for (size_t i = 0; i < 2; ++i) {
+    int64_t pad_head = 0;
+    int64_t pad_tail = 0;
+    int64_t out_dim = 0;
+    const auto status = ComputePadAndOutputShape(
+        narrow<int64_t>(input_shape[i]),
+        narrow<int64_t>(strides[i]),
+        narrow<int64_t>(kernel_shape[i]),
+        narrow<int64_t>(dilations[i]),
+        auto_pad,
+        pad_head,
+        pad_tail,
+        out_dim,
+        /*force_symmetric_auto_padding*/ false);
+    if (!status.IsOK() || pad_head < 0 || pad_tail < 0 || out_dim < 0) {
+      return false;
+    }
+
+    pads_int64[i] = pad_head;
+    pads_int64[i + 2] = pad_tail;
+  }
+
+  for (size_t i = 0; i < pads.size(); ++i) {
+    pads[i] = narrow<size_t>(pads_int64[i]);
+  }
+
+  return true;
+}
+
+bool FloatNhwcWrapperFilter(const onnx_transpose_optimization::api::GraphRef& graph,
+                            onnx_transpose_optimization::api::NodeRef& node) {
+  auto& base_node = NodeFromApiNode(node);
+
+  ORT_UNUSED_PARAMETER(graph);
+#if !defined(MLAS_TARGET_ARM64)
+  return false;
+#else
+  if (!CPUIDInfo::GetCPUIDInfo().HasArm_SME()) {
+    return false;
+  }
+
+  if (base_node.InputDefs().size() < 2) {
+    return false;
+  }
+
+  const auto* input_shape = base_node.InputDefs()[0]->Shape();
+  if (input_shape == nullptr || input_shape->dim_size() != 4) {
+    return false;
+  }
+
+  const auto* weight_shape = base_node.InputDefs()[1]->Shape();
+  if (weight_shape == nullptr || weight_shape->dim_size() != 4) {
+    return false;
+  }
+
+  const auto inputs = node.Inputs();
+  if (base_node.OpType() == "FusedConv" && inputs.size() > 3 && !inputs[3].empty()) {
+    return false;
+  }
+
+  const auto group = node.GetAttributeInt("group").value_or(1);
+  if (group != 1) {
+    return false;
+  }
+
+  std::array<size_t, 2> input_spatial_shape{};
+  std::array<size_t, 2> kernel_spatial_shape{};
+  std::array<size_t, 2> dilations{1, 1};
+  std::array<size_t, 2> strides{1, 1};
+  std::array<size_t, 4> pads{};
+  size_t batch_count = 0;
+  size_t filter_count = 0;
+
+  if (!TryGetDimValueAsSizeT(*input_shape, 0, batch_count) ||
+      !TryGetDimValueAsSizeT(*input_shape, 2, input_spatial_shape[0]) ||
+      !TryGetDimValueAsSizeT(*input_shape, 3, input_spatial_shape[1]) ||
+      !TryGetDimValueAsSizeT(*weight_shape, 0, filter_count) ||
+      !TryGetDimValueAsSizeT(*weight_shape, 2, kernel_spatial_shape[0]) ||
+      !TryGetDimValueAsSizeT(*weight_shape, 3, kernel_spatial_shape[1])) {
+    return false;
+  }
+
+  const auto dilations_opt = node.GetAttributeInts("dilations");
+  if (dilations_opt.has_value() && !TryReadPositiveInts(*dilations_opt, dilations)) {
+    return false;
+  }
+
+  const auto strides_opt = node.GetAttributeInts("strides");
+  if (strides_opt.has_value() && !TryReadPositiveInts(*strides_opt, strides)) {
+    return false;
+  }
+
+  if (!TryComputeFloatNhwcPads(node, input_spatial_shape, kernel_spatial_shape, strides, dilations, pads)) {
+    return false;
+  }
+
+  return MlasConvSupportsSymmetricChannelsLast2DFloatKernel(
+      /*Dimensions*/ 2,
+      batch_count,
+      /*GroupCount*/ 1,
+      input_spatial_shape.data(),
+      kernel_spatial_shape.data(),
+      dilations.data(),
+      pads.data(),
+      strides.data(),
+      filter_count,
+      /*Beta*/ 0.0f);
+#endif
+}
+
+}  // namespace
+#endif
+
 static inline const OpTransformInfo*
 NhwcConvLookup(
     const OpTransformMap& conv_table,
@@ -41,18 +263,29 @@ NhwcConvLookup(
   if (iter == conv_table.end()) {
     return nullptr;
   }
+
+  if (iter->second.filter_ != nullptr) {
+    if (!iter->second.filter_(graph, node)) {
+      return nullptr;
+    }
+  }
+
   return &(iter->second);
 }
 
 NhwcTransformer::NhwcTransformer(AllocatorPtr cpu_allocator,
                                  std::shared_ptr<KernelRegistry> cpu_kernel_registry,
-                                 const logging::Logger& logger) noexcept
+                                 const logging::Logger& logger,
+                                 const ConfigOptions& config_options) noexcept
     : GraphTransformer("NhwcTransformer"), cpu_allocator_(std::move(cpu_allocator)) {
   if (!cpu_kernel_registry) {
     // This is a CPU op nodes optimizer, not useful if cpu EP is not available.
     return;
   }
 
+  MLAS_BACKEND_KERNEL_SELECTOR_CONFIG mlas_backend_kernel_selector_config{};
+  SetupMlasBackendKernelSelectorFromConfigOptions(mlas_backend_kernel_selector_config, config_options);
+
   //
   // Constructing a mapping table from operators to be transformed to their target.
   // Make sure that the new nodes we are about to create during graph transformation,
@@ -71,10 +304,10 @@ NhwcTransformer::NhwcTransformer(AllocatorPtr cpu_allocator,
       kernel_create_info = nullptr;
       conv_table_.emplace(
           OpIdInfo("QLinearConv", kOnnxDomain, api::DataType::INT8),
-          OpTransformInfo{qconv_int8.op_type_, qconv_int8.domain_, qconv_int8.version_, true});
+          OpTransformInfo{qconv_int8.op_type_, qconv_int8.domain_, qconv_int8.version_, true, false});
       conv_table_.emplace(
           OpIdInfo("QLinearConv", kMSDomain, api::DataType::INT8),
-          OpTransformInfo{qconv_int8.op_type_, qconv_int8.domain_, qconv_int8.version_, true});
+          OpTransformInfo{qconv_int8.op_type_, qconv_int8.domain_, qconv_int8.version_, true, false});
     }
   }
 
@@ -90,10 +323,10 @@ NhwcTransformer::NhwcTransformer(AllocatorPtr cpu_allocator,
       kernel_create_info = nullptr;
       conv_table_.emplace(
           OpIdInfo("QLinearConv", kOnnxDomain, api::DataType::UINT8),
-          OpTransformInfo{qconv_uint8.op_type_, qconv_uint8.domain_, qconv_uint8.version_, true});
+          OpTransformInfo{qconv_uint8.op_type_, qconv_uint8.domain_, qconv_uint8.version_, true, false});
       conv_table_.emplace(
           OpIdInfo("QLinearConv", kMSDomain, api::DataType::UINT8),
-          OpTransformInfo{qconv_uint8.op_type_, qconv_uint8.domain_, qconv_uint8.version_, true});
+          OpTransformInfo{qconv_uint8.op_type_, qconv_uint8.domain_, qconv_uint8.version_, true, false});
     }
   }
 
@@ -108,14 +341,61 @@ NhwcTransformer::NhwcTransformer(AllocatorPtr cpu_allocator,
         nhwc_conv_fp16.version_, nhwc_conv_fp16.type_constraints_, logger, &kernel_create_info);
     if (status.IsOK() && kernel_create_info != nullptr) {
       kernel_create_info = nullptr;
+      const auto filter = [](const api::GraphRef&, api::NodeRef& node) {
+        const auto dilations_opt = node.GetAttributeInts("dilations");
+        if (dilations_opt.has_value()) {
+          const auto& dilations = dilations_opt.value();
+          if ((dilations.size() >= 1 && dilations[0] != 1) ||
+              (dilations.size() >= 2 && dilations[1] != 1)) {
+            return false;
+          }
+        }
+
+        const auto group_opt = node.GetAttributeInt("group");
+        if (group_opt.has_value() && group_opt.value() != 1) {
+          return false;
+        }
+
+        return true;
+      };
+
       conv_table_.emplace(
           OpIdInfo("Conv", kOnnxDomain, api::DataType::FLOAT16),
-          OpTransformInfo{nhwc_conv_fp16.op_type_, nhwc_conv_fp16.domain_, nhwc_conv_fp16.version_, false});
+          OpTransformInfo{nhwc_conv_fp16.op_type_, nhwc_conv_fp16.domain_, nhwc_conv_fp16.version_, false, false, filter});
       conv_table_.emplace(
           OpIdInfo("FusedConv", kMSDomain, api::DataType::FLOAT16),
-          OpTransformInfo{nhwc_conv_fp16.op_type_, nhwc_conv_fp16.domain_, nhwc_conv_fp16.version_, false});
+          OpTransformInfo{nhwc_conv_fp16.op_type_, nhwc_conv_fp16.domain_, nhwc_conv_fp16.version_, false, true, filter});
+    }
+  }
+
+#ifdef USE_KLEIDIAI
+  // KleidiAI specific block for NhwcFusedConvolutions
+  if (mlas_backend_kernel_selector_config.use_kleidiai) {
+    // F32 Conv -> F32 NHWC Conv
+    OpKernelRegistryId nhwc_conv_fp32{
+        "NhwcFusedConv", kMSDomain, 1, {{"T", {DataTypeImpl::GetTensorType<float>()}}}};
+
+    const KernelCreateInfo* kernel_create_info{};
+    const auto status = cpu_kernel_registry->TryFindKernel(
+        kCpuExecutionProvider, nhwc_conv_fp32.op_type_, nhwc_conv_fp32.domain_,
+        nhwc_conv_fp32.version_, nhwc_conv_fp32.type_constraints_, logger, &kernel_create_info);
+
+    if (status.IsOK() && kernel_create_info != nullptr) {
+      kernel_create_info = nullptr;
+
+      const auto filter = [](const api::GraphRef& graph, api::NodeRef& node) {
+        return FloatNhwcWrapperFilter(graph, node);
+      };
+
+      conv_table_.emplace(
+          OpIdInfo("Conv", kOnnxDomain, api::DataType::FLOAT),
+          OpTransformInfo{nhwc_conv_fp32.op_type_, nhwc_conv_fp32.domain_, nhwc_conv_fp32.version_, false, true, filter});
+      conv_table_.emplace(
+          OpIdInfo("FusedConv", kMSDomain, api::DataType::FLOAT),
+          OpTransformInfo{nhwc_conv_fp32.op_type_, nhwc_conv_fp32.domain_, nhwc_conv_fp32.version_, false, true, filter});
     }
   }
+#endif
 
   {
     // fp16 MaxPool -> fp16 nhwc MaxPool
@@ -130,7 +410,7 @@ NhwcTransformer::NhwcTransformer(AllocatorPtr cpu_allocator,
       kernel_create_info = nullptr;
       conv_table_.emplace(
           OpIdInfo("MaxPool", kOnnxDomain, api::DataType::FLOAT16),
-          OpTransformInfo{nhwc_maxpool_fp16.op_type_, nhwc_maxpool_fp16.domain_, nhwc_maxpool_fp16.version_, false});
+          OpTransformInfo{nhwc_maxpool_fp16.op_type_, nhwc_maxpool_fp16.domain_, nhwc_maxpool_fp16.version_, false, false});
     }
   }
 
@@ -147,7 +427,7 @@ NhwcTransformer::NhwcTransformer(AllocatorPtr cpu_allocator,
       kernel_create_info = nullptr;
       conv_table_.emplace(
           OpIdInfo("AveragePool", kOnnxDomain, api::DataType::FLOAT16),
-          OpTransformInfo{nhwc_avgpool_fp16.op_type_, nhwc_avgpool_fp16.domain_, nhwc_avgpool_fp16.version_, false});
+          OpTransformInfo{nhwc_avgpool_fp16.op_type_, nhwc_avgpool_fp16.domain_, nhwc_avgpool_fp16.version_, false, false});
     }
   }
 
@@ -164,7 +444,7 @@ NhwcTransformer::NhwcTransformer(AllocatorPtr cpu_allocator,
       kernel_create_info = nullptr;
       conv_table_.emplace(
           OpIdInfo("GlobalAveragePool", kOnnxDomain, api::DataType::FLOAT16),
-          OpTransformInfo{nhwc_gavgpool_fp16.op_type_, nhwc_gavgpool_fp16.domain_, nhwc_gavgpool_fp16.version_, false});
+          OpTransformInfo{nhwc_gavgpool_fp16.op_type_, nhwc_gavgpool_fp16.domain_, nhwc_gavgpool_fp16.version_, false, false});
     }
   }
 };
@@ -214,10 +494,22 @@ Status NhwcTransformer::ApplyImpl(Graph& graph, bool& modified, int graph_level,
     if (transform->has_channels_last_attrib_) {
       node->SetAttributeInt("channels_last", 1);
     }
+
     size_t rank = shape->dim_size();
     std::vector<int64_t> input_perm = ChannelFirstToLastPerm(rank);
     std::vector<int64_t> output_perm = ChannelLastToFirstPerm(rank);
-    WrapTransposesAroundNode(*api_graph, *node, {&input_perm}, {&output_perm});
+    const auto inputs = node->Inputs();
+    std::vector<const std::vector<int64_t>*> input_perms(inputs.size(), nullptr);
+    if (!inputs.empty()) {
+      input_perms[0] = &input_perm;
+    }
+    // Some transformed operators require the optional fused Sum (Z) input at index 3
+    // to be converted alongside the activation tensor.
+    if (transform->transpose_fused_sum_input_ && inputs.size() > 3 && !inputs[3].empty()) {
+      input_perms[3] = &input_perm;
+    }
+
+    WrapTransposesAroundNode(*api_graph, *node, input_perms, {&output_perm});
 
     // Replace the operator if needed
     if (node->Domain() != transform->domain_ ||
diff --git a/onnxruntime/core/optimizer/nhwc_transformer.h b/onnxruntime/core/optimizer/nhwc_transformer.h
index c65f851fdab9d..4755ceb316fef 100644
--- a/onnxruntime/core/optimizer/nhwc_transformer.h
+++ b/onnxruntime/core/optimizer/nhwc_transformer.h
@@ -3,7 +3,9 @@
 
 #pragma once
 
+#include <functional>
 #include "core/common/common.h"
+#include "core/framework/config_options.h"
 #include "core/framework/execution_provider.h"
 #include "core/framework/kernel_registry.h"
 #include "core/optimizer/graph_transformer.h"
@@ -54,10 +56,15 @@ class OpIdHash {
  * @brief Information needed for operator layout transformation
  */
 struct OpTransformInfo {
+  using FilterFn = std::function<bool(const onnx_transpose_optimization::api::GraphRef& graph,
+                                      onnx_transpose_optimization::api::NodeRef& node)>;
+
   const std::string optype_;
   const std::string domain_;
   const int version_;
   const bool has_channels_last_attrib_;
+  const bool transpose_fused_sum_input_;
+  const FilterFn filter_{nullptr};
 };
 
 using OpTransformMap = std::unordered_map<OpIdInfo, OpTransformInfo, OpIdHash>;
@@ -76,7 +83,7 @@ class NhwcTransformer : public GraphTransformer {
  private:
  public:
   explicit NhwcTransformer(AllocatorPtr cpu_allocator, std::shared_ptr<KernelRegistry> cpu_kernel_registry,
-                           const logging::Logger& logger) noexcept;
+                           const logging::Logger& logger, const ConfigOptions& config_options) noexcept;
 
   /**
    * @brief Usually called right after constructor, it shows whether
diff --git a/onnxruntime/core/optimizer/stft_decomposition.cc b/onnxruntime/core/optimizer/stft_decomposition.cc
index 49fe180b9470c..713cbcc6193d3 100644
--- a/onnxruntime/core/optimizer/stft_decomposition.cc
+++ b/onnxruntime/core/optimizer/stft_decomposition.cc
@@ -9,6 +9,7 @@
 #include "core/graph/graph_utils.h"
 #include "core/optimizer/optimizer_execution_frame.h"
 #include "core/optimizer/utils.h"
+#include "core/common/safeint.h"
 #include "core/framework/op_kernel.h"
 #include "core/framework/tensorprotoutils.h"
 #include <numbers>
@@ -210,6 +211,14 @@ Status STFTDecomposition::ApplyImpl(Graph& graph, bool& modified, int graph_leve
       dft_size = window_length_dim.dim_value();
     }
 
+    // Validate model-provided scalar values before using them in size calculations.
+    // These come from untrusted model initializers/shapes and must be positive.
+    if (dft_size <= 0 || frame_step_value <= 0) {
+      LOGS(logger, WARNING) << "STFT decomposition skipped: invalid dft_size (" << dft_size
+                            << ") or frame_step_value (" << frame_step_value << ")";
+      continue;
+    }
+
     bool is_onesided = true;
     auto& attrs = stft.GetAttributes();
     if (attrs.find("onesided") != attrs.end()) {
@@ -227,14 +236,22 @@ Status STFTDecomposition::ApplyImpl(Graph& graph, bool& modified, int graph_leve
     if (is_real) {
       auto output_num_frames = stft.MutableOutputDefs()[0]->Shape()->dim(1).dim_value();
       auto output_frame_length = stft.MutableOutputDefs()[0]->Shape()->dim(2).dim_value();
-      auto weight_size = static_cast<size_t>(dft_unique_bins * dft_size);
+
+      size_t dft_size_sz, dft_unique_bins_sz, weight_size;
+      if (!SafeCast(dft_unique_bins, dft_unique_bins_sz) ||
+          !SafeCast(dft_size, dft_size_sz) ||
+          !SafeMultiply(dft_unique_bins_sz, dft_size_sz, weight_size)) {
+        LOGS(logger, WARNING) << "STFT decomposition skipped: weight size overflow";
+        continue;
+      }
+
       auto real_weights_data = std::vector<float>(weight_size);
       auto imag_weights_data = std::vector<float>(weight_size);
 
       // Populate weights
-      for (size_t k = 0; k < static_cast<size_t>(dft_unique_bins); k++) {
-        for (size_t n = 0; n < static_cast<size_t>(dft_size); n++) {
-          auto index = static_cast<size_t>(k * dft_size + n);
+      for (size_t k = 0; k < dft_unique_bins_sz; k++) {
+        for (size_t n = 0; n < dft_size_sz; n++) {
+          auto index = k * dft_size_sz + n;
           auto theta = -2 * std::numbers::pi_v<float> * k * n / static_cast<float>(dft_size);
           real_weights_data[index] = static_cast<float>(cos(theta));
           imag_weights_data[index] = static_cast<float>(sin(theta));
@@ -356,7 +373,6 @@ Status STFTDecomposition::ApplyImpl(Graph& graph, bool& modified, int graph_leve
 
     // Copy inputs
     auto signal_target_idx = signal_recipient->Index();
-    auto window_target_idx = window_recipient->Index();
     for (auto cur = input_edges.cbegin(), end = input_edges.cend(); cur != end; ++cur) {
       const graph_utils::GraphEdge& edge = *cur;
       NodeIndex target_idx = 0;
@@ -367,8 +383,10 @@ Status STFTDecomposition::ApplyImpl(Graph& graph, bool& modified, int graph_leve
           recipient = signal_recipient;
           break;
         case 2:
-          target_idx = window_target_idx;
-          recipient = window_recipient;
+          if (window_recipient) {
+            target_idx = window_recipient->Index();
+            recipient = window_recipient;
+          }
           break;
       }
 
diff --git a/onnxruntime/core/platform/env.h b/onnxruntime/core/platform/env.h
index 8412f8c9b4f6f..f45f6c088d2a5 100644
--- a/onnxruntime/core/platform/env.h
+++ b/onnxruntime/core/platform/env.h
@@ -231,6 +231,12 @@ class Env {
       const PathString& path,
       PathString& canonical_path) const = 0;
 
+  /** Like GetCanonicalPath, but the path is not required to exist. Mirrors
+   *  std::filesystem::weakly_canonical. */
+  virtual common::Status GetWeaklyCanonicalPath(
+      const PathString& path,
+      PathString& canonical_path) const = 0;
+
   // This functions is always successful. It can't fail.
   virtual PIDType GetSelfPid() const = 0;
 
diff --git a/onnxruntime/core/platform/posix/env.cc b/onnxruntime/core/platform/posix/env.cc
index ba7af3b383b54..cf28d7a0cce2a 100644
--- a/onnxruntime/core/platform/posix/env.cc
+++ b/onnxruntime/core/platform/posix/env.cc
@@ -543,6 +543,19 @@ class PosixEnv : public Env {
     return Status::OK();
   }
 
+  common::Status GetWeaklyCanonicalPath(
+      const PathString& path,
+      PathString& canonical_path) const override {
+    std::error_code ec;
+    auto canonical = std::filesystem::weakly_canonical(std::filesystem::path{path}, ec);
+    if (ec) {
+      return common::Status(common::ONNXRUNTIME, common::FAIL,
+                            "Failed to get the weakly canonical path: " + path + " - " + ec.message());
+    }
+    canonical_path.assign(canonical.native());
+    return Status::OK();
+  }
+
   common::Status LoadDynamicLibrary(const PathString& library_filename, bool global_symbols, void** handle) const override {
     dlerror();  // clear any old error_str
     *handle = dlopen(library_filename.c_str(), RTLD_NOW | (global_symbols ? RTLD_GLOBAL : RTLD_LOCAL));
diff --git a/onnxruntime/core/platform/windows/env.cc b/onnxruntime/core/platform/windows/env.cc
index 4d80b5afff4b8..07d5dfc9c0b22 100644
--- a/onnxruntime/core/platform/windows/env.cc
+++ b/onnxruntime/core/platform/windows/env.cc
@@ -20,6 +20,7 @@ limitations under the License.
 
 #include <iostream>
 #include <fstream>
+#include <filesystem>
 #include <optional>
 #include <string>
 #include <thread>
@@ -685,6 +686,136 @@ common::Status WindowsEnv::GetCanonicalPath(
   return Status::OK();
 }
 
+namespace {
+
+constexpr std::wstring_view kGlobalRootPrefix{L"\\\\?\\GLOBALROOT"};
+
+wil::unique_hfile OpenHandleForFinalPath(const std::filesystem::path& path) {
+  CREATEFILE2_EXTENDED_PARAMETERS params{};
+  params.dwSize = sizeof(params);
+  params.dwFileFlags = FILE_FLAG_BACKUP_SEMANTICS;
+  return wil::unique_hfile{::CreateFile2(path.c_str(),
+                                         FILE_READ_ATTRIBUTES,
+                                         FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
+                                         OPEN_EXISTING,
+                                         &params)};
+}
+
+// Final-path query using VOLUME_NAME_NT, prefixed with "\\?\GLOBALROOT" to stay a valid Win32 path.
+bool TryGetFinalPathNt(const std::filesystem::path& path, std::filesystem::path& result) {
+  wil::unique_hfile handle = OpenHandleForFinalPath(path);
+  if (handle.get() == INVALID_HANDLE_VALUE) {
+    return false;
+  }
+
+  std::wstring buffer(MAX_PATH, L'\0');
+  constexpr DWORD kFlags = FILE_NAME_NORMALIZED | VOLUME_NAME_NT;
+  DWORD needed = ::GetFinalPathNameByHandleW(handle.get(), buffer.data(),
+                                             static_cast<DWORD>(buffer.size()), kFlags);
+  if (needed != 0 && needed >= buffer.size()) {
+    buffer.resize(needed);
+    needed = ::GetFinalPathNameByHandleW(handle.get(), buffer.data(),
+                                         static_cast<DWORD>(buffer.size()), kFlags);
+  }
+
+  if (needed == 0 || needed >= buffer.size()) {
+    return false;
+  }
+  buffer.resize(needed);
+
+  std::wstring prefixed;
+  prefixed.reserve(kGlobalRootPrefix.size() + buffer.size());
+  prefixed.append(kGlobalRootPrefix);
+  prefixed.append(buffer);
+  result = std::filesystem::path(std::move(prefixed));
+  return true;
+}
+
+// weakly_canonical analogue using TryGetFinalPathNt for the existing prefix.
+bool TryWeaklyCanonicalPathNtVolume(const std::filesystem::path& input,
+                                    std::filesystem::path& result) {
+  std::filesystem::path head = input;
+  std::filesystem::path tail;
+  std::filesystem::path canonical_head;
+  bool found_existing_prefix = false;
+
+  while (true) {
+    std::error_code ec;
+    const bool exists = std::filesystem::exists(head, ec);
+    if (ec) {
+      return false;
+    }
+    if (exists) {
+      if (!TryGetFinalPathNt(head, canonical_head)) {
+        return false;
+      }
+      found_existing_prefix = true;
+      break;
+    }
+    if (head.empty()) {
+      break;
+    }
+    const auto parent = head.parent_path();
+    if (parent == head) {
+      break;
+    }
+    const auto leaf = head.filename();
+    if (!leaf.empty()) {
+      // path / empty would insert a trailing separator.
+      tail = tail.empty() ? leaf : (leaf / tail);
+    }
+    head = parent;
+  }
+
+  if (!found_existing_prefix) {
+    return false;
+  }
+
+  if (tail.empty()) {
+    result = std::move(canonical_head);
+  } else {
+    result = (canonical_head / tail).lexically_normal();
+  }
+  return true;
+}
+
+}  // namespace
+
+// On AppContainer, std::filesystem::weakly_canonical fails with ERROR_ACCESS_DENIED
+// because VOLUME_NAME_DOS goes through the Volume Mount Manager. Fall back to
+// VOLUME_NAME_NT, which preserves volume identity (cross-volume escape rejection in
+// ValidateExternalDataPath relies on this — do NOT use VOLUME_NAME_NONE).
+common::Status WindowsEnv::GetWeaklyCanonicalPath(
+    const PathString& path,
+    PathString& canonical_path) const {
+  std::filesystem::path fs_path{path};
+  std::error_code ec;
+  std::filesystem::path canonical = std::filesystem::weakly_canonical(fs_path, ec);
+  if (!ec) {
+    canonical_path = canonical.native();
+    return Status::OK();
+  }
+
+  if (ec.value() == ERROR_ACCESS_DENIED) {
+    std::filesystem::path fallback;
+    if (TryWeaklyCanonicalPathNtVolume(fs_path, fallback)) {
+      canonical_path = fallback.native();
+      return Status::OK();
+    }
+  }
+
+  return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL,
+                         "Failed to get the weakly canonical path: ",
+                         ToUTF8String(path), " - ", ec.message());
+}
+
+namespace internal {
+bool WeaklyCanonicalPathNtVolumeFallbackForTesting(const std::filesystem::path& input,
+                                                   std::filesystem::path& result) {
+  return TryWeaklyCanonicalPathNtVolume(input, result);
+}
+}  // namespace internal
+
 // Return the path of the executable/shared library for the current running code. This is to make it
 // possible to load other shared libraries installed next to our core runtime code.
 PathString WindowsEnv::GetRuntimePath() const {
diff --git a/onnxruntime/core/platform/windows/env.h b/onnxruntime/core/platform/windows/env.h
index f118e05d42ead..df8a3e10d512a 100644
--- a/onnxruntime/core/platform/windows/env.h
+++ b/onnxruntime/core/platform/windows/env.h
@@ -18,6 +18,7 @@ limitations under the License.
 #include "core/platform/windows/telemetry.h"
 #include "core/common/inlined_containers.h"
 #include <Windows.h>
+#include <filesystem>
 
 namespace onnxruntime {
 
@@ -79,6 +80,7 @@ class WindowsEnv : public Env {
   common::Status FileOpenWr(const std::string& path, /*out*/ int& fd) const override;
   common::Status FileClose(int fd) const override;
   common::Status GetCanonicalPath(const PathString& path, PathString& canonical_path) const override;
+  common::Status GetWeaklyCanonicalPath(const PathString& path, PathString& canonical_path) const override;
   PathString GetRuntimePath() const override;
   Status LoadDynamicLibrary(const PathString& library_filename, bool /*global_symbols*/, void** handle) const override;
   Status UnloadDynamicLibrary(void* handle) const override;
@@ -141,4 +143,10 @@ class WindowsEnv : public Env {
   WindowsTelemetry telemetry_provider_;
 };
 
+namespace internal {
+// Test-only: exposes the AppContainer fallback used by WindowsEnv::GetWeaklyCanonicalPath.
+bool WeaklyCanonicalPathNtVolumeFallbackForTesting(const std::filesystem::path& input,
+                                                   std::filesystem::path& result);
+}  // namespace internal
+
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/cpu/nn/conv.cc b/onnxruntime/core/providers/cpu/nn/conv.cc
index e2db5ac238f52..87ce1b05caae2 100644
--- a/onnxruntime/core/providers/cpu/nn/conv.cc
+++ b/onnxruntime/core/providers/cpu/nn/conv.cc
@@ -15,15 +15,53 @@
  */
 /* Modifications Copyright (c) Microsoft. */
 
+#include <vector>
+
 #include "core/providers/cpu/nn/conv.h"
 
 #include "core/common/narrow.h"
 #include "core/common/safeint.h"
 #include "core/util/math_cpuonly.h"
 
+#if defined(USE_KLEIDIAI) && defined(__aarch64__) && defined(__linux__)
+#include "core/mlas/lib/kleidiai/mlasi_kleidiai.h"
+#endif
+
 namespace onnxruntime {
 using ConvPadVector = ConvAttributes::ConvPadVector;
 
+namespace {
+
+template <typename T>
+void ConvertNHWCToNCHW(const T* src, T* dst,
+                       int64_t n, int64_t c, int64_t h, int64_t w,
+                       concurrency::ThreadPool* thread_pool) {
+  const size_t n_count = narrow<size_t>(n);
+  const size_t c_count = narrow<size_t>(c);
+  const size_t hw = narrow<size_t>(SafeInt<int64_t>(h) * w);
+  for (size_t n_idx = 0; n_idx < n_count; ++n_idx) {
+    const size_t n_src_offset = SafeInt<size_t>(SafeInt<size_t>(n_idx) * hw) * c_count;
+    const size_t n_dst_offset = SafeInt<size_t>(SafeInt<size_t>(n_idx) * c_count) * hw;
+    MlasTranspose(src + n_src_offset, dst + n_dst_offset, hw, c_count, thread_pool);
+  }
+}
+
+template <typename T>
+void ConvertNCHWToNHWC(const T* src, T* dst,
+                       int64_t n, int64_t c, int64_t h, int64_t w,
+                       concurrency::ThreadPool* thread_pool) {
+  const size_t n_count = narrow<size_t>(n);
+  const size_t c_count = narrow<size_t>(c);
+  const size_t hw = narrow<size_t>(SafeInt<int64_t>(h) * w);
+  for (size_t n_idx = 0; n_idx < n_count; ++n_idx) {
+    const size_t n_src_offset = SafeInt<size_t>(SafeInt<size_t>(n_idx) * c_count) * hw;
+    const size_t n_dst_offset = SafeInt<size_t>(SafeInt<size_t>(n_idx) * hw) * c_count;
+    MlasTranspose(src + n_src_offset, dst + n_dst_offset, c_count, hw, thread_pool);
+  }
+}
+
+}  // namespace
+
 template <typename T>
 Status Conv<T>::Compute(OpKernelContext* context) const {
   const auto* X = context->Input<Tensor>(0);
@@ -153,20 +191,77 @@ Status Conv<T>::Compute(OpKernelContext* context) const {
   return Status::OK();
 }
 
+#if defined(USE_KLEIDIAI) && defined(__aarch64__) && defined(__linux__)
+Status Conv<float>::EnsurePackedChannelsLastFilter(concurrency::ThreadPool* thread_pool,
+                                                   size_t filter_count_per_group,
+                                                   size_t input_channels_per_group,
+                                                   const TensorShapeVector& kernel_shape,
+                                                   const TensorShapeVector& dilations) const {
+  if (!can_cache_packed_filter_) {
+    return Status::OK();
+  }
+
+  std::call_once(packed_filter_once_, [&] {
+    packed_filter_status_ = Status::OK();
+
+    auto alloc = Info().GetAllocator(OrtMemTypeDefault);
+    if (alloc == nullptr) {
+      packed_filter_status_ = ORT_MAKE_STATUS(ONNXRUNTIME, FAIL,
+                                              "Failed to get allocator for cached KleidiAI packed filter.");
+      return;
+    }
+
+    packed_filter_group_stride_ =
+        ArmKleidiAI::MlasConvSymmetricChannelsLast2DFloatPackWSize(filter_count_per_group,
+                                                                   input_channels_per_group,
+                                                                   kernel_shape.data(),
+                                                                   dilations.data());
+    if (packed_filter_group_stride_ == 0) {
+      packed_filter_status_ = ORT_MAKE_STATUS(ONNXRUNTIME, FAIL,
+                                              "Failed to get KleidiAI packed filter size.");
+      return;
+    }
+
+    const size_t packed_filter_size =
+        packed_filter_group_stride_ * onnxruntime::narrow<size_t>(conv_attrs_.group);
+    packed_filter_ = IAllocator::MakeUniquePtr<void>(alloc, packed_filter_size, true);
+    memset(packed_filter_.get(), 0, packed_filter_size);
+
+    ArmKleidiAI::MlasConvSymmetricChannelsLast2DFloatPackW(filter_count_per_group,
+                                                           input_channels_per_group,
+                                                           kernel_shape.data(),
+                                                           dilations.data(),
+                                                           onnxruntime::narrow<size_t>(conv_attrs_.group),
+                                                           constant_filter_tensor_->Data<float>(),
+                                                           constant_bias_tensor_ ? constant_bias_tensor_->Data<float>() : nullptr,
+                                                           packed_filter_.get(),
+                                                           packed_filter_group_stride_,
+                                                           thread_pool);
+  });
+
+  return packed_filter_status_;
+}
+#endif
+
 Status Conv<float>::Compute(OpKernelContext* context) const {
   size_t num_inputs = OpKernel::Node().InputDefs().size();
   const Tensor* X = context->Input<Tensor>(0);
   const Tensor* W = context->Input<Tensor>(1);
   const Tensor* B = num_inputs >= 3 ? context->Input<Tensor>(2) : nullptr;
   const Tensor* Sum = num_inputs >= 4 ? context->Input<Tensor>(3) : nullptr;
+  ORT_RETURN_IF_ERROR(conv_attrs_.ValidateInputShape(X->Shape(), W->Shape(), channels_last_));
   const int64_t N = X->Shape()[0];
-  const int64_t C = X->Shape()[1];
+  // If channels_last_ we should get the back dim for channels instead of [1]
+  const int64_t C = channels_last_ ? X->Shape().GetDims().back() : X->Shape()[1];
   const int64_t M = W->Shape()[0];
-  ORT_RETURN_IF_ERROR(conv_attrs_.ValidateInputShape(X, W));
 
-  // kernel_shape is an optional attribute and has to be inferred from W if not provided
   TensorShapeVector kernel_shape;
   ORT_RETURN_IF_ERROR(conv_attrs_.ComputeKernelShape(W->Shape(), kernel_shape));
+  const size_t kernel_rank = kernel_shape.size();
+
+  if (channels_last_) {
+    ORT_RETURN_IF_NOT(kernel_rank == 2, "Conv with channels_last layout currently supports 2D kernels.");
+  }
 
   ConvPadVector pads(conv_attrs_.pads);
   if (pads.empty()) {
@@ -182,12 +277,14 @@ Status Conv<float>::Compute(OpKernelContext* context) const {
   }
 
   TensorShapeVector Y_dims({N, M});
-  TensorShape input_shape = X->Shape().Slice(2);
+  TensorShape input_shape = channels_last_ ? X->Shape().Slice(1, 3) : X->Shape().Slice(2);
   ORT_RETURN_IF_ERROR(conv_attrs_.InferPadsAndOutputShape(input_shape, kernel_shape, strides, dilations, pads, Y_dims));
+  if (channels_last_) {
+    Y_dims = {Y_dims[0], Y_dims[2], Y_dims[3], Y_dims[1]};
+  }
   Tensor* Y = context->Output(0, TensorShape(Y_dims));
-  TensorShape output_shape = Y->Shape().Slice(2);
+  TensorShape output_shape = channels_last_ ? TensorShape(Y_dims).Slice(1, 3) : Y->Shape().Slice(2);
 
-  // Bail out early if one of the dimensions is zero.
   if (Y->Shape().Size() == 0) {
     return Status::OK();
   }
@@ -198,23 +295,75 @@ Status Conv<float>::Compute(OpKernelContext* context) const {
   auto Xdata = X->DataAsSpan<float>();
   const auto* Bdata = B != nullptr ? B->Data<float>() : nullptr;
   auto Ydata = Y->MutableDataAsSpan<float>();
-  // Check for the optional Conv/Sum fusion.
+  concurrency::ThreadPool* thread_pool = context->GetOperatorThreadPool();
+
+  const bool wants_channels_last = channels_last_;
+  const bool sum_present = Sum != nullptr;
+  std::array<size_t, 2> input_shape_size_t{};
+  std::array<size_t, 2> kernel_shape_size_t{};
+  std::array<size_t, 2> dilations_size_t{};
+  std::array<size_t, 4> pads_size_t{};
+  std::array<size_t, 2> strides_size_t{};
+  if (wants_channels_last) {
+    ORT_RETURN_IF_NOT(input_shape.NumDimensions() == 2, "Nhwc Conv fast-path expects 2D input shape.");
+    for (size_t i = 0; i < 2; ++i) {
+      input_shape_size_t[i] = narrow<size_t>(input_shape[i]);
+      kernel_shape_size_t[i] = narrow<size_t>(kernel_shape[i]);
+      dilations_size_t[i] = narrow<size_t>(dilations[i]);
+      strides_size_t[i] = narrow<size_t>(strides[i]);
+      pads_size_t[i] = narrow<size_t>(pads[i]);
+      pads_size_t[i + 2] = narrow<size_t>(pads[i + 2]);
+    }
+  }
+  const bool nhwc_fastpath =
+      wants_channels_last && !sum_present &&
+      MlasConvSupportsSymmetricChannelsLast2DFloatKernel(
+          kernel_rank,
+          narrow<size_t>(N),
+          narrow<size_t>(conv_attrs_.group),
+          input_shape_size_t.data(),
+          kernel_shape_size_t.data(),
+          dilations_size_t.data(),
+          pads_size_t.data(),
+          strides_size_t.data(),
+          narrow<size_t>(M / conv_attrs_.group),
+          /*Beta*/ 0.0f);
+
+#if defined(USE_KLEIDIAI) && defined(__aarch64__) && defined(__linux__)
+  if (nhwc_fastpath && can_cache_packed_filter_) {
+    ORT_RETURN_IF_ERROR(EnsurePackedChannelsLastFilter(thread_pool,
+                                                       narrow<size_t>(M / conv_attrs_.group),
+                                                       narrow<size_t>(C / conv_attrs_.group),
+                                                       kernel_shape,
+                                                       dilations));
+  }
+#endif
+
+  const bool manual_sum = wants_channels_last && !nhwc_fastpath && sum_present;
+  MLAS_ACTIVATION pre_sum_activation = activation_;
+  if (manual_sum) {
+    pre_sum_activation.ActivationKind = MlasIdentityActivation;
+  }
+
+  const float* sum_manual_data = nullptr;
+
   float Beta = 0.0f;
-  if (Sum != nullptr) {
+  if (sum_present) {
     const auto& sum_shape = Sum->Shape();
     ORT_RETURN_IF_NOT(Y->Shape() == sum_shape, "output and sum shape must match");
-    // If the output was not allocated inplace with the sum tensor, then copy here.
-    auto sum_data = Sum->DataAsSpan<float>();
-    if (Ydata.data() != sum_data.data()) {
-      gsl::copy(sum_data, Ydata);
+    if (manual_sum) {
+      sum_manual_data = Sum->Data<float>();
+    } else {
+      auto sum_span = Sum->DataAsSpan<float>();
+      if (Ydata.data() != sum_span.data()) {
+        gsl::copy(sum_span, Ydata);
+      }
+      Beta = 1.0f;
     }
-    Beta = 1.0f;
   }
-  const size_t kernel_rank = kernel_shape.size();
-  concurrency::ThreadPool* thread_pool = context->GetOperatorThreadPool();
 
   if (kernel_rank >= 1 && kernel_rank <= 3) {
-    MLAS_CONV_PARAMETERS Parameters;
+    MLAS_CONV_PARAMETERS Parameters{};
     Parameters.BackendKernelSelectorConfig = &mlas_backend_kernel_selector_config_;
 
     size_t WorkingBufferSize;
@@ -230,22 +379,90 @@ Status Conv<float>::Compute(OpKernelContext* context) const {
                     strides.data(),
                     output_shape.GetDims().data(),
                     narrow<size_t>(M / conv_attrs_.group),
-                    &activation_,
+                    manual_sum ? &pre_sum_activation : &activation_,
                     &WorkingBufferSize,
-                    Beta,
+                    nhwc_fastpath,
+                    nhwc_fastpath ? 0.0f : Beta,
                     thread_pool);
 
-    auto* working_data = WorkingBufferSize > 0 ? alloc->Alloc(sizeof(float) * SafeInt<size_t>(WorkingBufferSize))
-                                               : nullptr;
-    BufferUniquePtr working_buffer(working_data, BufferDeleter(std::move(alloc)));
+#if defined(USE_KLEIDIAI) && defined(__aarch64__) && defined(__linux__)
+    if (nhwc_fastpath && packed_filter_ != nullptr) {
+      Parameters.FilterIsPacked = true;
+      Parameters.PackedFilter = packed_filter_.get();
+      Parameters.PackedFilterGroupStride = packed_filter_group_stride_;
+    }
+#endif
+
+    float* working_data = nullptr;
+    BufferUniquePtr working_buffer;
+    if (WorkingBufferSize > 0) {
+      working_data = static_cast<float*>(alloc->Alloc(sizeof(float) * SafeInt<size_t>(WorkingBufferSize)));
+      working_buffer = BufferUniquePtr(working_data, BufferDeleter(alloc));
+    }
+
+    float* output_compute = Ydata.data();
+    BufferUniquePtr output_temp;
+    if (wants_channels_last && !nhwc_fastpath) {
+      const SafeInt<size_t> output_compute_size =
+          SafeInt<size_t>(Y->Shape()[0]) * SafeInt<size_t>(M) *
+          SafeInt<size_t>(output_shape[0]) * SafeInt<size_t>(output_shape[1]);
+      float* temp_output = static_cast<float*>(alloc->Alloc(sizeof(float) * output_compute_size));
+      output_temp = BufferUniquePtr(temp_output, BufferDeleter(alloc));
+      output_compute = temp_output;
+    }
+
+    const float* input_compute = Xdata.data();
+    BufferUniquePtr input_temp;
+    if (wants_channels_last && !nhwc_fastpath) {
+      ORT_RETURN_IF_NOT(X->Shape().NumDimensions() == 4, "Nhwc fallback expects 4D input.");
+      const auto& x_dims = X->Shape().GetDims();
+      const int64_t input_n = x_dims[0];
+      const int64_t input_h = x_dims[1];
+      const int64_t input_w = x_dims[2];
+      const int64_t input_c = x_dims[3];
+      const SafeInt<size_t> input_elements = SafeInt<size_t>(X->Shape().Size());
+      float* temp_input = static_cast<float*>(alloc->Alloc(sizeof(float) * input_elements));
+      input_temp = BufferUniquePtr(temp_input, BufferDeleter(alloc));
+      ConvertNHWCToNCHW(X->Data<float>(), temp_input,
+                        input_n, input_c, input_h, input_w, thread_pool);
+      input_compute = temp_input;
+    }
 
     MlasConv(&Parameters,
-             Xdata.data(),
+             input_compute,
              W->Data<float>(),
              Bdata,
-             static_cast<float*>(working_buffer.get()),
-             Ydata.data(),
+             working_data,
+             output_compute,
              thread_pool);
+
+    if (wants_channels_last && !nhwc_fastpath) {
+      const auto& y_dims = Y->Shape().GetDims();
+      ORT_RETURN_IF_NOT(y_dims.size() == 4, "Nhwc fallback expects 4D output.");
+      if (manual_sum) {
+        const SafeInt<size_t> output_elements = SafeInt<size_t>(Y->Shape().Size());
+        float* sum_nchw = static_cast<float*>(alloc->Alloc(sizeof(float) * output_elements));
+        BufferUniquePtr sum_nchw_buffer(sum_nchw, BufferDeleter(alloc));
+        ConvertNHWCToNCHW(sum_manual_data,
+                          sum_nchw,
+                          y_dims[0], y_dims[3], y_dims[1], y_dims[2], thread_pool);
+
+        auto output_span = gsl::make_span(output_compute, static_cast<size_t>(output_elements));
+        auto sum_span = gsl::make_span(sum_nchw, static_cast<size_t>(output_elements));
+        for (size_t i = 0; i < output_span.size(); ++i) {
+          output_span[i] += sum_span[i];
+        }
+
+        const auto activation_rows = narrow<size_t>(SafeInt<int64_t>(y_dims[0]) * y_dims[3]);
+        const auto activation_cols = narrow<size_t>(output_shape.Size());
+        MlasActivation(&activation_, output_compute, nullptr, activation_rows,
+                       activation_cols, activation_cols);
+      }
+
+      ConvertNCHWToNHWC(output_compute,
+                        Ydata.data(),
+                        y_dims[0], y_dims[3], y_dims[1], y_dims[2], thread_pool);
+    }
   } else {
     const int64_t input_image_size = input_shape.Size();
     const int64_t output_image_size = output_shape.Size();
@@ -287,7 +504,8 @@ Status Conv<float>::Compute(OpKernelContext* context) const {
             &mlas_backend_kernel_selector_config_);
       }
 
-      MlasActivation(&activation_, Ydata.data(), Bdata, narrow<size_t>(M), narrow<size_t>(output_image_size), narrow<size_t>(output_image_size));
+      MlasActivation(&activation_, Ydata.data(), Bdata, narrow<size_t>(M),
+                     narrow<size_t>(output_image_size), narrow<size_t>(output_image_size));
 
       Xdata = Xdata.subspan(X_offset * conv_attrs_.group);
       Ydata = Ydata.subspan(Y_offset * conv_attrs_.group);
diff --git a/onnxruntime/core/providers/cpu/nn/conv.h b/onnxruntime/core/providers/cpu/nn/conv.h
index 8992af9792d13..1cbe417cdbd96 100644
--- a/onnxruntime/core/providers/cpu/nn/conv.h
+++ b/onnxruntime/core/providers/cpu/nn/conv.h
@@ -3,6 +3,8 @@
 
 #pragma once
 
+#include <mutex>
+
 #include "core/framework/op_kernel.h"
 #include "core/providers/cpu/nn/conv_attributes.h"
 #include "core/providers/cpu/mlas_backend_kernel_selector_config_utils.h"
@@ -25,9 +27,23 @@ class Conv : public OpKernel {
 template <>
 class Conv<float> : public OpKernel {
  public:
-  Conv(const OpKernelInfo& info) : OpKernel(info), conv_attrs_(info) {
+  Conv(const OpKernelInfo& info) : OpKernel(info), conv_attrs_(info), channels_last_(info.GetKernelDef().OpName() == "NhwcFusedConv") {
     activation_.ActivationKind = MlasIdentityActivation;
     SetupMlasBackendKernelSelectorFromConfigOptions(mlas_backend_kernel_selector_config_, info.GetConfigOptions());
+
+#if defined(USE_KLEIDIAI) && defined(__aarch64__) && defined(__linux__)
+    if (channels_last_) {
+      const auto& input_defs = info.node().InputDefs();
+      const bool has_bias_input = input_defs.size() >= 3 && input_defs[2] != nullptr;
+      info.TryGetConstantInput(1, &constant_filter_tensor_);
+      if (has_bias_input) {
+        info.TryGetConstantInput(2, &constant_bias_tensor_);
+      }
+
+      can_cache_packed_filter_ =
+          constant_filter_tensor_ != nullptr && (!has_bias_input || constant_bias_tensor_ != nullptr);
+    }
+#endif
   }
 
   Status Compute(OpKernelContext* context) const override;
@@ -38,6 +54,24 @@ class Conv<float> : public OpKernel {
   MLAS_BACKEND_KERNEL_SELECTOR_CONFIG mlas_backend_kernel_selector_config_;
 
   ConvAttributes conv_attrs_;
+  bool channels_last_{false};
+
+#if defined(USE_KLEIDIAI) && defined(__aarch64__) && defined(__linux__)
+ private:
+  Status EnsurePackedChannelsLastFilter(concurrency::ThreadPool* thread_pool,
+                                        size_t filter_count_per_group,
+                                        size_t input_channels_per_group,
+                                        const TensorShapeVector& kernel_shape,
+                                        const TensorShapeVector& dilations) const;
+
+  const Tensor* constant_filter_tensor_{nullptr};
+  const Tensor* constant_bias_tensor_{nullptr};
+  bool can_cache_packed_filter_{false};
+  mutable std::once_flag packed_filter_once_;
+  mutable Status packed_filter_status_;
+  mutable IAllocatorUniquePtr<void> packed_filter_;
+  mutable size_t packed_filter_group_stride_{0};
+#endif
 };
 
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/cpu/tensor/expand.cc b/onnxruntime/core/providers/cpu/tensor/expand.cc
index b0c636281bc7a..6d299282f3e60 100644
--- a/onnxruntime/core/providers/cpu/tensor/expand.cc
+++ b/onnxruntime/core/providers/cpu/tensor/expand.cc
@@ -94,8 +94,8 @@ Status Expand<T>::Compute(OpKernelContext* context) const {
     auto input_dim = input_dims_iter > -1 ? input_dims[input_dims_iter] : 1;
     auto output_dim = output_dims[output_dims_iter];
 
-    input_count *= input_dim;
-    output_count *= output_dim;
+    input_count = SafeInt<int64_t>(input_count) * input_dim;
+    output_count = SafeInt<int64_t>(output_count) * output_dim;
 
     if (0 == input_count || 0 == output_count) {
       return Status::OK();
@@ -106,26 +106,26 @@ Status Expand<T>::Compute(OpKernelContext* context) const {
       input_dim_group[onnxruntime::narrow<size_t>(dim_group_start)] = input_count;
       output_dim_group[onnxruntime::narrow<size_t>(dim_group_start)] = output_count;
       expand_dim_size[onnxruntime::narrow<size_t>(dim_group_start)] = output_count / input_count / last_dim_size;
-      last_dim_size *= expand_dim_size[onnxruntime::narrow<size_t>(dim_group_start)];
+      last_dim_size = SafeInt<int64_t>(last_dim_size) * expand_dim_size[onnxruntime::narrow<size_t>(dim_group_start)];
     }
   }
 
   auto distribute_count = input_dim_group[onnxruntime::narrow<size_t>(dim_group_start)] / input_dim_group[SafeInt<size_t>(max_dims_size) - 1];
   std::vector<int64_t> output_offsets(onnxruntime::narrow<size_t>(distribute_count), 0);
   int64_t copy_len = input_dim_group[SafeInt<size_t>(max_dims_size) - 1];
-  auto copy_byte = copy_len * sizeof(T);
+  size_t copy_byte = SafeInt<size_t>(copy_len) * sizeof(T);
 
   auto distribute_fn =
       [&](ptrdiff_t i_start, ptrdiff_t i_end) {
         for (auto i = i_start; i < i_end; i++) {
-          auto input_offset = i * copy_len;
+          int64_t input_offset = SafeInt<int64_t>(i) * copy_len;
           int64_t output_offset = 0;
           for (auto j = dim_group_start + 1, remains = input_offset; j < max_dims_size; ++j) {
             auto current_count = remains / input_dim_group[onnxruntime::narrow<size_t>(j)];
-            output_offset += current_count * output_dim_group[onnxruntime::narrow<size_t>(j)];
+            output_offset = SafeInt<int64_t>(output_offset) + SafeInt<int64_t>(current_count) * output_dim_group[onnxruntime::narrow<size_t>(j)];
             remains = remains % input_dim_group[onnxruntime::narrow<size_t>(j)];
           }  // for j
-          memcpy(output_data + output_offset, input_data + input_offset, onnxruntime::narrow<size_t>(copy_byte));
+          memcpy(output_data + output_offset, input_data + input_offset, copy_byte);
           output_offsets[onnxruntime::narrow<size_t>(i)] = output_offset;
         }  // for i
       };  // distribute_fn
diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
index 9241e35616ba2..d9b5760848678 100755
--- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
+++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
@@ -1409,9 +1409,9 @@ class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kO
 class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 14, 21, double, GRU);
 class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 14, 21, MLFloat16, GRU);
 class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 14, 18, Identity);
-class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 14, float, LSTM);
-class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 14, double, LSTM);
-class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 14, MLFloat16, LSTM);
+class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 14, 21, float, LSTM);
+class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 14, 21, double, LSTM);
+class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 14, 21, MLFloat16, LSTM);
 class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 14, 18, Reshape);
 class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 14, 21, float, RNN);
 class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 14, 21, double, RNN);
@@ -1702,6 +1702,9 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain,
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 22, double, HardSwish);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 22, MLFloat16, HardSwish);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 22, BFloat16, HardSwish);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 22, float, LSTM);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 22, double, LSTM);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 22, MLFloat16, LSTM);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 22, RandomNormal);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 22, RandomNormalLike);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 22, RandomUniform);
@@ -2681,9 +2684,9 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) {
       BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 14, 21, float, GRU)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 14, 21, double, GRU)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 14, 21, MLFloat16, GRU)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 14, float, LSTM)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 14, double, LSTM)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 14, MLFloat16, LSTM)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 14, 21, float, LSTM)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 14, 21, double, LSTM)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 14, 21, MLFloat16, LSTM)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 14, 18, Reshape)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 14, 21, float, RNN)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 14, 21, double, RNN)>,
@@ -2975,6 +2978,9 @@ static Status RegisterCudaKernels(KernelRegistry& kernel_registry) {
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 22, double, HardSwish)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 22, MLFloat16, HardSwish)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 22, BFloat16, HardSwish)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 22, float, LSTM)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 22, double, LSTM)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 22, MLFloat16, LSTM)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 22, RandomNormal)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 22, RandomNormalLike)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 22, RandomUniform)>,
diff --git a/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.cc b/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.cc
index e7c8f52950141..62ec59c0ef798 100644
--- a/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.cc
+++ b/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.cc
@@ -4,6 +4,7 @@
 #include "core/providers/shared_library/provider_api.h"
 #include "cudnn_rnn_base.h"
 #include "rnn_impl.h"
+#include "core/common/safeint.h"
 
 namespace onnxruntime {
 namespace cuda {
@@ -16,7 +17,7 @@ Status CudnnRnnBase<T>::SetWeightBias(const cudnnHandle_t handle,
                                       const void* reorganized_w_data,
                                       const int lin_layer_id,
                                       const T* pos,
-                                      int& offset,
+                                      size_t& offset,
                                       bool is_matrix,
                                       cudaStream_t cuda_stream) const {
   int numDims;
@@ -37,12 +38,12 @@ Status CudnnRnnBase<T>::SetWeightBias(const cudnnHandle_t handle,
       is_matrix ? tensor_desc_matrix : tensor_desc_bias, 3, &dt, &numDims, matDims.data(), strideA.data()));
 
   mem_offset = is_matrix ? mem_offset_matrix : mem_offset_bias;
-  int count = matDims[0] * matDims[1] * matDims[2];
+  size_t count = SafeInt<size_t>(matDims[0]) * matDims[1] * matDims[2];
 
-  if (strideA[0] != count) {
+  if (static_cast<size_t>(strideA[0]) != count) {
     return ORT_MAKE_STATUS(ONNXRUNTIME, StatusCode::INVALID_ARGUMENT, "Stride is not packed");
   }
-  CUDA_CALL_THROW(cudaMemcpyAsync(mem_offset, pos + offset, count * sizeof(T), cudaMemcpyDeviceToDevice, cuda_stream));
+  CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(mem_offset, pos + offset, count * sizeof(T), cudaMemcpyDeviceToDevice, cuda_stream));
 
   offset += count;
 
@@ -57,9 +58,9 @@ Status CudnnRnnBase<T>::SetCudnnRnnWeightBias(const cudnnHandle_t cudnn_handle,
                                               const T* R_data,
                                               const T* B_data,
                                               cudaStream_t cuda_stream) const {
-  int w_offset = 0;
-  int r_offset = 0;
-  int bias_offset = 0;
+  size_t w_offset = 0;
+  size_t r_offset = 0;
+  size_t bias_offset = 0;
   for (int layer = 0; layer < RNN_NUM_LAYERS * num_directions_; ++layer) {
     for (size_t idx = 0; idx < W_lin_layer_id_.size(); ++idx) {
       ORT_RETURN_IF_ERROR(SetWeightBias(
@@ -92,6 +93,12 @@ Status CudnnRnnBase<T>::ReorganizeWeights(const Tensor* W, const Tensor* R, cons
                                           void* alloc_stream, cudaStream_t cuda_stream,
                                           cudnnHandle_t cudnn_handle) const {
   typedef typename ToCudaType<T>::MappedType CudaT;
+  ORT_RETURN_IF(W->Shape().NumDimensions() != 3,
+                "Weight W must be 3-D [num_directions, hidden_size, input_size], got rank ",
+                W->Shape().NumDimensions());
+  ORT_RETURN_IF(R->Shape().NumDimensions() != 3,
+                "Recurrence R must be 3-D [num_directions, hidden_size, hidden_size], got rank ",
+                R->Shape().NumDimensions());
   int64_t input_size = W->Shape()[2];
   // RNN W[num_directions_, hidden_size_, input_size]
   // RNN R[num_directions_, hidden_size_, hidden_size_]
@@ -103,12 +110,12 @@ Status CudnnRnnBase<T>::ReorganizeWeights(const Tensor* W, const Tensor* R, cons
   // LSTM R[num_directions_, 4*hidden_size_, hidden_size_]
   // LSTM B[num_directions_, 8*hidden_size_]
   size_t number = W_lin_layer_id_.size();
-  int64_t w_size = num_directions_ * (number * hidden_size_ * (input_size + hidden_size_ + 2));
+  int64_t w_size = SafeInt<int64_t>(num_directions_) * number * hidden_size_ * (input_size + hidden_size_ + 2);
   TensorShapeVector dims_w({w_size, 1, 1});
   ORT_RETURN_IF_ERROR(target_w_desc.Set(dims_w, CudnnTensor::GetDataType<CudaT>()));
 
   // Prepare the weight data
-  reorganized_w_data_size_in_bytes = w_size * sizeof(T);
+  reorganized_w_data_size_in_bytes = SafeInt<size_t>(w_size) * sizeof(T);
   reorganized_w_data = GetScratchBuffer<void>(reorganized_w_data_size_in_bytes, alloc_stream);
 
   // In many cases, this allocation is bigger than needed, leaving part of
@@ -142,6 +149,8 @@ Status CudnnRnnBase<T>::CacheCudnnRnnWeights(const OpKernelInfo& info) {
   bool has_bias = B != nullptr;
 
   if (get_W && get_R) {
+    ORT_RETURN_IF(W->Shape().NumDimensions() != 3,
+                  "Constant W must be 3-D, got rank ", W->Shape().NumDimensions());
     CudnnRNN tmp_rnn_desc;
     auto proj_size = hidden_size_;
     ORT_RETURN_IF_ERROR(tmp_rnn_desc.Set(W->Shape()[2],  // input_size
@@ -163,7 +172,7 @@ Status CudnnRnnBase<T>::CacheCudnnRnnWeights(const OpKernelInfo& info) {
                                             w_data_cache_size_in_bytes_, w_data_cache_, w_desc_cache_,
                                             tmp_rnn_desc, nullptr, nullptr, DefaultCudnnHandle()));
     }
-    cudaStreamSynchronize(nullptr);
+    CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(nullptr));
 
     weight_cached_ = true;
   }
@@ -177,6 +186,9 @@ Status CudnnRnnBase<T>::ComputeInternal(OpKernelContext* ctx) const {
   // inputs
   const Tensor* X = ctx->Input<Tensor>(RNN_Input_Index::X);  // inputs. [seq_length, batch_size, input_size]
   ORT_ENFORCE(nullptr != X);
+  ORT_RETURN_IF(X->Shape().NumDimensions() != 3,
+                "Input X must be 3-D [seq_length, batch_size, input_size], got rank ",
+                X->Shape().NumDimensions());
 
   // optional inputs
   // [batch_size]
@@ -187,6 +199,10 @@ Status CudnnRnnBase<T>::ComputeInternal(OpKernelContext* ctx) const {
   if (rnn_mode_ == CUDNN_LSTM) {
     // initial cell. [num_directions_, batch_size, hidden_size_]
     initial_c = ctx->Input<Tensor>(RNN_Input_Index::initial_c);
+    // cuDNN LSTM does not support peephole weights (ONNX input P at index 7)
+    const Tensor* P = ctx->Input<Tensor>(7);
+    ORT_RETURN_IF(P != nullptr,
+                  "CUDA LSTM does not support peephole weights (input P). Use CPU EP instead.");
   }
 
   size_t proj_size = hidden_size_;
@@ -197,7 +213,7 @@ Status CudnnRnnBase<T>::ComputeInternal(OpKernelContext* ctx) const {
   // we thread a single input as sequence_lens of length 1, require to expand to [batch_size]?
   std::vector<int32_t> sequence_lengths_temp;
   if (!sequence_lens) {
-    sequence_lengths_temp.resize(batch_size, gsl::narrow_cast<int32_t>(seq_length));
+    sequence_lengths_temp.resize(batch_size, gsl::narrow<int32_t>(seq_length));
   }
 
   const int32_t* sequence_lens_data = (sequence_lens == nullptr)
@@ -214,10 +230,10 @@ Status CudnnRnnBase<T>::ComputeInternal(OpKernelContext* ctx) const {
 
   // 0-len sequences are not supported by cuDNN.
   // Replace them by sequences of len 1 and mask them out with SetZeroSequences
-  for (int i = 0; i < batch_size; ++i) {
+  for (int64_t i = 0; i < batch_size; ++i) {
     if (0 == sequence_lens_data[i]) {
       seq_len_array[i] = 1;
-      zero_seq_index_cache[zero_seq_count] = i;
+      zero_seq_index_cache[zero_seq_count] = gsl::narrow<int32_t>(i);
       ++zero_seq_count;
     } else {
       seq_len_array[i] = sequence_lens_data[i];
@@ -257,15 +273,15 @@ Status CudnnRnnBase<T>::ComputeInternal(OpKernelContext* ctx) const {
   const T* x_data = X->Data<T>();
   if (reverse_) {
     // reverse input data
-    x_reversed_data = GetScratchBuffer<T>(seq_length * batch_size * input_size, GetComputeStream(ctx));
+    x_reversed_data = GetScratchBuffer<T>(SafeInt<int64_t>(seq_length) * batch_size * input_size, GetComputeStream(ctx));
     ReverseBySequence(Stream(ctx),
-                      gsl::narrow_cast<int32_t>(seq_length),
+                      gsl::narrow<int32_t>(seq_length),
                       sequence_lens_buffer.GpuPtr(),
-                      gsl::narrow_cast<int32_t>(batch_size),
-                      gsl::narrow_cast<int32_t>(input_size),
+                      gsl::narrow<int32_t>(batch_size),
+                      gsl::narrow<int32_t>(input_size),
                       reinterpret_cast<const CudaT*>(x_data),
                       reinterpret_cast<CudaT*>(x_reversed_data.get()),
-                      seq_length * batch_size * input_size);
+                      SafeInt<int64_t>(seq_length) * batch_size * input_size);
   }
 
   const T* x_data_input = reverse_ ? x_reversed_data.get() : x_data;
@@ -274,7 +290,7 @@ Status CudnnRnnBase<T>::ComputeInternal(OpKernelContext* ctx) const {
   const T* cx_data = (initial_c == nullptr) ? nullptr : initial_c->Data<T>();
   T* y_h_data = (Y_h == nullptr) ? nullptr : Y_h->MutableData<T>();
   T* y_c_data = (Y_c == nullptr) ? nullptr : Y_c->MutableData<T>();
-  int64_t output_size = seq_length * num_directions_ * batch_size * hidden_size_;
+  int64_t output_size = SafeInt<int64_t>(seq_length) * num_directions_ * batch_size * hidden_size_;
   T* y_data = nullptr;
   IAllocatorUniquePtr<T> y_alloc_data;
   if (Y != nullptr) {
@@ -357,7 +373,8 @@ Status CudnnRnnBase<T>::ComputeInternal(OpKernelContext* ctx) const {
     // Mask on output for 0 sequence batches
     if (zero_seq_count > 0) {
       // Mask on output for 0 sequence batches
-      SetZeroSequences(zero_seq_count, zero_seq_index_cache, y_data, y_h_data, y_c_data, GetComputeStream(ctx), Stream(ctx));
+      SetZeroSequences(gsl::span<const int32_t>(zero_seq_index_cache.data(), zero_seq_count),
+                       y_data, y_h_data, y_c_data, GetComputeStream(ctx), Stream(ctx));
     }
     return Status::OK();
   }
@@ -369,18 +386,18 @@ Status CudnnRnnBase<T>::ComputeInternal(OpKernelContext* ctx) const {
     if (reverse_) {
       // reverse output data
       ReverseBySequence(Stream(ctx),
-                        gsl::narrow_cast<int32_t>(seq_length),
+                        gsl::narrow<int32_t>(seq_length),
                         sequence_lens_buffer.GpuPtr(),
-                        gsl::narrow_cast<int32_t>(batch_size),
-                        gsl::narrow_cast<int32_t>(hidden_size_),
+                        gsl::narrow<int32_t>(batch_size),
+                        gsl::narrow<int32_t>(hidden_size_),
                         reinterpret_cast<CudaT*>(y_data),
                         reinterpret_cast<CudaT*>(y_reorganized_data.get()),
                         output_size);
     } else {
       ReorderBidirectionalDataInSequence(Stream(ctx),
-                                         gsl::narrow_cast<int32_t>(seq_length),
-                                         gsl::narrow_cast<int32_t>(batch_size),
-                                         gsl::narrow_cast<int32_t>(hidden_size_),
+                                         gsl::narrow<int32_t>(seq_length),
+                                         gsl::narrow<int32_t>(batch_size),
+                                         gsl::narrow<int32_t>(hidden_size_),
                                          reinterpret_cast<CudaT*>(y_data),
                                          reinterpret_cast<CudaT*>(y_reorganized_data.get()),
                                          output_size);
@@ -388,7 +405,7 @@ Status CudnnRnnBase<T>::ComputeInternal(OpKernelContext* ctx) const {
 
     if (Y != nullptr) {
       // User specified this optional output, so need to copy the reversed data to original place
-      CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(y_data, y_reorganized_data.get(), output_size * sizeof(T),
+      CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(y_data, y_reorganized_data.get(), SafeInt<size_t>(output_size) * sizeof(T),
                                            cudaMemcpyDeviceToDevice, Stream(ctx)));
     } else {
       y_data = y_reorganized_data.get();
@@ -397,26 +414,27 @@ Status CudnnRnnBase<T>::ComputeInternal(OpKernelContext* ctx) const {
 
   // Mask on output for 0 sequence batches
   if (zero_seq_count > 0) {
-    SetZeroSequences(zero_seq_count, zero_seq_index_cache, y_data, y_h_data, y_c_data, GetComputeStream(ctx), Stream(ctx));
+    SetZeroSequences(gsl::span<const int32_t>(zero_seq_index_cache.data(), zero_seq_count),
+                     y_data, y_h_data, y_c_data, GetComputeStream(ctx), Stream(ctx));
   }
 
   return Status::OK();
 }
 
 template <typename T>
-void CudnnRnnBase<T>::SetZeroSequences(const int64_t zero_seq_index_cache_size,
-                                       const std::vector<int32_t> zero_seq_index_cache,
+void CudnnRnnBase<T>::SetZeroSequences(gsl::span<const int32_t> zero_seq_index_cache,
                                        T* y_data,
                                        T* y_h_data,
                                        T* y_c_data,
                                        void* alloc_stream, cudaStream_t cuda_stream) const {
   typedef typename ToCudaType<T>::MappedType CudaT;
+  const int64_t zero_seq_index_cache_size = static_cast<int64_t>(zero_seq_index_cache.size());
   CudaAsyncBuffer<int32_t> zero_seq_index_cache_async_buffer(this, zero_seq_index_cache_size);
   memcpy(zero_seq_index_cache_async_buffer.CpuPtr(), zero_seq_index_cache.data(),
          zero_seq_index_cache_size * sizeof(int32_t));
   ORT_THROW_IF_ERROR(zero_seq_index_cache_async_buffer.CopyToGpu(alloc_stream));
   MaskZeroSequences(cuda_stream,
-                    gsl::narrow_cast<int32_t>(hidden_size_),
+                    gsl::narrow<int32_t>(hidden_size_),
                     reinterpret_cast<CudaT*>(y_data),
                     reinterpret_cast<CudaT*>(y_h_data),
                     reinterpret_cast<CudaT*>(y_c_data),
diff --git a/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.h b/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.h
index b7a3d67b45e93..e7bccae2aad1c 100644
--- a/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.h
+++ b/onnxruntime/core/providers/cuda/rnn/cudnn_rnn_base.h
@@ -58,9 +58,9 @@ class CudnnRNN {
                                                    dataType,
                                                    dataType,
                                                    mathType,
-                                                   gsl::narrow_cast<int>(input_size),
-                                                   gsl::narrow_cast<int>(hidden_size),
-                                                   gsl::narrow_cast<int>(proj_size),  // projected size
+                                                   gsl::narrow<int>(input_size),
+                                                   gsl::narrow<int>(hidden_size),
+                                                   gsl::narrow<int>(proj_size),  // projected size
                                                    num_layers,
                                                    cudnn_dropout_desc,
                                                    // CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED works with CUDNN_RNN_PADDED_IO_ENABLED, so that it will auto fill 0 for the shorter sequences
@@ -148,12 +148,11 @@ class CudnnRnnBase : public CudaKernel {
                        const void* w_data,
                        const int lin_layer_id,
                        const T* pos,
-                       int& offset,
+                       size_t& offset,
                        bool is_matrix,
                        cudaStream_t cuda_stream) const;
 
-  void SetZeroSequences(const int64_t zero_seq_index_cache_size,
-                        const std::vector<int32_t> zero_seq_index_cache,
+  void SetZeroSequences(gsl::span<const int32_t> zero_seq_index_cache,
                         T* y_data,
                         T* y_h_data,
                         T* y_c_data,
diff --git a/onnxruntime/core/providers/cuda/rnn/lstm.cc b/onnxruntime/core/providers/cuda/rnn/lstm.cc
index 890d15cef6501..ce625371b3e5f 100644
--- a/onnxruntime/core/providers/cuda/rnn/lstm.cc
+++ b/onnxruntime/core/providers/cuda/rnn/lstm.cc
@@ -21,11 +21,25 @@ namespace cuda {
           .InputMemoryType(OrtMemTypeCPUInput, RNN_Input_Index::sequence_lens), \
       LSTM<T>);
 
+#define REGISTER_KERNEL_VERSIONED_TYPED_14(T)                                   \
+  ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX(                                      \
+      LSTM,                                                                     \
+      kOnnxDomain,                                                              \
+      14,                                                                       \
+      21,                                                                       \
+      T,                                                                        \
+      kCudaExecutionProvider,                                                   \
+      (*KernelDefBuilder::Create())                                             \
+          .TypeConstraint("T", DataTypeImpl::GetTensorType<T>())                \
+          .TypeConstraint("T1", DataTypeImpl::GetTensorType<int32_t>())         \
+          .InputMemoryType(OrtMemTypeCPUInput, RNN_Input_Index::sequence_lens), \
+      LSTM<T>);
+
 #define REGISTER_KERNEL_TYPED(T)                                                \
   ONNX_OPERATOR_TYPED_KERNEL_EX(                                                \
       LSTM,                                                                     \
       kOnnxDomain,                                                              \
-      14,                                                                       \
+      22,                                                                       \
       T,                                                                        \
       kCudaExecutionProvider,                                                   \
       (*KernelDefBuilder::Create())                                             \
@@ -38,6 +52,10 @@ REGISTER_KERNEL_VERSIONED_TYPED(float);
 REGISTER_KERNEL_VERSIONED_TYPED(double);
 REGISTER_KERNEL_VERSIONED_TYPED(MLFloat16);
 
+REGISTER_KERNEL_VERSIONED_TYPED_14(float);
+REGISTER_KERNEL_VERSIONED_TYPED_14(double);
+REGISTER_KERNEL_VERSIONED_TYPED_14(MLFloat16);
+
 REGISTER_KERNEL_TYPED(float);
 REGISTER_KERNEL_TYPED(double);
 REGISTER_KERNEL_TYPED(MLFloat16);
diff --git a/onnxruntime/core/providers/cuda/rnn/lstm.h b/onnxruntime/core/providers/cuda/rnn/lstm.h
index b82061088a2c8..fb1fdc7165725 100644
--- a/onnxruntime/core/providers/cuda/rnn/lstm.h
+++ b/onnxruntime/core/providers/cuda/rnn/lstm.h
@@ -14,6 +14,13 @@ class LSTM final : public CudnnRnnBase<T> {
   LSTM(const OpKernelInfo& info) : CudnnRnnBase<T>(info) {
     CudnnRnnBase<T>::SetRNNMode(CUDNN_LSTM);
 
+    // cuDNN LSTM does not support input_forget coupling (attribute added in opset 14)
+    int64_t input_forget = 0;
+    if (info.GetAttr("input_forget", &input_forget).IsOK()) {
+      ORT_ENFORCE(input_forget == 0,
+                  "CUDA LSTM does not support input_forget=1. Use CPU EP instead.");
+    }
+
     // ONNX W layout is W[iofc], WB[iofc], mapping to RNNLinLayerMatrixParams the linLayerID is 0, 3, 1, 2
     CudnnRnnBase<T>::W_lin_layer_id_.assign({0, 3, 1, 2});
     // ONNX R layout is R[iofc], RB[iofc], mapping to RNNLinLayerMatrixParams the linLayerID is 4, 7, 5, 6
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc
index 4a6692778da0b..5758ff3ad2847 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc
+++ b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc
@@ -664,8 +664,9 @@ Status QnnBackendManager::ReleaseDevice() {
 
 Status QnnBackendManager::InitializeProfiling() {
   profiling_level_merge_ = profiling_level_;
-  // use profiling level from ETW if ETW is enabled
-  if (profiling_level_etw_ != ProfilingLevel::INVALID) {
+  // Only use ETW level if it provides higher fidelity
+  if (profiling_level_etw_ != ProfilingLevel::INVALID &&
+      profiling_level_etw_ > profiling_level_) {
     profiling_level_merge_ = profiling_level_etw_;
   }
 
diff --git a/onnxruntime/core/util/math_cpu.cc b/onnxruntime/core/util/math_cpu.cc
index 3c7dc53b1098f..608d30c12b587 100644
--- a/onnxruntime/core/util/math_cpu.cc
+++ b/onnxruntime/core/util/math_cpu.cc
@@ -845,6 +845,7 @@ void Im2col<T, StorageOrder::NHWC>::operator()(
 template struct Im2col<int8_t, StorageOrder::NHWC>;
 template struct Im2col<uint8_t, StorageOrder::NHWC>;
 template struct Im2col<MLFloat16, StorageOrder::NHWC>;
+template struct Im2col<float, StorageOrder::NHWC>;
 
 template <>
 void Col2im<float, CPUMathUtil, StorageOrder::NCHW>(const float* data_col, int64_t channels, int64_t height,
diff --git a/onnxruntime/test/contrib_ops/fused_conv_test.cc b/onnxruntime/test/contrib_ops/fused_conv_test.cc
index 9df222db43501..7bfacb996526f 100644
--- a/onnxruntime/test/contrib_ops/fused_conv_test.cc
+++ b/onnxruntime/test/contrib_ops/fused_conv_test.cc
@@ -120,6 +120,58 @@ void RunConvOp(const ConvOpAndTestAttributes& attributes,
              disable_cpu, disable_cuda, disable_webgpu, use_float16, weight_is_initializer);
 }
 
+#ifdef USE_KLEIDIAI
+void TestNhwcFusedConvFloatOp(const ConvOpAndTestAttributes& attributes,
+                              const vector<vector<float>>& inputs,
+                              const vector<vector<int64_t>>& input_shapes,
+                              const std::initializer_list<float>& expected_output,
+                              const vector<int64_t>& expected_output_shape,
+                              bool weight_is_initializer = false) {
+  auto cpu_ep = DefaultCpuExecutionProvider();
+  if (cpu_ep == nullptr) {
+    return;
+  }
+
+  OpTester test("NhwcFusedConv", 1, onnxruntime::kMSDomain);
+  test.AddAttribute("group", attributes.group);
+  test.AddAttribute("kernel_shape", attributes.kernel_shape);
+  test.AddAttribute("activation", attributes.activation);
+
+  if (!attributes.dilations.empty()) {
+    test.AddAttribute("dilations", attributes.dilations);
+  }
+
+  if (!attributes.pads.empty()) {
+    test.AddAttribute("pads", attributes.pads);
+  } else {
+    test.AddAttribute("auto_pad", attributes.auto_pad);
+  }
+
+  if (!attributes.strides.empty()) {
+    test.AddAttribute("strides", attributes.strides);
+  }
+
+  if (!attributes.activation_parameters.empty()) {
+    test.AddAttribute("activation_params", attributes.activation_parameters);
+  }
+
+  const char* szNames[] = {"X", "W", "B", "Z"};
+  test.AddInput<float>(szNames[0], input_shapes[0], inputs[0]);
+  test.AddInput<float>(szNames[1], input_shapes[1], inputs[1], weight_is_initializer);
+  if (inputs.size() >= 3) {
+    test.AddInput<float>(szNames[2], input_shapes[2], inputs[2]);
+  }
+  if (inputs.size() >= 4) {
+    test.AddInput<float>(szNames[3], input_shapes[3], inputs[3]);
+  }
+  test.AddOutput<float>("Y", expected_output_shape, expected_output);
+
+  std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
+  execution_providers.push_back(std::move(cpu_ep));
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
+}
+#endif
+
 TEST(FusedConvTest, Conv2D_HardSigmoid) {
   ConvOpAndTestAttributes attrs = {
       "",                           // auto_pad
@@ -235,6 +287,93 @@ TEST(FusedConvTest, Cpu_Conv2D_Bias_Z_Relu) {
   RunConvOp(attrs, {X, W, B, Z}, {X_shape, W_shape, B_shape, Z_shape}, expected_vals, Y_shape, false, true, true);
 }
 
+#ifdef USE_KLEIDIAI
+TEST(FusedConvTest, Cpu_NhwcConv2D_Bias_Z_Relu) {
+  ConvOpAndTestAttributes attrs = {
+      "",                           // auto_pad
+      vector<int64_t>{1, 1},        // dilations
+      1,                            // group
+      vector<int64_t>{2, 2},        // kernel_shape
+      vector<int64_t>{0, 0, 0, 0},  // pads
+      vector<int64_t>{1, 1},        // strides
+      "Relu"                        // activation
+  };
+
+  vector<float> X = {1.0f, 2.0f, 3.0f,
+                     4.0f, 5.0f, 6.0f,
+                     7.0f, 8.0f, 9.0f};
+  vector<int64_t> X_shape = {1, 3, 3, 1};
+  vector<float> W = {1.0f, 1.0f, 1.0f, 1.0f,
+                     1.0f, 1.0f, 1.0f, 1.0f};
+  vector<int64_t> W_shape = {2, 1, 2, 2};
+  vector<int64_t> Y_shape = {1, 2, 2, 2};
+  vector<float> B = {1.0f, -1.0f};
+  vector<int64_t> B_shape = {2};
+  vector<float> Z = {-1.0f, 0.0f, 0.0f, 0.0f,
+                     0.0f, 0.0f, 0.0f, 1.0f};
+  vector<int64_t> Z_shape = {1, 2, 2, 2};
+  auto expected_vals = {12.0f, 11.0f, 17.0f, 15.0f, 25.0f, 23.0f, 29.0f, 28.0f};
+  TestNhwcFusedConvFloatOp(attrs, {X, W, B, Z}, {X_shape, W_shape, B_shape, Z_shape}, expected_vals, Y_shape);
+  TestNhwcFusedConvFloatOp(attrs, {X, W, B, Z}, {X_shape, W_shape, B_shape, Z_shape}, expected_vals, Y_shape, true);
+}
+
+TEST(FusedConvTest, Cpu_NhwcConv2D_Z_Relu_Batch2) {
+  ConvOpAndTestAttributes attrs = {
+      "",                           // auto_pad
+      vector<int64_t>{1, 1},        // dilations
+      1,                            // group
+      vector<int64_t>{1, 1},        // kernel_shape
+      vector<int64_t>{0, 0, 0, 0},  // pads
+      vector<int64_t>{1, 1},        // strides
+      "Relu"                        // activation
+  };
+
+  vector<float> X = {1.0f, 2.0f, 3.0f, 4.0f,
+                     1.0f, 1.0f, 1.0f, 1.0f};
+  vector<int64_t> X_shape = {2, 2, 2, 1};
+  vector<float> W = {1.0f};
+  vector<int64_t> W_shape = {1, 1, 1, 1};
+  vector<float> B = {0.0f};
+  vector<int64_t> B_shape = {1};
+  vector<float> Z = {0.0f, 0.0f, 0.0f, 0.0f,
+                     -2.0f, -3.0f, -4.0f, -5.0f};
+  vector<int64_t> Z_shape = {2, 2, 2, 1};
+  vector<int64_t> Y_shape = {2, 2, 2, 1};
+  auto expected_vals = {1.0f, 2.0f, 3.0f, 4.0f,
+                        0.0f, 0.0f, 0.0f, 0.0f};
+
+  TestNhwcFusedConvFloatOp(attrs, {X, W, B, Z}, {X_shape, W_shape, B_shape, Z_shape}, expected_vals, Y_shape);
+  TestNhwcFusedConvFloatOp(attrs, {X, W, B, Z}, {X_shape, W_shape, B_shape, Z_shape}, expected_vals, Y_shape, true);
+}
+
+TEST(FusedConvTest, Cpu_NhwcConv2D_AutoPadSameUpper) {
+  ConvOpAndTestAttributes attrs = {
+      "SAME_UPPER",           // auto_pad
+      vector<int64_t>{1, 1},  // dilations
+      1,                      // group
+      vector<int64_t>{3, 3},  // kernel_shape
+      {},                     // pads
+      vector<int64_t>{1, 1},  // strides
+      "Relu"                  // activation
+  };
+
+  vector<float> X(25, 1.0f);
+  vector<int64_t> X_shape = {1, 5, 5, 1};
+  vector<float> W = {0.0f, 1.0f, 2.0f,
+                     3.0f, 4.0f, 5.0f,
+                     6.0f, 7.0f, 8.0f};
+  vector<int64_t> W_shape = {1, 1, 3, 3};
+  vector<int64_t> Y_shape = {1, 5, 5, 1};
+  auto expected_vals = {24.0f, 33.0f, 33.0f, 33.0f, 20.0f,
+                        27.0f, 36.0f, 36.0f, 36.0f, 21.0f,
+                        27.0f, 36.0f, 36.0f, 36.0f, 21.0f,
+                        27.0f, 36.0f, 36.0f, 36.0f, 21.0f,
+                        12.0f, 15.0f, 15.0f, 15.0f, 8.0f};
+  TestNhwcFusedConvFloatOp(attrs, {X, W}, {X_shape, W_shape}, expected_vals, Y_shape);
+  TestNhwcFusedConvFloatOp(attrs, {X, W}, {X_shape, W_shape}, expected_vals, Y_shape, true);
+}
+#endif
+
 #endif
 
 }  // namespace test
diff --git a/onnxruntime/test/contrib_ops/quantize_attention_op_test.cc b/onnxruntime/test/contrib_ops/quantize_attention_op_test.cc
index 907cfd7fa0833..0e1b4b6bde720 100644
--- a/onnxruntime/test/contrib_ops/quantize_attention_op_test.cc
+++ b/onnxruntime/test/contrib_ops/quantize_attention_op_test.cc
@@ -3,6 +3,7 @@
 
 #include <algorithm>
 #include <cfenv>
+#include <limits>
 #include <vector>
 
 #include "gtest/gtest.h"
@@ -1162,5 +1163,163 @@ TEST(QAttentionTest, SharedPrepackedWeights) {
 }
 #endif
 
+namespace {
+
+// Build a small valid QAttention OpTester and let the caller mutate the
+// per-column weight_scale / weight_zero_point shapes to invalid sizes to
+// exercise the input-validation path that protects against OOB reads in
+// QAttention<T>::Compute.
+void RunQAttentionInvalidPerColumnShapes(const std::vector<int64_t>& weight_scale_dims,
+                                         const std::vector<float>& weight_scale_data,
+                                         const std::vector<int64_t>& weight_zp_dims,
+                                         const std::vector<uint8_t>& weight_zp_data,
+                                         const std::string& expected_error_substring) {
+  constexpr int batch_size = 1;
+  constexpr int sequence_length = 2;
+  constexpr int hidden_size = 4;
+  constexpr int number_of_heads = 2;
+
+  std::vector<int64_t> input_dims = {batch_size, sequence_length, hidden_size};
+  std::vector<int64_t> weights_dims = {hidden_size, 3 * hidden_size};
+  std::vector<int64_t> bias_dims = {3 * hidden_size};
+
+  std::vector<uint8_t> input_data(static_cast<size_t>(batch_size * sequence_length * hidden_size), 128);
+  std::vector<uint8_t> weight_data(static_cast<size_t>(hidden_size * 3 * hidden_size), 128);
+  std::vector<float> bias_data(static_cast<size_t>(3 * hidden_size), 0.0f);
+
+  OpTester tester("QAttention", 1, onnxruntime::kMSDomain);
+  tester.AddAttribute<int64_t>("num_heads", static_cast<int64_t>(number_of_heads));
+  tester.AddInput<uint8_t>("input", input_dims, input_data);
+  tester.AddInput<uint8_t>("weight", weights_dims, weight_data);
+  tester.AddInput<float>("bias", bias_dims, bias_data);
+  tester.AddInput<float>("input_scale", {1}, {0.1f});
+  tester.AddInput<float>("weight_scale", weight_scale_dims, weight_scale_data);
+  tester.AddOptionalInputEdge<int32_t>();  // mask_index
+  tester.AddInput<uint8_t>("input_zero_point", {1}, {128});
+  tester.AddInput<uint8_t>("weight_zero_point", weight_zp_dims, weight_zp_data);
+
+  // Output shape is required by OpTester even though we expect failure before
+  // the kernel writes anything meaningful.
+  std::vector<int64_t> output_dims = {batch_size, sequence_length, hidden_size};
+  std::vector<float> dummy_output(static_cast<size_t>(batch_size * sequence_length * hidden_size), 0.0f);
+  tester.AddOutput<float>("output", output_dims, dummy_output);
+
+  // CPU EP only — CUDA QAttention does not support per-column scale/zp.
+  std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
+  execution_providers.push_back(DefaultCpuExecutionProvider());
+  tester.Run(OpTester::ExpectResult::kExpectFailure, expected_error_substring,
+             {}, nullptr, &execution_providers);
+}
+
+}  // namespace
+
+// Regression tests: a malformed model that supplies a "per-column"
+// weight_scale or weight_zero_point of the wrong size used to trigger
+// an out-of-bounds read in QAttention<T>::Compute. The kernel must
+// reject such inputs with a descriptive error.
+TEST(QAttentionTest, InvalidWeightScalePerColumnShape) {
+  // hidden_size=4, expected per-column size = 3 * hidden_size = 12.
+  // Supply 2 elements (smaller than expected) to trigger the validation.
+  RunQAttentionInvalidPerColumnShapes(
+      /*weight_scale_dims=*/{2}, /*weight_scale_data=*/{0.1f, 0.1f},
+      /*weight_zp_dims=*/{1}, /*weight_zp_data=*/{128},
+      /*expected_error_substring=*/"'weight_scale' must be a scalar or a 1D tensor of size 3 * hidden_size");
+}
+
+TEST(QAttentionTest, InvalidWeightScalePerColumnRank) {
+  // 2-D weight_scale with size 12 should still be rejected (rank must be 1).
+  RunQAttentionInvalidPerColumnShapes(
+      /*weight_scale_dims=*/{3, 4},
+      /*weight_scale_data=*/std::vector<float>(12, 0.1f),
+      /*weight_zp_dims=*/{1}, /*weight_zp_data=*/{128},
+      /*expected_error_substring=*/"'weight_scale' must be a scalar or a 1D tensor of size 3 * hidden_size");
+}
+
+TEST(QAttentionTest, InvalidWeightZeroPointPerColumnShape) {
+  // hidden_size=4, expected per-column size = 12. Supply 2 elements.
+  RunQAttentionInvalidPerColumnShapes(
+      /*weight_scale_dims=*/{1}, /*weight_scale_data=*/{0.1f},
+      /*weight_zp_dims=*/{2}, /*weight_zp_data=*/{128, 128},
+      /*expected_error_substring=*/"'weight_zero_point' must be a scalar or a 1D tensor of size 3 * hidden_size");
+}
+
+TEST(QAttentionTest, InvalidWeightZeroPointPerColumnRank) {
+  // 2-D weight_zero_point with size 12 should still be rejected (rank must be 1).
+  RunQAttentionInvalidPerColumnShapes(
+      /*weight_scale_dims=*/{1}, /*weight_scale_data=*/{0.1f},
+      /*weight_zp_dims=*/{3, 4},
+      /*weight_zp_data=*/std::vector<uint8_t>(12, 128),
+      /*expected_error_substring=*/"'weight_zero_point' must be a scalar or a 1D tensor of size 3 * hidden_size");
+}
+
+namespace {
+
+// Builds a QAttention OpTester with hidden_size = hidden_size_x3 / 3 and the given num_heads
+// attribute, and asserts that Run() fails with an error matching expected_error_substring. The
+// inputs are otherwise valid (per-tensor scales / zero points, no past, no mask).
+void RunQAttentionExpectFailure(int64_t hidden_size_x3,
+                                int64_t num_heads_attr,
+                                const std::string& expected_error_substring) {
+  constexpr int batch_size = 1;
+  constexpr int sequence_length = 2;
+  // input_hidden_size is the model's input embedding dimension; weights project it to 3 * hidden_size.
+  // It is independent of hidden_size, so use a distinct value here to make that clear.
+  constexpr int64_t input_hidden_size = 8;
+  const int64_t hidden_size = hidden_size_x3 / 3;
+
+  std::vector<int64_t> input_dims = {batch_size, sequence_length, input_hidden_size};
+  std::vector<int64_t> weights_dims = {input_hidden_size, hidden_size_x3};
+  std::vector<int64_t> bias_dims = {hidden_size_x3};
+
+  std::vector<uint8_t> input_data(static_cast<size_t>(batch_size * sequence_length * input_hidden_size), 128);
+  std::vector<uint8_t> weight_data(static_cast<size_t>(input_hidden_size * hidden_size_x3), 128);
+  std::vector<float> bias_data(static_cast<size_t>(hidden_size_x3), 0.0f);
+
+  OpTester tester("QAttention", 1, onnxruntime::kMSDomain);
+  tester.AddAttribute<int64_t>("num_heads", num_heads_attr);
+  tester.AddInput<uint8_t>("input", input_dims, input_data);
+  tester.AddInput<uint8_t>("weight", weights_dims, weight_data);
+  tester.AddInput<float>("bias", bias_dims, bias_data);
+  tester.AddInput<float>("input_scale", {1}, {0.1f});
+  tester.AddInput<float>("weight_scale", {1}, {0.1f});
+  tester.AddOptionalInputEdge<int32_t>();  // mask_index
+  tester.AddInput<uint8_t>("input_zero_point", {1}, {128});
+  tester.AddInput<uint8_t>("weight_zero_point", {1}, {128});
+
+  std::vector<int64_t> output_dims = {batch_size, sequence_length, hidden_size};
+  std::vector<float> dummy_output(static_cast<size_t>(batch_size * sequence_length * hidden_size), 0.0f);
+  tester.AddOutput<float>("output", output_dims, dummy_output);
+
+  std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
+  execution_providers.push_back(DefaultCpuExecutionProvider());
+  tester.Run(OpTester::ExpectResult::kExpectFailure, expected_error_substring,
+             {}, nullptr, &execution_providers);
+}
+
+}  // namespace
+
+// Regression test: QAttention requires bias_dims[0] (== weights_dims[1]) to be a multiple of 3
+// since Q, K, V are packed along that dimension with equal hidden sizes.
+TEST(QAttentionTest, InvalidBiasDimNotMultipleOfThree) {
+  RunQAttentionExpectFailure(/*hidden_size_x3=*/13, /*num_heads_attr=*/1, "must be a multiple of 3");
+}
+
+// Regression test: hidden_size must be divisible by num_heads. Otherwise head_size silently
+// truncates and the per-head GEMM loop leaves part of the hidden dimension uncovered.
+TEST(QAttentionTest, InvalidHiddenSizeNotDivisibleByNumHeads) {
+  // hidden_size_x3 = 12 -> hidden_size = 4; 4 % 3 != 0.
+  RunQAttentionExpectFailure(/*hidden_size_x3=*/12, /*num_heads_attr=*/3,
+                             "hidden_size should be divisible by num_heads");
+}
+
+// Regression test: num_heads attribute exceeding INT_MAX must be rejected by the narrow<int>
+// conversion in AttentionBase's constructor rather than silently truncating. gsl::narrowing_error
+// is thrown during session init; its what() returns the literal "narrowing_error".
+TEST(QAttentionTest, InvalidNumHeadsOverflowsInt) {
+  RunQAttentionExpectFailure(/*hidden_size_x3=*/12,
+                             /*num_heads_attr=*/static_cast<int64_t>(std::numeric_limits<int>::max()) + 1,
+                             "narrowing_error");
+}
+
 }  // namespace test
 }  // namespace onnxruntime
diff --git a/onnxruntime/test/framework/execution_frame_test.cc b/onnxruntime/test/framework/execution_frame_test.cc
index 4808d2bfb447a..bbfe22a2d4bc7 100644
--- a/onnxruntime/test/framework/execution_frame_test.cc
+++ b/onnxruntime/test/framework/execution_frame_test.cc
@@ -559,6 +559,141 @@ TEST(ExecutionFrameTestInit, InitializerAsOutput) {
   }
 }
 
+// Test that when a caller provides pre-allocated output OrtValues whose shapes don't match
+// the computed output shapes, ORT returns a clear INVALID_ARGUMENT error with an actionable
+// message. This is the scenario described in GitHub issue #28359.
+//
+// The caller's copy of the old output remains valid (OrtValue uses shared_ptr internally).
+// Pre-run validation (ValidateInputsOutputs) catches structural mismatches (wrong type, rank,
+// fixed dims). What remains at kernel execution time is purely dynamic dimension differences.
+TEST(ExecutionFrameTestInit, FetchWithMismatchedDynamicShapes) {
+  // Regression test for https://github.com/microsoft/onnxruntime/issues/28359
+  // Verifies that Run() returns a clear INVALID_ARGUMENT error when the caller provides
+  // a pre-allocated output OrtValue whose shape doesn't match the kernel's computed shape,
+  // and that pre-allocated outputs with matching shapes are used correctly.
+  SessionOptions so;
+  so.enable_mem_pattern = true;
+
+  InferenceSession session(so, GetEnvironment());
+
+  // Use Relu which preserves shape: output shape == input shape
+  onnxruntime::Model model("dynamic_shape_test", false, ModelMetaData(), PathString(),
+                           IOnnxRuntimeOpSchemaRegistryList(),
+                           {{kOnnxDomain, 12}}, {}, DefaultLoggingManager().DefaultLogger());
+  auto& graph = model.MainGraph();
+
+  TypeProto float_tensor;
+  float_tensor.mutable_tensor_type()->set_elem_type(TensorProto_DataType_FLOAT);
+  // dynamic shape: {N, M}
+  auto* input_shape = float_tensor.mutable_tensor_type()->mutable_shape();
+  input_shape->add_dim()->set_dim_param("N");
+  input_shape->add_dim()->set_dim_param("M");
+
+  auto& input_arg = graph.GetOrCreateNodeArg("X", &float_tensor);
+  auto& output_arg = graph.GetOrCreateNodeArg("Y", &float_tensor);
+  graph.AddNode("relu", "Relu", "relu", {&input_arg}, {&output_arg});
+  graph.SetInputs({&input_arg});
+  graph.SetOutputs({&output_arg});
+  ASSERT_STATUS_OK(graph.Resolve());
+
+  std::string serialized;
+  ASSERT_TRUE(model.ToProto().SerializeToString(&serialized));
+  std::istringstream model_stream(serialized);
+  ASSERT_STATUS_OK(session.Load(model_stream));
+  ASSERT_STATUS_OK(session.Initialize());
+
+  RunOptions ro;
+  auto allocator = test::AllocatorManager::Instance().GetAllocator(CPU);
+
+  // Run 1: pre-allocate the output buffer with the correct shape {2, 3}.
+  // The kernel should write into this buffer directly.
+  std::vector<float> input_data_1(6, 1.0f);
+  OrtValue input_1;
+  Tensor::InitOrtValue(DataTypeImpl::GetType<float>(), TensorShape({2, 3}), input_data_1.data(),
+                       allocator->Info(), input_1);
+
+  OrtValue preallocated_output;
+  Tensor::InitOrtValue(DataTypeImpl::GetType<float>(), TensorShape({2, 3}), allocator, preallocated_output);
+  const void* preallocated_buffer = preallocated_output.Get<Tensor>().DataRaw();
+
+  std::vector<OrtValue> results = {preallocated_output};
+  ASSERT_STATUS_OK(session.Run(ro,
+                               AsSpan({std::string("X")}), AsSpan({input_1}),
+                               AsSpan({std::string("Y")}), &results, nullptr));
+
+  ASSERT_EQ(results.size(), 1u);
+  ASSERT_TRUE(results[0].IsTensor());
+  EXPECT_EQ(results[0].Get<Tensor>().Shape(), TensorShape({2, 3}));
+
+  // The output should be in the pre-allocated buffer since shapes matched
+  EXPECT_EQ(results[0].Get<Tensor>().DataRaw(), preallocated_buffer);
+
+  // Verify Run 1 output correctness (Relu of all 1.0f = all 1.0f)
+  auto run1_data = results[0].Get<Tensor>().DataAsSpan<float>();
+  for (float v : run1_data) {
+    EXPECT_EQ(v, 1.0f);
+  }
+
+  // Run 2: different input shape {4, 5}, but results still contains the {2,3} output
+  // from Run 1. Run() should fail with INVALID_ARGUMENT because the pre-allocated
+  // output shape doesn't match the computed output shape.
+  std::vector<float> input_data_2(20, 2.0f);
+  OrtValue input_2;
+  Tensor::InitOrtValue(DataTypeImpl::GetType<float>(), TensorShape({4, 5}), input_data_2.data(),
+                       allocator->Info(), input_2);
+
+  auto status = session.Run(ro,
+                            AsSpan({std::string("X")}), AsSpan({input_2}),
+                            AsSpan({std::string("Y")}), &results, nullptr);
+
+  ASSERT_FALSE(status.IsOK());
+  ASSERT_EQ(status.Code(), common::StatusCode::INVALID_ARGUMENT);
+  EXPECT_THAT(status.ErrorMessage(), ::testing::HasSubstr("pre-allocated"));
+  EXPECT_THAT(status.ErrorMessage(), ::testing::HasSubstr("{2,3}"));
+  EXPECT_THAT(status.ErrorMessage(), ::testing::HasSubstr("{4,5}"));
+
+  // Run 3: clear the output and retry — should succeed
+  results = {};
+  ASSERT_STATUS_OK(session.Run(ro,
+                               AsSpan({std::string("X")}), AsSpan({input_2}),
+                               AsSpan({std::string("Y")}), &results, nullptr));
+
+  ASSERT_EQ(results.size(), 1u);
+  ASSERT_TRUE(results[0].IsTensor());
+  EXPECT_EQ(results[0].Get<Tensor>().Shape(), TensorShape({4, 5}));
+
+  // Verify Run 3 output correctness (Relu of all 2.0f = all 2.0f)
+  auto run3_data = results[0].Get<Tensor>().DataAsSpan<float>();
+  for (float v : run3_data) {
+    EXPECT_EQ(v, 2.0f);
+  }
+
+  // Run 4: same shape {4, 5} with results carrying over — shapes match, should reuse
+  const void* run3_buffer = results[0].Get<Tensor>().DataRaw();
+
+  std::vector<float> input_data_4(20, 3.0f);
+  OrtValue input_4;
+  Tensor::InitOrtValue(DataTypeImpl::GetType<float>(), TensorShape({4, 5}), input_data_4.data(),
+                       allocator->Info(), input_4);
+
+  ASSERT_STATUS_OK(session.Run(ro,
+                               AsSpan({std::string("X")}), AsSpan({input_4}),
+                               AsSpan({std::string("Y")}), &results, nullptr));
+
+  ASSERT_EQ(results.size(), 1u);
+  ASSERT_TRUE(results[0].IsTensor());
+  EXPECT_EQ(results[0].Get<Tensor>().Shape(), TensorShape({4, 5}));
+
+  // Same shape — buffer should be reused
+  EXPECT_EQ(results[0].Get<Tensor>().DataRaw(), run3_buffer);
+
+  // Verify Run 4 output correctness (Relu of all 3.0f = all 3.0f)
+  auto run4_data = results[0].Get<Tensor>().DataAsSpan<float>();
+  for (float v : run4_data) {
+    EXPECT_EQ(v, 3.0f);
+  }
+}
+
 #if !defined(DISABLE_SPARSE_TENSORS)
 TEST(ExecutionFrameTestInit, SparseInitializerAsOutput) {
   constexpr std::array<int64_t, 2> dense_shape{3, 3};
diff --git a/onnxruntime/test/framework/kernel_type_str_resolver_utils_test.cc b/onnxruntime/test/framework/kernel_type_str_resolver_utils_test.cc
index 86ffef6c49dc9..32720e42988f6 100644
--- a/onnxruntime/test/framework/kernel_type_str_resolver_utils_test.cc
+++ b/onnxruntime/test/framework/kernel_type_str_resolver_utils_test.cc
@@ -9,18 +9,29 @@
 #include "gtest/gtest.h"
 
 #include "core/flatbuffers/schema/ort.fbs.h"
+#include "core/graph/constants.h"
+#include "core/graph/model.h"
 #include "core/graph/schema_registry.h"
+#include "core/optimizer/layout_transformation/layout_transformation_potentially_added_ops.h"
+#include "core/session/onnxruntime_session_options_config_keys.h"
+#include "test/test_environment.h"
 #include "test/util/include/asserts.h"
+#include "test/util/include/inference_session_wrapper.h"
+
+#include <filesystem>
+#include <fstream>
 
 namespace onnxruntime::test {
 
 static Status LoadLayoutTransformationRequiredOpsFromOpSchemas(KernelTypeStrResolver& kernel_type_str_resolver) {
-  const auto required_op_ids = kernel_type_str_resolver_utils::GetLayoutTransformationRequiredOpIdentifiers();
   const auto schema_registry = SchemaRegistryManager{};
-  for (const auto& op_id : required_op_ids) {
+  for (const auto& op_id : kLayoutTransformationPotentiallyAddedOps) {
     const auto* op_schema = schema_registry.GetSchema(std::string{op_id.op_type}, op_id.since_version,
                                                       std::string{op_id.domain});
-    ORT_RETURN_IF(op_schema == nullptr, "Failed to get op schema.");
+    ORT_RETURN_IF(op_schema == nullptr,
+                  "Failed to get op schema for domain='", op_id.domain,
+                  "', op_type='", op_id.op_type,
+                  "', since_version=", op_id.since_version, ".");
     ORT_RETURN_IF_ERROR(kernel_type_str_resolver.RegisterOpSchema(*op_schema));
   }
   return Status::OK();
@@ -49,6 +60,34 @@ TEST(KernelTypeStrResolverUtilsTest, VerifyLayoutTransformationRequiredOpsResolv
 #endif  // !defined(DISABLE_CONTRIB_OPS)
 }
 
+#if !defined(ORT_MINIMAL_BUILD) && !defined(DISABLE_CONTRIB_OPS)
+TEST(KernelTypeStrResolverUtilsTest, ResolveNhwcFusedConvFromLayoutTransformationRequiredOps) {
+  KernelTypeStrResolver resolver;
+  ASSERT_STATUS_OK(kernel_type_str_resolver_utils::AddLayoutTransformationRequiredOpsToKernelTypeStrResolver(resolver));
+
+  Model model("nhwc_fused_conv_layout_transform_resolver_test", false, DefaultLoggingManager().DefaultLogger());
+  auto& graph = model.MainGraph();
+
+  ONNX_NAMESPACE::TypeProto float_tensor;
+  auto* tensor_type = float_tensor.mutable_tensor_type();
+  tensor_type->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT);
+  tensor_type->mutable_shape()->add_dim()->set_dim_value(1);
+
+  auto& x = graph.GetOrCreateNodeArg("x", &float_tensor);
+  auto& w = graph.GetOrCreateNodeArg("w", &float_tensor);
+  auto& y = graph.GetOrCreateNodeArg("y", &float_tensor);
+
+  auto& nhwc_fused_conv = graph.AddNode(
+      "nhwc_fused_conv", "NhwcFusedConv", "test node", {&x, &w}, {&y}, nullptr, kMSDomain);
+  nhwc_fused_conv.SetSinceVersion(1);
+
+  gsl::span<const ArgTypeAndIndex> resolved_args;
+  ASSERT_STATUS_OK(resolver.ResolveKernelTypeStr(nhwc_fused_conv, "T", resolved_args));
+  ASSERT_FALSE(resolved_args.empty());
+}
+
+#endif  // !defined(ORT_MINIMAL_BUILD) && !defined(DISABLE_CONTRIB_OPS)
+
 // run this test manually to output a hard-coded byte array.
 // update AddLayoutTransformationRequiredOpsToKernelTypeStrResolver in
 // onnxruntime/core/framework/kernel_type_str_resolver_utils.cc
diff --git a/onnxruntime/test/framework/ort_model_only_test.cc b/onnxruntime/test/framework/ort_model_only_test.cc
index ec4f8967fd2a3..d711e4c47498d 100644
--- a/onnxruntime/test/framework/ort_model_only_test.cc
+++ b/onnxruntime/test/framework/ort_model_only_test.cc
@@ -17,6 +17,7 @@
 #include "test/util/include/asserts.h"
 #include "test/util/include/inference_session_wrapper.h"
 
+#include <filesystem>
 #include "flatbuffers/idl.h"
 #include "flatbuffers/util.h"
 
@@ -27,6 +28,7 @@ using namespace ONNX_NAMESPACE;
 
 namespace onnxruntime {
 namespace test {
+
 struct OrtModelTestInfo {
   std::basic_string<ORTCHAR_T> model_filename;
   std::string logid;
@@ -65,17 +67,20 @@ static void RunOrtModel(const OrtModelTestInfo& test_info) {
 
   std::vector<char> model_data;
   InferenceSessionWrapper session_object{so, GetEnvironment()};
+  std::filesystem::path model_path{test_info.model_filename};
+
+  const auto& model_path_str = model_path.native();
   if (test_info.run_use_buffer) {
     // Load the file into a buffer and use the buffer to create inference session
     size_t num_bytes = 0;
-    ASSERT_STATUS_OK(Env::Default().GetFileLength(test_info.model_filename.c_str(), num_bytes));
+    ASSERT_STATUS_OK(Env::Default().GetFileLength(model_path_str.c_str(), num_bytes));
     model_data.resize(num_bytes);
-    std::ifstream bytes_stream(test_info.model_filename, std::ifstream::in | std::ifstream::binary);
+    std::ifstream bytes_stream(model_path, std::ifstream::in | std::ifstream::binary);
     bytes_stream.read(model_data.data(), num_bytes);
     bytes_stream.close();
     ASSERT_STATUS_OK(session_object.Load(model_data.data(), static_cast<int>(num_bytes)));
   } else {
-    ASSERT_STATUS_OK(session_object.Load(test_info.model_filename));  // infer type from filename
+    ASSERT_STATUS_OK(session_object.Load(model_path_str));  // infer type from filename
   }
 
   ASSERT_STATUS_OK(session_object.Initialize());
@@ -151,7 +156,7 @@ static void CompareGraphAndSessionState(const InferenceSessionWrapper& session_o
 
   for (const auto& pair : i1) {
     auto iter = i2.find(pair.first);
-    ASSERT_NE(iter, i2.cend());
+    ASSERT_NE(iter, i2.cend()) << "Missing initializer " << pair.first;
 
     const OrtValue& left = pair.second;
     const OrtValue& right = iter->second;
@@ -218,10 +223,17 @@ static void CompareSessionMetadata(const InferenceSessionWrapper& session_object
 
 static void SaveAndCompareModels(const PathString& orig_file,
                                  const PathString& ort_file,
-                                 TransformerLevel optimization_level = TransformerLevel::Level3) {
+                                 TransformerLevel optimization_level = TransformerLevel::Level3,
+                                 bool compare_saved_model = true) {
+  std::filesystem::path orig_path{orig_file};
+  std::filesystem::path ort_path{ort_file};
+  if (ort_path.has_parent_path()) {
+    std::filesystem::create_directories(ort_path.parent_path());
+  }
+
   SessionOptions so;
   so.session_logid = "SerializeToOrtFormat";
-  so.optimized_model_filepath = ort_file;
+  so.optimized_model_filepath = ort_path.native();
   so.graph_optimization_level = optimization_level;
 
   // not strictly necessary - type should be inferred from the filename
@@ -229,7 +241,7 @@ static void SaveAndCompareModels(const PathString& orig_file,
   InferenceSessionWrapper session_object{so, GetEnvironment()};
 
   // create .ort file during Initialize due to values in SessionOptions
-  ASSERT_STATUS_OK(session_object.Load(orig_file));
+  ASSERT_STATUS_OK(session_object.Load(orig_path.native()));
   ASSERT_STATUS_OK(session_object.Initialize());
 
   SessionOptions so2;
@@ -240,9 +252,13 @@ static void SaveAndCompareModels(const PathString& orig_file,
 
   // load serialized version
   InferenceSessionWrapper session_object2{so2, GetEnvironment()};
-  ASSERT_STATUS_OK(session_object2.Load(ort_file));
+  ASSERT_STATUS_OK(session_object2.Load(ort_path.native()));
   ASSERT_STATUS_OK(session_object2.Initialize());
 
+  if (!compare_saved_model) {
+    return;
+  }
+
   CompareSessionMetadata(session_object, session_object2);
   CompareGraphAndSessionState(session_object, session_object2);
 }
@@ -368,7 +384,9 @@ void TestOrtModelUpdate(const PathString& onnx_file,
   // ort_file_v4 is ORT format model using v4 where we used kernel hashes instead of constraints
 
   // update v4 model and save as v5. do not run optimizations in order to preserve the model as-is.
-  SaveAndCompareModels(ort_file_v4, generated_ort_file_v5, TransformerLevel::Default);
+  // Loading a v4 ORT model updates it as part of deserialization, so the in-memory graph/session state is not
+  // expected to match a separately reloaded v5 model exactly. Just validate that we can save and reload it.
+  SaveAndCompareModels(ort_file_v4, generated_ort_file_v5, TransformerLevel::Default, false);
 
   // run the original, v4 and v5 models and check the output is the same
   OrtModelTestInfo test_info;
diff --git a/onnxruntime/test/framework/tensorutils_test.cc b/onnxruntime/test/framework/tensorutils_test.cc
index 880208960a63c..71ac5b49e9718 100644
--- a/onnxruntime/test/framework/tensorutils_test.cc
+++ b/onnxruntime/test/framework/tensorutils_test.cc
@@ -21,6 +21,7 @@
 
 #ifdef _WIN32
 #include <Windows.h>
+#include "core/platform/windows/env.h"
 #endif
 
 using namespace ::onnxruntime::utils;
@@ -715,6 +716,125 @@ TEST_F(PathValidationTest, ValidateExternalDataPathEmptyModelPathWithSymlinkOuts
   EXPECT_THAT(status.ErrorMessage(), testing::HasSubstr("escapes working directory"));
 }
 
+#if defined(_WIN32)
+// Direct tests for the Windows AppContainer fallback used by
+// WindowsEnv::GetWeaklyCanonicalPath. The AppContainer trigger itself can't be
+// reproduced in a unit test environment; see microsoft/onnxruntime#28508.
+TEST_F(PathValidationTest, WeaklyCanonicalPathNtVolumeFallback_ExistingDirectory) {
+  std::filesystem::path canonical;
+  ASSERT_TRUE(onnxruntime::internal::WeaklyCanonicalPathNtVolumeFallbackForTesting(base_dir_, canonical));
+
+  EXPECT_THAT(canonical.wstring(), testing::StartsWith(L"\\\\?\\GLOBALROOT\\Device\\"));
+
+  std::error_code ec;
+  EXPECT_TRUE(std::filesystem::exists(canonical, ec)) << "ec=" << ec.message();
+}
+
+TEST_F(PathValidationTest, WeaklyCanonicalPathNtVolumeFallback_ExistingFile) {
+  CreateEmptyFile(base_dir_ / "data.bin");
+
+  std::filesystem::path canonical;
+  ASSERT_TRUE(
+      onnxruntime::internal::WeaklyCanonicalPathNtVolumeFallbackForTesting(base_dir_ / "data.bin", canonical));
+
+  EXPECT_THAT(canonical.wstring(), testing::StartsWith(L"\\\\?\\GLOBALROOT\\Device\\"));
+
+  std::error_code ec;
+  EXPECT_TRUE(std::filesystem::exists(canonical, ec)) << "ec=" << ec.message();
+  EXPECT_TRUE(std::filesystem::is_regular_file(canonical, ec)) << "ec=" << ec.message();
+}
+
+TEST_F(PathValidationTest, WeaklyCanonicalPathNtVolumeFallback_NonExistentLeafLexicallyAppended) {
+  const std::filesystem::path leaf{L"does_not_exist.bin"};
+  std::filesystem::path canonical;
+  ASSERT_TRUE(onnxruntime::internal::WeaklyCanonicalPathNtVolumeFallbackForTesting(base_dir_ / leaf, canonical));
+
+  EXPECT_THAT(canonical.wstring(), testing::StartsWith(L"\\\\?\\GLOBALROOT\\Device\\"));
+  EXPECT_EQ(canonical.filename(), leaf);
+
+  // The canonicalized parent must be a path-component prefix of the result so that the
+  // containment check in ValidateExternalDataPath still works.
+  std::filesystem::path parent_canonical;
+  ASSERT_TRUE(onnxruntime::internal::WeaklyCanonicalPathNtVolumeFallbackForTesting(base_dir_, parent_canonical));
+  auto [parent_end, full_it] = std::mismatch(parent_canonical.begin(), parent_canonical.end(),
+                                             canonical.begin(), canonical.end());
+  EXPECT_EQ(parent_end, parent_canonical.end())
+      << "parent: " << parent_canonical << " full: " << canonical;
+}
+
+TEST_F(PathValidationTest, WeaklyCanonicalPathNtVolumeFallback_NonExistentMiddleAndLeaf) {
+  std::filesystem::path canonical;
+  ASSERT_TRUE(onnxruntime::internal::WeaklyCanonicalPathNtVolumeFallbackForTesting(
+      base_dir_ / L"missing_dir" / L"data.bin", canonical));
+
+  EXPECT_THAT(canonical.wstring(), testing::StartsWith(L"\\\\?\\GLOBALROOT\\Device\\"));
+  EXPECT_EQ(canonical.filename(), std::filesystem::path{L"data.bin"});
+  EXPECT_EQ(canonical.parent_path().filename(), std::filesystem::path{L"missing_dir"});
+}
+
+TEST_F(PathValidationTest, WeaklyCanonicalPathNtVolumeFallback_AllNonExistentReturnsFalse) {
+  // Synthetic absolute path on a non-existent volume. The fallback must return false so
+  // the caller surfaces the original weakly_canonical error rather than substituting an
+  // unverified path.
+  const std::filesystem::path bogus{L"\\\\?\\Volume{00000000-0000-0000-0000-000000000000}\\nope\\data.bin"};
+  std::filesystem::path canonical;
+  EXPECT_FALSE(onnxruntime::internal::WeaklyCanonicalPathNtVolumeFallbackForTesting(bogus, canonical));
+}
+
+TEST_F(PathValidationTest, WeaklyCanonicalPathNtVolumeFallback_MatchesWeaklyCanonicalAtFile) {
+  // Compare via std::filesystem::equivalent: the fallback returns the NT form
+  // (\\?\GLOBALROOT\Device\HarddiskVolumeN\...) while weakly_canonical returns the DOS
+  // form (C:\...), but both must point at the same file.
+  CreateEmptyFile(base_dir_ / "compare.bin");
+  const auto target = base_dir_ / "compare.bin";
+
+  std::error_code ec;
+  const auto reference = std::filesystem::weakly_canonical(target, ec);
+  ASSERT_FALSE(ec) << ec.message();
+
+  std::filesystem::path fallback;
+  ASSERT_TRUE(onnxruntime::internal::WeaklyCanonicalPathNtVolumeFallbackForTesting(target, fallback));
+
+  EXPECT_TRUE(std::filesystem::equivalent(reference, fallback, ec))
+      << "reference=" << reference << " fallback=" << fallback << " ec=" << ec.message();
+}
+
+TEST_F(PathValidationTest, WeaklyCanonicalPathNtVolumeFallback_ResolvesSymlinks) {
+  const auto target = base_dir_ / "symlink_target.bin";
+  const auto link = base_dir_ / "symlink_link.bin";
+  try {
+    std::ofstream{target};
+    std::filesystem::create_symlink(target, link);
+  } catch (const std::exception& e) {
+    GTEST_SKIP() << "Symlink creation not supported in this environment: " << e.what();
+  }
+
+  std::filesystem::path link_canonical;
+  std::filesystem::path target_canonical;
+  ASSERT_TRUE(onnxruntime::internal::WeaklyCanonicalPathNtVolumeFallbackForTesting(link, link_canonical));
+  ASSERT_TRUE(onnxruntime::internal::WeaklyCanonicalPathNtVolumeFallbackForTesting(target, target_canonical));
+
+  std::error_code ec;
+  EXPECT_TRUE(std::filesystem::equivalent(link_canonical, target_canonical, ec))
+      << "link=" << link_canonical << " target=" << target_canonical << " ec=" << ec.message();
+}
+
+TEST_F(PathValidationTest, WeaklyCanonicalPathNtVolumeFallback_ResolvesDotDot) {
+  CreateDirectories(base_dir_ / "sub_for_dotdot");
+
+  std::filesystem::path canonical;
+  ASSERT_TRUE(onnxruntime::internal::WeaklyCanonicalPathNtVolumeFallbackForTesting(
+      base_dir_ / "sub_for_dotdot" / "..", canonical));
+
+  std::filesystem::path base_canonical;
+  ASSERT_TRUE(onnxruntime::internal::WeaklyCanonicalPathNtVolumeFallbackForTesting(base_dir_, base_canonical));
+
+  std::error_code ec;
+  EXPECT_TRUE(std::filesystem::equivalent(canonical, base_canonical, ec))
+      << "canonical=" << canonical << " base=" << base_canonical << " ec=" << ec.message();
+}
+#endif  // defined(_WIN32)
+
 TEST(TensorProtoUtilsTest, GetNodeProtoLayeringAnnotation) {
   // Case 1: Annotation exists
   {
diff --git a/onnxruntime/test/internal_testing_ep/internal_testing_tests.cc b/onnxruntime/test/internal_testing_ep/internal_testing_tests.cc
index 74a812062875a..5850afd2f84e8 100644
--- a/onnxruntime/test/internal_testing_ep/internal_testing_tests.cc
+++ b/onnxruntime/test/internal_testing_ep/internal_testing_tests.cc
@@ -22,6 +22,7 @@
 
 #include "gtest/gtest.h"
 #include "gmock/gmock.h"
+#include <filesystem>
 
 using namespace ONNX_NAMESPACE;
 using namespace onnxruntime::logging;
@@ -36,12 +37,36 @@ using namespace onnxruntime::internal_testing_ep;
 
 #define ORT_MODEL_FOLDER ORT_TSTR("testdata/")
 
+namespace {
+std::filesystem::path ResolveInternalTestPath(const std::filesystem::path& path) {
+  if (path.is_absolute() || path.empty()) {
+    return path;
+  }
+
+  std::filesystem::path candidate = std::filesystem::current_path() / path;
+  std::error_code ec;
+  if (std::filesystem::exists(candidate, ec)) {
+    return candidate;
+  }
+
+  static const std::filesystem::path kSourceTestRoot =
+      std::filesystem::path{ORT_TSTR_ON_MACRO(__FILE__)}.parent_path().parent_path();
+  return kSourceTestRoot / path;
+}
+
+std::basic_string<ORTCHAR_T> ResolveInternalTestPathString(const ORTCHAR_T* path) {
+  return ResolveInternalTestPath(std::filesystem::path{path}).native();
+}
+}  // namespace
+
 static Status CreateSession(const SessionOptions& so, std::unique_ptr<InferenceSessionWrapper>& session,
                             const ORTCHAR_T* model_path = ORT_MODEL_FOLDER "mnist.onnx",  // arbitrary test model
                             bool enable_custom_ep = true,
                             const std::unordered_set<std::string>* override_supported_ops = nullptr) {
   session = std::make_unique<InferenceSessionWrapper>(so, GetEnvironment());
 
+  std::filesystem::path resolved_model_path = ResolveInternalTestPath(std::filesystem::path{model_path});
+
   // set supported ops to ops that are ideally found consecutively in the model.
   // we can say the EP potentially handles them all, but can also test removing handling of one or more ops
   // at runtime to simulate a lower spec device where not all ops can be handled. this allows us to test
@@ -55,7 +80,7 @@ static Status CreateSession(const SessionOptions& so, std::unique_ptr<InferenceS
         std::make_unique<InternalTestingExecutionProvider>(*supported_ops)));
   }
 
-  ORT_RETURN_IF_ERROR(session->Load(model_path));
+  ORT_RETURN_IF_ERROR(session->Load(resolved_model_path.c_str()));
   ORT_RETURN_IF_ERROR(session->Initialize());
   return Status::OK();
 }
@@ -98,7 +123,9 @@ static void ExecuteMnist(InferenceSessionWrapper& session, bool custom_ep_enable
 
 #if !defined(ORT_MINIMAL_BUILD)
 TEST(InternalTestingEP, TestSaveAndLoadOrtModel) {
-  const ORTCHAR_T* ort_model_path = ORT_MODEL_FOLDER "mnist.internal_testing_ep.test_output.ort";
+  const auto ort_model_dir = ResolveInternalTestPath(std::filesystem::path{ORT_MODEL_FOLDER});
+  const std::basic_string<ORTCHAR_T> ort_model_path =
+      (ort_model_dir / ORT_TSTR("mnist.internal_testing_ep.test_output.ort")).native();
 
   //
   // First load the onnx format model and save as an ORT model.
@@ -121,10 +148,10 @@ TEST(InternalTestingEP, TestSaveAndLoadOrtModel) {
   so.optimized_model_filepath.clear();
   bool enable_custom_ep = false;
 
-  ASSERT_STATUS_OK(CreateSession(so, session2, ort_model_path, enable_custom_ep));
+  ASSERT_STATUS_OK(CreateSession(so, session2, ort_model_path.c_str(), enable_custom_ep));
   const auto& graph1 = session2->GetGraph();
-  // model should have all the original nodes and we should be able to execute with the fallback to CPU EP
-  ASSERT_EQ(graph1.NumberOfNodes(), num_nodes);
+  // ensure we can execute with the fallback to CPU EP even if additional nodes are introduced during loading
+  ASSERT_GE(graph1.NumberOfNodes(), num_nodes);
   ExecuteMnist(*session2, enable_custom_ep);
   session2 = nullptr;
 
@@ -133,7 +160,7 @@ TEST(InternalTestingEP, TestSaveAndLoadOrtModel) {
   // for the ORT format model.
   //
   enable_custom_ep = true;
-  ASSERT_STATUS_OK(CreateSession(so, session2, ort_model_path, enable_custom_ep));
+  ASSERT_STATUS_OK(CreateSession(so, session2, ort_model_path.c_str(), enable_custom_ep));
   const auto& graph2 = session2->GetGraph();
   // model should be able to be loaded, and we should compile using custom ep. that will result in one node for the
   // custom EP (with Conv/Add/Relu/MaxPool), one for a reshape, and one for the fused MatMul+Add.
@@ -142,7 +169,7 @@ TEST(InternalTestingEP, TestSaveAndLoadOrtModel) {
 }
 
 TEST(InternalTestingEP, PreventSaveOfModelWithCompiledOps) {
-  const ORTCHAR_T* ort_model_path = ORT_MODEL_FOLDER "mnist.internal_testing_ep.ort";
+  const auto ort_model_path = ResolveInternalTestPathString(ORT_MODEL_FOLDER "mnist.internal_testing_ep.ort");
 
   // make sure we can't save a model with compiled ops. input/output model format doesn't matter
   SessionOptions so;
@@ -154,7 +181,7 @@ TEST(InternalTestingEP, PreventSaveOfModelWithCompiledOps) {
   ASSERT_STATUS_OK(session->RegisterExecutionProvider(
       std::make_unique<InternalTestingExecutionProvider>(supported_ops)));
 
-  ASSERT_STATUS_OK(session->Load(ort_model_path));
+  ASSERT_STATUS_OK(session->Load(ort_model_path.c_str()));
   ASSERT_STATUS_NOT_OK_AND_HAS_SUBSTR(session->Initialize(),
                                       "Unable to serialize model as it contains compiled nodes");
 }
@@ -163,7 +190,7 @@ TEST(InternalTestingEP, PreventSaveOfModelWithCompiledOps) {
 // version of the ONNX operator when matching a static kernel, those are required.
 #if !defined(DISABLE_CONTRIB_OPS)
 TEST(InternalTestingEP, TestMixOfStaticAndCompiledKernels) {
-  const ORTCHAR_T* ort_model_path = ORT_MODEL_FOLDER "transform/fusion/conv_relu_opset12.onnx";
+  const auto ort_model_path = ResolveInternalTestPathString(ORT_MODEL_FOLDER "transform/fusion/conv_relu_opset12.onnx");
 
   SessionOptions so;
   InferenceSessionWrapper session(so, GetEnvironment());
@@ -175,7 +202,7 @@ TEST(InternalTestingEP, TestMixOfStaticAndCompiledKernels) {
   ep->EnableStaticKernels();
   ASSERT_STATUS_OK(session.RegisterExecutionProvider(std::move(ep)));
 
-  ASSERT_STATUS_OK(session.Load(ort_model_path));
+  ASSERT_STATUS_OK(session.Load(ort_model_path.c_str()));
   ASSERT_STATUS_OK(session.Initialize());
 
   TensorShape input_shape_x{1, 1, 7, 7};
@@ -204,7 +231,8 @@ TEST(InternalTestingEP, TestMixOfStaticAndCompiledKernels) {
 
 TEST(InternalTestingEP, TestNhwcConversionOfStaticKernels) {
   auto run_test = [&](const ORTCHAR_T* model_path) {
-    SCOPED_TRACE("model path: " + ToUTF8String(model_path));
+    auto resolved_model_path = ResolveInternalTestPathString(model_path);
+    SCOPED_TRACE("model path: " + ToUTF8String(resolved_model_path.c_str()));
 
     SessionOptions so;
     // set this if you want to manually inspect the optimized model
@@ -218,7 +246,7 @@ TEST(InternalTestingEP, TestNhwcConversionOfStaticKernels) {
     ep->EnableStaticKernels();
     ASSERT_STATUS_OK(session.RegisterExecutionProvider(std::move(ep)));
 
-    ASSERT_STATUS_OK(session.Load(model_path));
+    ASSERT_STATUS_OK(session.Load(resolved_model_path.c_str()));
     ASSERT_STATUS_OK(session.Initialize());
 
     const auto& graph = session.GetGraph();
@@ -249,13 +277,11 @@ TEST(InternalTestingEP, TestNhwcConversionOfStaticKernels) {
   };
 
   // the internal NHWC domain supports opset 11 and later
-  const ORTCHAR_T* onnx_model_path = ORT_MODEL_FOLDER "squeezenet/model_opset11.onnx";
-  run_test(onnx_model_path);
+  run_test(ORT_MODEL_FOLDER "squeezenet/model_opset11.onnx");
 
   // Note: Using ORT format model with runtime optimizations so that the Conv nodes are preserved in the graph,
   // not converted into FusedConv nodes. The InternalTestingExecutionProvider handles Conv nodes.
-  const ORTCHAR_T* ort_model_path = ORT_MODEL_FOLDER "squeezenet/model_opset11.with_runtime_opt.ort";
-  run_test(ort_model_path);
+  run_test(ORT_MODEL_FOLDER "squeezenet/model_opset11.with_runtime_opt.ort");
 }
 
 // make sure allocators returned by SessionState::GetAllocator are valid when IExecutionProvider::ReplaceAllocator
@@ -283,8 +309,8 @@ TEST(InternalTestingEP, TestReplaceAllocatorDoesntBreakDueToLocalAllocatorStorag
     ASSERT_STATUS_OK(session.RegisterExecutionProvider(ep));
   }
 
-  const ORTCHAR_T* ort_model_path = ORT_MODEL_FOLDER "squeezenet/model.onnx";
-  ASSERT_STATUS_OK(session.Load(ort_model_path));
+  const auto ort_model_path = ResolveInternalTestPathString(ORT_MODEL_FOLDER "squeezenet/model.onnx");
+  ASSERT_STATUS_OK(session.Load(ort_model_path.c_str()));
   ASSERT_STATUS_OK(session.Initialize());
 
   // Need to undo the wrapping that happens in Environment::RegisterAllocator to be able to compare the pointers
@@ -301,25 +327,25 @@ TEST(InternalTestingEP, TestReplaceAllocatorDoesntBreakDueToLocalAllocatorStorag
 
 // test to validate a minimal build
 TEST(InternalTestingEP, TestLoadOrtModel) {
-  const ORTCHAR_T* ort_model_path = ORT_MODEL_FOLDER "mnist.internal_testing_ep.ort";
+  const auto ort_model_path = ResolveInternalTestPathString(ORT_MODEL_FOLDER "mnist.internal_testing_ep.ort");
 
   std::unique_ptr<InferenceSessionWrapper> session;
   bool enable_custom_ep = true;
 
-  ASSERT_STATUS_OK(CreateSession(SessionOptions{}, session, ort_model_path, enable_custom_ep));
+  ASSERT_STATUS_OK(CreateSession(SessionOptions{}, session, ort_model_path.c_str(), enable_custom_ep));
   ExecuteMnist(*session, enable_custom_ep);
 }
 
 // test that if the custom EP cannot take all nodes due to device limitations
 // that we fallback to the CPU implementations and can execute the model
 TEST(InternalTestingEP, TestLoadOrtModelWithReducedOpCoverage) {
-  const ORTCHAR_T* ort_model_path = ORT_MODEL_FOLDER "mnist.internal_testing_ep.ort";
+  const auto ort_model_path = ResolveInternalTestPathString(ORT_MODEL_FOLDER "mnist.internal_testing_ep.ort");
   const std::unordered_set<std::string> supported_ops{"Conv", "Add", "Relu" /*, "MaxPool"*/};
 
   std::unique_ptr<InferenceSessionWrapper> session;
   bool enable_custom_ep = true;
 
-  ASSERT_STATUS_OK(CreateSession(SessionOptions{}, session, ort_model_path, enable_custom_ep, &supported_ops));
+  ASSERT_STATUS_OK(CreateSession(SessionOptions{}, session, ort_model_path.c_str(), enable_custom_ep, &supported_ops));
 
   const auto& graph = session->GetGraph();
   // Conv+Add gets fused by level 1 optimizer into single node. The 'Conv'/'Add'/'Relu' nodes should be compiled and
@@ -454,7 +480,7 @@ TEST(InternalTestingEP, TestOrtModelWithCompileFailure) {
   // the layout transformation for this EP is already done at this stage and reverting
   // can result in more failures.
   // This is to test the model initialization fails if compile fails.
-  const ORTCHAR_T* ort_model_path = ORT_MODEL_FOLDER "mnist.internal_testing_ep.ort";
+  const auto ort_model_path = ResolveInternalTestPathString(ORT_MODEL_FOLDER "mnist.internal_testing_ep.ort");
 
   const std::unordered_set<std::string>& supported_ops{"Conv", "Gemm"};
   const std::unordered_set<std::string>& compile_failure_ops{"Gemm"};
diff --git a/onnxruntime/test/mlas/bench/bench_sconv.cpp b/onnxruntime/test/mlas/bench/bench_sconv.cpp
index 4b6128f7e19da..9df09728ffa17 100644
--- a/onnxruntime/test/mlas/bench/bench_sconv.cpp
+++ b/onnxruntime/test/mlas/bench/bench_sconv.cpp
@@ -110,6 +110,7 @@ void SCONV_NCHW(benchmark::State& state, const char* /*dummy*/) {
                   static_cast<size_t>(output_channels_per_group),
                   &activation,
                   &WorkingBufferSize,
+                  false,
                   0.0f,
                   nullptr);
 
@@ -217,6 +218,7 @@ void SCONV_NCHW_THREADED(benchmark::State& state, const char* /*dummy*/) {
                   static_cast<size_t>(output_channels_per_group),
                   &activation,
                   &WorkingBufferSize,
+                  false,
                   0.0f,
                   tp);
 
diff --git a/onnxruntime/test/mlas/bench/bench_transcendental.cpp b/onnxruntime/test/mlas/bench/bench_transcendental.cpp
index f7e461c29843a..3d42c0f84e6cc 100644
--- a/onnxruntime/test/mlas/bench/bench_transcendental.cpp
+++ b/onnxruntime/test/mlas/bench/bench_transcendental.cpp
@@ -19,7 +19,9 @@ constexpr float kSiluMaxValue = 20.0f;
 constexpr float kGeluMinValue = -10.0f;
 constexpr float kGeluMaxValue = 10.0f;
 constexpr float kInvSqrt2 = 0.7071067811865475244f;
+#if defined(MLAS_TARGET_AMD64)
 constexpr int64_t kFusedBytesPerElement = 2;
+#endif
 constexpr int64_t kSiluUnfusedBytesPerElement = 5;
 constexpr int64_t kGeluUnfusedBytesPerElement = 7;
 
diff --git a/onnxruntime/test/mlas/unittest/test_conv2d.h b/onnxruntime/test/mlas/unittest/test_conv2d.h
index 37a844fdb4b02..6ac47c69ae0b8 100644
--- a/onnxruntime/test/mlas/unittest/test_conv2d.h
+++ b/onnxruntime/test/mlas/unittest/test_conv2d.h
@@ -4,7 +4,6 @@
 #pragma once
 
 #include <cstring>
-#include <vector>
 
 #include "test_util.h"
 
@@ -12,10 +11,6 @@
 #include "core/mlas/lib/mlasi.h"
 #endif
 
-#if defined(USE_KLEIDIAI) && defined(MLAS_ENABLE_TEST_HOOKS)
-#include "core/mlas/lib/kleidiai/mlasi_kleidiai.h"
-#endif
-
 template <bool Threaded>
 class MlasConv2DTest : public MlasTestBase {
  protected:
@@ -75,6 +70,7 @@ class MlasConv2DTest : public MlasTestBase {
                     FilterCount,
                     &Activation,
                     &WorkingBufferSize,
+                    false,
                     Beta,
                     threadpool_);
 
@@ -308,168 +304,6 @@ class MlasConv2DTest : public MlasTestBase {
 
   MlasConv2DTest() : threadpool_(Threaded ? GetMlasThreadPool() : nullptr) {}
 
-#if defined(USE_KLEIDIAI) && defined(MLAS_ENABLE_TEST_HOOKS)
-  void TestKleidiAILhsCacheIgnoresInputContent() {
-    if (!ArmKleidiAI::UseSME) {
-      return;
-    }
-
-    struct ConvGeometry {
-      const char* name;
-      size_t input_channels;
-      size_t input_height;
-      size_t input_width;
-      size_t filter_count;
-      size_t kernel_height;
-      size_t kernel_width;
-      size_t padding;
-      size_t dilation_height;
-      size_t dilation_width;
-      size_t stride_height;
-      size_t stride_width;
-    };
-
-    constexpr size_t BatchCount = 1;
-    constexpr size_t GroupCount = 1;
-    constexpr ConvGeometry geometries[] = {
-        {"padded_3x3", 16, 11, 11, 32, 3, 3, 1, 1, 1, 1, 1},
-        {"strided_3x3", 24, 13, 9, 16, 3, 3, 1, 1, 1, 2, 2},
-        {"dilated_3x3", 8, 15, 13, 20, 3, 3, 2, 2, 2, 1, 1},
-        {"wide_kernel", 12, 10, 12, 24, 3, 5, 2, 1, 1, 1, 2},
-    };
-
-    const auto fill_input = [](std::vector<float>& input, size_t seed) {
-      for (size_t i = 0; i < input.size(); ++i) {
-        input[i] = static_cast<float>((static_cast<int>((i * (seed + 3)) % 31) - 15) * 0.075f +
-                                      static_cast<float>(seed) * 0.125f);
-      }
-    };
-
-    for (const auto& geometry : geometries) {
-      SCOPED_TRACE(geometry.name);
-
-      const int64_t output_height64 =
-          ((int64_t(geometry.input_height) + 2 * int64_t(geometry.padding)) -
-           (int64_t(geometry.dilation_height) * (int64_t(geometry.kernel_height) - 1) + 1)) /
-              int64_t(geometry.stride_height) +
-          1;
-      const int64_t output_width64 =
-          ((int64_t(geometry.input_width) + 2 * int64_t(geometry.padding)) -
-           (int64_t(geometry.dilation_width) * (int64_t(geometry.kernel_width) - 1) + 1)) /
-              int64_t(geometry.stride_width) +
-          1;
-
-      ASSERT_GT(output_height64, 0);
-      ASSERT_GT(output_width64, 0);
-
-      const size_t output_height = static_cast<size_t>(output_height64);
-      const size_t output_width = static_cast<size_t>(output_width64);
-      const size_t input_elements =
-          BatchCount * GroupCount * geometry.input_channels * geometry.input_height * geometry.input_width;
-      const size_t filter_elements = GroupCount * geometry.filter_count * geometry.input_channels *
-                                     geometry.kernel_height * geometry.kernel_width;
-      const size_t bias_elements = GroupCount * geometry.filter_count;
-      const size_t output_elements = BatchCount * GroupCount * geometry.filter_count * output_height * output_width;
-
-      std::vector<float> input_a(input_elements);
-      std::vector<float> input_a_copy(input_elements);
-      std::vector<float> input_b(input_elements);
-      std::vector<float> filter(filter_elements);
-      std::vector<float> bias(bias_elements);
-      std::vector<float> output(output_elements);
-      std::vector<float> output_reference(output_elements);
-
-      fill_input(input_a, 1);
-      input_a_copy = input_a;
-      fill_input(input_b, 2);
-
-      for (size_t i = 0; i < filter_elements; ++i) {
-        filter[i] = static_cast<float>((static_cast<int>((i * 5) % 23) - 11) * 0.05f);
-      }
-
-      for (size_t i = 0; i < bias_elements; ++i) {
-        bias[i] = static_cast<float>((static_cast<int>(i % 7) - 3) * 0.02f);
-      }
-
-      const auto verify_conv = [&](const std::vector<float>& input, const char* label) {
-        SCOPED_TRACE(label);
-
-        MlasConv2D(BatchCount,
-                   GroupCount,
-                   geometry.input_channels,
-                   geometry.input_height,
-                   geometry.input_width,
-                   geometry.filter_count,
-                   geometry.kernel_height,
-                   geometry.kernel_width,
-                   geometry.padding,
-                   geometry.padding,
-                   geometry.padding,
-                   geometry.padding,
-                   geometry.dilation_height,
-                   geometry.dilation_width,
-                   geometry.stride_height,
-                   geometry.stride_width,
-                   output_height,
-                   output_width,
-                   input.data(),
-                   filter.data(),
-                   bias.data(),
-                   output.data());
-
-        ReferenceConv2D(BatchCount,
-                        GroupCount,
-                        geometry.input_channels,
-                        geometry.input_height,
-                        geometry.input_width,
-                        geometry.filter_count,
-                        geometry.kernel_height,
-                        geometry.kernel_width,
-                        geometry.padding,
-                        geometry.padding,
-                        geometry.dilation_height,
-                        geometry.dilation_width,
-                        geometry.stride_height,
-                        geometry.stride_width,
-                        output_height,
-                        output_width,
-                        input.data(),
-                        filter.data(),
-                        bias.data(),
-                        output_reference.data());
-
-        for (size_t i = 0; i < output_elements; ++i) {
-          ASSERT_TRUE(CloseEnough(output[i], output_reference[i]))
-              << "Mismatch at output index " << i
-              << ": actual=" << output[i]
-              << ", expected=" << output_reference[i];
-        }
-      };
-
-      ArmKleidiAI::MlasConvClearLhsCacheForTest();
-      EXPECT_EQ(ArmKleidiAI::MlasConvLhsCacheEntryCountForTest(), size_t{0});
-
-      verify_conv(input_a, "initial_input");
-      EXPECT_EQ(ArmKleidiAI::MlasConvLhsCacheEntryCountForTest(), size_t{1});
-
-      verify_conv(input_a_copy, "same_content_different_buffer");
-      EXPECT_EQ(ArmKleidiAI::MlasConvLhsCacheEntryCountForTest(), size_t{1})
-          << "same geometry with a different input buffer should reuse the LHS indirection cache";
-
-      verify_conv(input_b, "different_content_different_buffer");
-      EXPECT_EQ(ArmKleidiAI::MlasConvLhsCacheEntryCountForTest(), size_t{1})
-          << "same geometry with different input content should reuse the LHS indirection cache";
-
-      fill_input(input_a, 3);
-      verify_conv(input_a, "different_content_same_buffer");
-      EXPECT_EQ(ArmKleidiAI::MlasConvLhsCacheEntryCountForTest(), size_t{1})
-          << "same geometry with mutated input content should not add cache entries";
-    }
-
-    ArmKleidiAI::MlasConvClearLhsCacheForTest();
-  }
-#endif
-
 #if defined(MLAS_TARGET_AMD64)
   void TestMobileClipAvx512DispatchSelection(size_t GroupCount,
                                              size_t InputHeight,
@@ -533,6 +367,7 @@ class MlasConv2DTest : public MlasTestBase {
                     FilterCount,
                     &Activation,
                     &WorkingBufferSize,
+                    false,
                     0.0f,
                     threadpool_);
 
@@ -845,9 +680,6 @@ class MlasConv2DTest : public MlasTestBase {
   }
 
   void ExecuteShort(void) override {
-#if defined(USE_KLEIDIAI) && defined(MLAS_ENABLE_TEST_HOOKS)
-    TestKleidiAILhsCacheIgnoresInputContent();
-#endif
 #if defined(MLAS_TARGET_AMD64)
     TestMobileClipAvx512DispatchSelection(64, 64, 64);
     TestMobileClipAvx512DispatchSelection(128, 32, 32);
diff --git a/onnxruntime/test/optimizer/conv_add_act_test.cc b/onnxruntime/test/optimizer/conv_add_act_test.cc
index bb409a2bbb82e..03ca950050d64 100644
--- a/onnxruntime/test/optimizer/conv_add_act_test.cc
+++ b/onnxruntime/test/optimizer/conv_add_act_test.cc
@@ -31,8 +31,9 @@ void TestConvPath(const std::vector<int64_t>& input_shape, const std::vector<int
   auto check_graph = [&](InferenceSessionWrapper& session) {
     auto op_to_count = CountOpsInGraph(session.GetGraph());
     EXPECT_EQ(op_to_count["com.microsoft.FusedConv"], 1);
+    EXPECT_EQ(op_to_count["com.microsoft.NhwcFusedConv"], 0);
   };
-  InlinedHashSet<std::string> disabled_optimizers = {"NchwcTransformer"};
+  InlinedHashSet<std::string> disabled_optimizers = {"NchwcTransformer", "NhwcTransformer"};
   TransformerTester(build_test_case,
                     check_graph,
                     TransformerLevel::Default,
diff --git a/onnxruntime/test/optimizer/fuse_initializers_transformer_test.cc b/onnxruntime/test/optimizer/fuse_initializers_transformer_test.cc
index de973679c8f80..b1997701e132f 100644
--- a/onnxruntime/test/optimizer/fuse_initializers_transformer_test.cc
+++ b/onnxruntime/test/optimizer/fuse_initializers_transformer_test.cc
@@ -363,6 +363,9 @@ TEST(TransformerTest, FuseFp16InitializersWithFp32Node_with_graph_optimizations_
 
   // Create session and check graph before / after initiation
   InferenceSessionWrapper session{so, GetEnvironment()};
+  // Keep this test focused on FuseInitializersTransformer/NCHWC behavior. NhwcTransformer is
+  // hardware/kernel dependent and can otherwise change the post-init node counts this test asserts on.
+  ASSERT_STATUS_OK(session.FilterEnabledOptimizers({"NhwcTransformer"}));
   ASSERT_STATUS_OK(session.Load(model_uri));
   test_graph_structure_at_init(session.GetGraph());
   ASSERT_STATUS_OK(session.Initialize());
@@ -402,6 +405,9 @@ TEST(TransformerTest, FuseFp16InitializersWithFp32Node_with_graph_optimizations_
 
   // Create session and check graph before / after initiation
   InferenceSessionWrapper session{so, GetEnvironment()};
+  // Keep this test focused on FuseInitializersTransformer/NCHWC behavior. NhwcTransformer is
+  // hardware/kernel dependent and can otherwise change the post-init node counts this test asserts on.
+  ASSERT_STATUS_OK(session.FilterEnabledOptimizers({"NhwcTransformer"}));
   ASSERT_STATUS_OK(session.Load(model_uri));
   test_graph_structure_at_init(session.GetGraph());
   ASSERT_STATUS_OK(session.Initialize());
@@ -443,6 +449,9 @@ TEST(TransformerTest, FuseFp16InitializersWithFp32Node_with_graph_optimizations_
 
   // Create session and check graph before / after initiation
   InferenceSessionWrapper session{so, GetEnvironment()};
+  // Keep this test focused on FuseInitializersTransformer/NCHWC behavior. NhwcTransformer is
+  // hardware/kernel dependent and can otherwise change the post-init node counts this test asserts on.
+  ASSERT_STATUS_OK(session.FilterEnabledOptimizers({"NhwcTransformer"}));
   ASSERT_STATUS_OK(session.Load(model_uri));
   test_graph_structure_at_init(session.GetGraph());
   ASSERT_STATUS_OK(session.Initialize());
diff --git a/onnxruntime/test/optimizer/graph_transform_test.cc b/onnxruntime/test/optimizer/graph_transform_test.cc
index 1357ede82017a..0e4ab5c2d3b73 100644
--- a/onnxruntime/test/optimizer/graph_transform_test.cc
+++ b/onnxruntime/test/optimizer/graph_transform_test.cc
@@ -76,6 +76,7 @@
 #include "core/optimizer/rule_based_graph_transformer.h"
 #include "core/optimizer/slice_concat_to_space_to_depth_fusion.h"
 #include "core/optimizer/slice_elimination.h"
+#include "core/optimizer/stft_decomposition.h"
 #include "core/optimizer/unsqueeze_elimination.h"
 #include "core/optimizer/utils.h"
 #include "core/platform/env.h"
@@ -1521,6 +1522,201 @@ TEST_F(GraphTransformationTests, ConstantFoldingIfConstantInliningEdgesWithMiddl
   ASSERT_TRUE(dest_edges.find(2) != dest_edges.end());
 }
 
+// Test that constant folding respects the output size limit and skips nodes
+// whose output would exceed it. This is a security measure against malicious
+// models that could cause memory exhaustion during optimization.
+TEST_F(GraphTransformationTests, ConstantFoldingOutputSizeLimit) {
+  // Build a model with an Expand node: scalar input [1.0] expanded by shape [1024, 1024].
+  // Output = 1024*1024 * 4 bytes = 4 MB of float data.
+  // With a 1 MB limit, this should NOT be constant folded.
+  // With a 8 MB limit, this SHOULD be constant folded.
+
+  auto build_model = [&](ModelTestBuilder& builder) {
+    auto* input_data = builder.MakeInitializer<float>({1}, {1.0f});
+    auto* shape_data = builder.MakeInitializer<int64_t>({2}, {1024, 1024});
+    auto* output_arg = builder.MakeOutput();
+
+    builder.AddNode("Expand", {input_data, shape_data}, {output_arg});
+  };
+
+  // Test 1: With a 1 MB limit, the Expand node should NOT be folded (output is ~4 MB).
+  {
+    auto pre_graph_checker = [](Graph& graph) -> Status {
+      auto op_to_count = CountOpsInGraph(graph);
+      TEST_RETURN_IF_NOT(op_to_count["Expand"] == 1);
+      return Status::OK();
+    };
+
+    auto post_graph_checker = [](Graph& graph) -> Status {
+      auto op_to_count = CountOpsInGraph(graph);
+      // Expand should remain because output is too large
+      TEST_RETURN_IF_NOT(op_to_count["Expand"] == 1);
+      return Status::OK();
+    };
+
+    std::unique_ptr<CPUExecutionProvider> e = std::make_unique<CPUExecutionProvider>(CPUExecutionProviderInfo());
+    ConfigOptions config_options;
+    // Set limit to 1 MB
+    ASSERT_STATUS_OK(config_options.AddConfigEntry(
+        kOrtSessionOptionsConstantFoldingMaxOutputSizeInBytes, "1048576"));
+
+    ASSERT_STATUS_OK(TestGraphTransformer(build_model, 14, *logger_,
+                                          std::make_unique<ConstantFolding>(*e.get(), false, config_options),
+                                          TransformerLevel::Level1, 1,
+                                          pre_graph_checker, post_graph_checker));
+  }
+
+  // Test 2: With an 8 MB limit, the Expand node SHOULD be folded (output is ~4 MB).
+  {
+    auto pre_graph_checker = [](Graph& graph) -> Status {
+      auto op_to_count = CountOpsInGraph(graph);
+      TEST_RETURN_IF_NOT(op_to_count["Expand"] == 1);
+      return Status::OK();
+    };
+
+    auto post_graph_checker = [](Graph& graph) -> Status {
+      auto op_to_count = CountOpsInGraph(graph);
+      // Expand should be folded since output is within limit
+      TEST_RETURN_IF_NOT(op_to_count["Expand"] == 0);
+      return Status::OK();
+    };
+
+    std::unique_ptr<CPUExecutionProvider> e = std::make_unique<CPUExecutionProvider>(CPUExecutionProviderInfo());
+    ConfigOptions config_options;
+    // Set limit to 8 MB
+    ASSERT_STATUS_OK(config_options.AddConfigEntry(
+        kOrtSessionOptionsConstantFoldingMaxOutputSizeInBytes, "8388608"));
+
+    ASSERT_STATUS_OK(TestGraphTransformer(build_model, 14, *logger_,
+                                          std::make_unique<ConstantFolding>(*e.get(), false, config_options),
+                                          TransformerLevel::Level1, 1,
+                                          pre_graph_checker, post_graph_checker));
+  }
+}
+
+// Test that an explicitly configured constant folding output-size limit blocks
+// folding a very large ConstantOfShape output.
+TEST_F(GraphTransformationTests, ConstantFoldingConfiguredLimitBlocksLargeConstantOfShape) {
+  // Build a model with a ConstantOfShape node producing a huge output.
+  // Shape = [16384, 16384] = 268M elements * 4 bytes = 1 GB.
+  // Use an explicit 512 MB limit so the 1 GB output is not folded.
+
+  auto build_model = [&](ModelTestBuilder& builder) {
+    auto* shape_data = builder.MakeInitializer<int64_t>({2}, {16384, 16384});
+    auto* output_arg = builder.MakeOutput();
+
+    auto& node = builder.AddNode("ConstantOfShape", {shape_data}, {output_arg});
+    // Default value is float 0.0
+    ONNX_NAMESPACE::AttributeProto value_attr;
+    value_attr.set_name("value");
+    value_attr.set_type(ONNX_NAMESPACE::AttributeProto_AttributeType_TENSOR);
+    auto* tensor = value_attr.mutable_t();
+    tensor->set_data_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT);
+    tensor->add_dims(1);
+    tensor->add_float_data(0.0f);
+    node.AddAttributeProto(std::move(value_attr));
+  };
+
+  auto pre_graph_checker = [](Graph& graph) -> Status {
+    auto op_to_count = CountOpsInGraph(graph);
+    TEST_RETURN_IF_NOT(op_to_count["ConstantOfShape"] == 1);
+    return Status::OK();
+  };
+
+  auto post_graph_checker = [](Graph& graph) -> Status {
+    auto op_to_count = CountOpsInGraph(graph);
+    // ConstantOfShape should remain because output is too large (1 GB > 512 MB limit)
+    TEST_RETURN_IF_NOT(op_to_count["ConstantOfShape"] == 1);
+    return Status::OK();
+  };
+
+  std::unique_ptr<CPUExecutionProvider> e = std::make_unique<CPUExecutionProvider>(CPUExecutionProviderInfo());
+  ConfigOptions config_options;
+  // Set limit to 512 MB so the 1 GB output is blocked
+  ASSERT_STATUS_OK(config_options.AddConfigEntry(
+      kOrtSessionOptionsConstantFoldingMaxOutputSizeInBytes, "536870912"));
+
+  ASSERT_STATUS_OK(TestGraphTransformer(build_model, 14, *logger_,
+                                        std::make_unique<ConstantFolding>(*e.get(), false, config_options),
+                                        TransformerLevel::Level1, 1,
+                                        pre_graph_checker, post_graph_checker));
+}
+
+// Test that small constant folding still works with the size limit.
+TEST_F(GraphTransformationTests, ConstantFoldingSmallOutputAllowed) {
+  // Build a model with a small Expand: scalar -> [4, 4] = 16 * 4 = 64 bytes.
+  // This is well within even a small limit and should be folded.
+
+  auto build_model = [&](ModelTestBuilder& builder) {
+    auto* input_data = builder.MakeInitializer<float>({1}, {42.0f});
+    auto* shape_data = builder.MakeInitializer<int64_t>({2}, {4, 4});
+    auto* output_arg = builder.MakeOutput();
+
+    builder.AddNode("Expand", {input_data, shape_data}, {output_arg});
+  };
+
+  auto pre_graph_checker = [](Graph& graph) -> Status {
+    auto op_to_count = CountOpsInGraph(graph);
+    TEST_RETURN_IF_NOT(op_to_count["Expand"] == 1);
+    return Status::OK();
+  };
+
+  auto post_graph_checker = [](Graph& graph) -> Status {
+    auto op_to_count = CountOpsInGraph(graph);
+    // Small Expand should be constant folded
+    TEST_RETURN_IF_NOT(op_to_count["Expand"] == 0);
+    return Status::OK();
+  };
+
+  std::unique_ptr<CPUExecutionProvider> e = std::make_unique<CPUExecutionProvider>(CPUExecutionProviderInfo());
+  const ConfigOptions empty_config_options;
+
+  ASSERT_STATUS_OK(TestGraphTransformer(build_model, 14, *logger_,
+                                        std::make_unique<ConstantFolding>(*e.get(), false, empty_config_options),
+                                        TransformerLevel::Level1, 1,
+                                        pre_graph_checker, post_graph_checker));
+}
+
+// Test that constant folding gracefully handles an Expand node whose output shape
+// dimensions would cause integer overflow. This simulates the attack vector where
+// a malicious model embeds constant initializers with extreme shape values, causing
+// kernel Compute() to execute during graph optimization. The SafeInt-protected
+// arithmetic in Expand::Compute (or TensorShape overflow) should be caught by the
+// try/catch around Compute, and the node should NOT be constant folded.
+TEST_F(GraphTransformationTests, ConstantFoldingExpandOverflowDimsSkipped) {
+  constexpr int64_t kLargeDim = int64_t(1) << 32;  // 4294967296
+
+  auto build_model = [&](ModelTestBuilder& builder) {
+    auto* input_data = builder.MakeInitializer<float>({1}, {1.0f});
+    auto* shape_data = builder.MakeInitializer<int64_t>({2}, {kLargeDim, kLargeDim});
+    auto* output_arg = builder.MakeOutput();
+
+    builder.AddNode("Expand", {input_data, shape_data}, {output_arg});
+  };
+
+  auto pre_graph_checker = [](Graph& graph) -> Status {
+    auto op_to_count = CountOpsInGraph(graph);
+    TEST_RETURN_IF_NOT(op_to_count["Expand"] == 1);
+    return Status::OK();
+  };
+
+  auto post_graph_checker = [](Graph& graph) -> Status {
+    auto op_to_count = CountOpsInGraph(graph);
+    // Expand should remain because the overflow prevents constant folding.
+    TEST_RETURN_IF_NOT(op_to_count["Expand"] == 1);
+    return Status::OK();
+  };
+
+  std::unique_ptr<CPUExecutionProvider> e =
+      std::make_unique<CPUExecutionProvider>(CPUExecutionProviderInfo());
+  const ConfigOptions empty_config_options;
+
+  ASSERT_STATUS_OK(TestGraphTransformer(build_model, 14, *logger_,
+                                        std::make_unique<ConstantFolding>(*e.get(), false, empty_config_options),
+                                        TransformerLevel::Level1, 1,
+                                        pre_graph_checker, post_graph_checker));
+}
+
 // Check transformations in the case of a subgraph with constant inputs.
 TEST_F(GraphTransformationTests, SubgraphWithConstantInputs) {
   constexpr const ORTCHAR_T* model_uri = MODEL_FOLDER "constant-subgraph.onnx";
@@ -10230,5 +10426,65 @@ TEST_F(GraphTransformationTests, DivMulFusion_MultiElementInitializer) {
 // `dropout_elimination.cc` remains as pure defense-in-depth against future
 // internal callers that may bypass shape inference.
 
+// These tests verify that STFTDecomposition skips malformed models
+// instead of crashing with OOB writes from negative initializer values.
+TEST_F(GraphTransformationTests, STFTDecomposition_NegativeFrameLength) {
+  constexpr const ORTCHAR_T* model_uri = MODEL_FOLDER "stft_negative_frame_length.onnx";
+  std::shared_ptr<Model> model;
+  ASSERT_STATUS_OK(Model::Load(model_uri, model, nullptr, *logger_));
+  Graph& graph = model->MainGraph();
+  std::map<std::string, int> op_to_count = CountOpsInGraph(graph);
+  ASSERT_EQ(op_to_count["STFT"], 1);
+
+  const InlinedHashSet<std::string_view> empty_ep = {};
+  auto stft_transformer = std::make_unique<STFTDecomposition>(empty_ep);
+  onnxruntime::GraphTransformerManager graph_transformation_mgr{5};
+  ASSERT_STATUS_OK(graph_transformation_mgr.Register(std::move(stft_transformer), TransformerLevel::Level1));
+  ASSERT_STATUS_OK(graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level1, *logger_));
+
+  // STFT node should NOT be decomposed — transformer skips invalid models
+  op_to_count = CountOpsInGraph(graph);
+  ASSERT_EQ(op_to_count["STFT"], 1);
+}
+
+TEST_F(GraphTransformationTests, STFTDecomposition_NegativeFrameStep) {
+  constexpr const ORTCHAR_T* model_uri = MODEL_FOLDER "stft_negative_frame_step.onnx";
+  std::shared_ptr<Model> model;
+  ASSERT_STATUS_OK(Model::Load(model_uri, model, nullptr, *logger_));
+  Graph& graph = model->MainGraph();
+  std::map<std::string, int> op_to_count = CountOpsInGraph(graph);
+  ASSERT_EQ(op_to_count["STFT"], 1);
+
+  const InlinedHashSet<std::string_view> empty_ep = {};
+  auto stft_transformer = std::make_unique<STFTDecomposition>(empty_ep);
+  onnxruntime::GraphTransformerManager graph_transformation_mgr{5};
+  ASSERT_STATUS_OK(graph_transformation_mgr.Register(std::move(stft_transformer), TransformerLevel::Level1));
+  ASSERT_STATUS_OK(graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level1, *logger_));
+
+  // STFT node should NOT be decomposed — transformer skips invalid models
+  op_to_count = CountOpsInGraph(graph);
+  ASSERT_EQ(op_to_count["STFT"], 1);
+}
+
+TEST_F(GraphTransformationTests, STFTDecomposition_NoWindowInput) {
+  constexpr const ORTCHAR_T* model_uri = MODEL_FOLDER "stft_no_window.onnx";
+  std::shared_ptr<Model> model;
+  ASSERT_STATUS_OK(Model::Load(model_uri, model, nullptr, *logger_));
+  Graph& graph = model->MainGraph();
+  std::map<std::string, int> op_to_count = CountOpsInGraph(graph);
+  ASSERT_EQ(op_to_count["STFT"], 1);
+
+  const InlinedHashSet<std::string_view> empty_ep = {};
+  auto stft_transformer = std::make_unique<STFTDecomposition>(empty_ep);
+  onnxruntime::GraphTransformerManager graph_transformation_mgr{5};
+  ASSERT_STATUS_OK(graph_transformation_mgr.Register(std::move(stft_transformer), TransformerLevel::Level1));
+  // Should not crash (previously dereferenced nullptr window_recipient)
+  ASSERT_STATUS_OK(graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level1, *logger_));
+
+  // Valid windowless STFT should be successfully decomposed
+  op_to_count = CountOpsInGraph(graph);
+  ASSERT_EQ(op_to_count["STFT"], 0);
+}
+
 }  // namespace test
 }  // namespace onnxruntime
diff --git a/onnxruntime/test/optimizer/nchwc_optimizer_test.cc b/onnxruntime/test/optimizer/nchwc_optimizer_test.cc
index cd210f7bc70ba..a07f173950051 100644
--- a/onnxruntime/test/optimizer/nchwc_optimizer_test.cc
+++ b/onnxruntime/test/optimizer/nchwc_optimizer_test.cc
@@ -202,6 +202,7 @@ void NchwcOptimizerTester(const std::function<void(NchwcTestHelper& helper)>& bu
     session_options.session_logid = "NchwcOptimizerTests";
     InferenceSessionWrapper session{session_options, GetEnvironment()};
     ASSERT_STATUS_OK(session.Load(model_data.data(), static_cast<int>(model_data.size())));
+    ASSERT_STATUS_OK(session.FilterEnabledOptimizers({"NhwcTransformer"}));
     ASSERT_STATUS_OK(session.Initialize());
 
     RunOptions run_options;
@@ -643,6 +644,36 @@ TEST(NchwcOptimizerTests, FusedConvAddFusion) {
   test_case(true, true, 1);
 }
 
+TEST(NchwcOptimizerTests, PreExistingFusedConvWithNchwcSumInput) {
+  auto build_test_case = [&](NchwcTestHelper& helper) {
+    auto* input_arg = helper.MakeInput<float>({1, 32, 28, 28});
+    auto* sum_arg = helper.MakeIntermediate();
+    auto* output_arg = helper.MakeOutput();
+
+    auto& sum_node = helper.AddConvNode(input_arg, sum_arg, {32, 32, 3, 3});
+    sum_node.AddAttribute("pads", std::vector<int64_t>{1, 1, 1, 1});
+
+    auto* weights_arg = helper.MakeInitializer({32, 32, 3, 3});
+    auto* bias_arg = helper.MakeInitializer({32});
+    auto& fused_conv_node =
+        helper.AddNode("FusedConv", {input_arg, weights_arg, bias_arg, sum_arg}, {output_arg}, kMSDomain);
+    fused_conv_node.AddAttribute("activation", "Relu");
+    fused_conv_node.AddAttribute("pads", std::vector<int64_t>{1, 1, 1, 1});
+    fused_conv_node.AddAttribute("strides", std::vector<int64_t>{1, 1});
+    fused_conv_node.AddAttribute("kernel_shape", std::vector<int64_t>{3, 3});
+  };
+
+  auto check_nchwc_graph = [&](InferenceSessionWrapper& session) {
+    auto op_to_count = CountOpsInGraph(session.GetGraph());
+    EXPECT_EQ(op_to_count["com.microsoft.nchwc.Conv"], 2);
+    EXPECT_EQ(op_to_count["com.microsoft.nchwc.ReorderInput"], 1);
+    EXPECT_EQ(op_to_count["com.microsoft.nchwc.ReorderOutput"], 1);
+    EXPECT_EQ(op_to_count["com.microsoft.FusedConv"], 0);
+  };
+
+  NchwcOptimizerTester(build_test_case, check_nchwc_graph);
+}
+
 TEST(NchwcOptimizerTests, ConvBinary) {
   auto test_case = [&](const std::string& op_type) {
     auto build_test_case = [&](NchwcTestHelper& helper) {
diff --git a/onnxruntime/test/optimizer/nhwc_transformer_test.cc b/onnxruntime/test/optimizer/nhwc_transformer_test.cc
index 21ea7af4e7389..b73929efab8a6 100644
--- a/onnxruntime/test/optimizer/nhwc_transformer_test.cc
+++ b/onnxruntime/test/optimizer/nhwc_transformer_test.cc
@@ -1,14 +1,23 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
+#include <array>
 #include <random>
+#include <string_view>
 #include <vector>
 
 #include "gtest/gtest.h"
-#include "test/unittest_util/graph_transform_test_builder.h"
+#include "core/framework/kernel_registry.h"
 #include "core/mlas/inc/mlas.h"
+#include "core/providers/common.h"
+#include "core/session/onnxruntime_session_options_config_keys.h"
+#include "test/unittest_util/graph_transform_test_builder.h"
+#if defined(USE_KLEIDIAI) && defined(MLAS_TARGET_ARM64)
+#include "core/mlas/lib/mlasi.h"
+#endif
 #include "core/graph/graph.h"
 #include "test/common/dnnl_op_test_utils.h"
+#include "test/util/include/test_environment.h"
 
 namespace onnxruntime {
 namespace test {
@@ -33,6 +42,180 @@ NodeArg* NhwcMakeInitializer(ModelTestBuilder& builder, const std::vector<int64_
                                     NhwcWeightsRange<T>::max_value);
 }
 
+#if defined(USE_KLEIDIAI) && defined(MLAS_TARGET_ARM64)
+static bool HasFloatNhwcFusedConvKernel() {
+  auto* cpu_ep = TestCPUExecutionProvider();
+  auto kernel_registry = cpu_ep->GetKernelRegistry();
+  if (!kernel_registry) {
+    return false;
+  }
+
+  KernelRegistry::TypeConstraintMap type_constraints{
+      {"T", DataTypeImpl::GetTensorType<float>()},
+  };
+
+  const KernelCreateInfo* kernel_create_info{};
+  const auto status = kernel_registry->TryFindKernel(
+      kCpuExecutionProvider,
+      "NhwcFusedConv",
+      kMSDomain,
+      1,
+      type_constraints,
+      DefaultLoggingManager().DefaultLogger(),
+      &kernel_create_info);
+
+  return status.IsOK() && kernel_create_info != nullptr;
+}
+#endif
+
+static bool HasFloatNhwcNoTransposeSupport(const std::vector<int64_t>& input_shape,
+                                           const std::vector<int64_t>& weight_shape,
+                                           std::vector<int64_t> pads = {},
+                                           std::vector<int64_t> strides = {},
+                                           std::vector<int64_t> dilations = {},
+                                           int64_t group = 1,
+                                           bool has_sum_input = false,
+                                           std::string_view auto_pad = "NOTSET") {
+#if defined(USE_KLEIDIAI) && defined(MLAS_TARGET_ARM64)
+  if (!HasFloatNhwcFusedConvKernel() || !MLAS_CPUIDINFO::GetCPUIDInfo().HasArm_SME()) {
+    return false;
+  }
+
+  if (has_sum_input || group != 1 || input_shape.size() != 4 || weight_shape.size() != 4) {
+    return false;
+  }
+
+  std::array<size_t, 2> input_spatial_shape{
+      narrow<size_t>(input_shape[2]),
+      narrow<size_t>(input_shape[3]),
+  };
+  std::array<size_t, 2> kernel_spatial_shape{
+      narrow<size_t>(weight_shape[2]),
+      narrow<size_t>(weight_shape[3]),
+  };
+  std::array<size_t, 2> strides_size_t{1, 1};
+  std::array<size_t, 2> dilations_size_t{1, 1};
+  std::array<size_t, 4> pads_size_t{};
+
+  if (!strides.empty()) {
+    if (strides.size() != strides_size_t.size()) {
+      return false;
+    }
+
+    for (size_t i = 0; i < strides_size_t.size(); ++i) {
+      if (strides[i] < 0) {
+        return false;
+      }
+
+      strides_size_t[i] = narrow<size_t>(strides[i]);
+    }
+  }
+
+  if (!dilations.empty()) {
+    if (dilations.size() != dilations_size_t.size()) {
+      return false;
+    }
+
+    for (size_t i = 0; i < dilations_size_t.size(); ++i) {
+      if (dilations[i] < 0) {
+        return false;
+      }
+
+      dilations_size_t[i] = narrow<size_t>(dilations[i]);
+    }
+  }
+
+  const AutoPadType auto_pad_type = StringToAutoPadType(std::string(auto_pad));
+  if (auto_pad_type == AutoPadType::NOTSET) {
+    if (pads.empty()) {
+      pads_size_t.fill(0);
+    } else {
+      if (pads.size() != pads_size_t.size()) {
+        return false;
+      }
+
+      for (size_t i = 0; i < pads_size_t.size(); ++i) {
+        if (pads[i] < 0) {
+          return false;
+        }
+
+        pads_size_t[i] = narrow<size_t>(pads[i]);
+      }
+    }
+  } else {
+    for (size_t i = 0; i < 2; ++i) {
+      int64_t pad_head = 0;
+      int64_t pad_tail = 0;
+      int64_t out_dim = 0;
+      const auto status = ComputePadAndOutputShape(
+          input_shape[2 + i],
+          narrow<int64_t>(strides_size_t[i]),
+          weight_shape[2 + i],
+          narrow<int64_t>(dilations_size_t[i]),
+          auto_pad_type,
+          pad_head,
+          pad_tail,
+          out_dim,
+          /*force_symmetric_auto_padding*/ false);
+      if (!status.IsOK() || pad_head < 0 || pad_tail < 0 || out_dim < 0) {
+        return false;
+      }
+
+      pads_size_t[i] = narrow<size_t>(pad_head);
+      pads_size_t[i + 2] = narrow<size_t>(pad_tail);
+    }
+  }
+
+  return MlasConvSupportsSymmetricChannelsLast2DFloatKernel(
+      /*Dimensions*/ 2,
+      narrow<size_t>(input_shape[0]),
+      /*GroupCount*/ 1,
+      input_spatial_shape.data(),
+      kernel_spatial_shape.data(),
+      dilations_size_t.data(),
+      pads_size_t.data(),
+      strides_size_t.data(),
+      narrow<size_t>(weight_shape[0]),
+      /*Beta*/ 0.0f);
+#else
+  ORT_UNUSED_PARAMETER(input_shape);
+  ORT_UNUSED_PARAMETER(weight_shape);
+  ORT_UNUSED_PARAMETER(pads);
+  ORT_UNUSED_PARAMETER(strides);
+  ORT_UNUSED_PARAMETER(dilations);
+  ORT_UNUSED_PARAMETER(group);
+  ORT_UNUSED_PARAMETER(has_sum_input);
+  ORT_UNUSED_PARAMETER(auto_pad);
+  return false;
+#endif
+}
+
+#ifdef MLAS_F16VEC_INTRINSICS_SUPPORTED
+static bool HasFp16NhwcFusedConvKernel() {
+  auto* cpu_ep = TestCPUExecutionProvider();
+  auto kernel_registry = cpu_ep->GetKernelRegistry();
+  if (!kernel_registry) {
+    return false;
+  }
+
+  KernelRegistry::TypeConstraintMap type_constraints{
+      {"T", DataTypeImpl::GetTensorType<MLFloat16>()},
+  };
+
+  const KernelCreateInfo* kernel_create_info{};
+  const auto status = kernel_registry->TryFindKernel(
+      kCpuExecutionProvider,
+      "NhwcFusedConv",
+      kMSDomain,
+      1,
+      type_constraints,
+      DefaultLoggingManager().DefaultLogger(),
+      &kernel_create_info);
+
+  return status.IsOK() && kernel_create_info != nullptr;
+}
+#endif
+
 #ifndef DISABLE_CONTRIB_OPS
 
 TEST(NhwcTransformerTests, Conv) {
@@ -224,6 +407,196 @@ TEST(NhwcTransformerTests, ConvGlobalAveragePool) {
                     TransformerLevel::Level3);
 }
 
+TEST(NhwcTransformerTests, ConvDepthwiseFloat_SkipNhwc) {
+  auto build_test_case = [&](ModelTestBuilder& builder) {
+    auto* input_arg = builder.MakeInput<float>({1, 8, 7, 7}, -1.0f, 1.0f);
+    auto* weight_arg = builder.MakeInitializer<float>({8, 1, 3, 3}, -1.0f, 1.0f);
+    auto* output_arg = builder.MakeOutput();
+
+    Node& conv_node = builder.AddConvNode(input_arg, weight_arg, output_arg);
+    conv_node.AddAttribute("group", static_cast<int64_t>(8));
+  };
+
+  auto check_nhwc_graph = [&](InferenceSessionWrapper& session) {
+    auto op_to_count = CountOpsInGraph(session.GetGraph());
+    const bool expect_nhwc = HasFloatNhwcNoTransposeSupport({1, 8, 7, 7}, {8, 1, 3, 3}, {}, {}, {}, 8);
+    const int expected_nhwc_fused_conv = expect_nhwc ? 1 : 0;
+    const int expected_transposes = expect_nhwc ? 2 : 0;
+    EXPECT_EQ(op_to_count["com.microsoft.NhwcFusedConv"], expected_nhwc_fused_conv);
+    EXPECT_EQ(op_to_count["Transpose"], expected_transposes);
+  };
+
+  TransformerTester(build_test_case,
+                    check_nhwc_graph,
+                    TransformerLevel::Level2,
+                    TransformerLevel::Level3,
+                    /*opset_version*/ 12,
+                    /*per_sample_tolerance*/ 1e-6,
+                    /*relative_per_sample_tolerance*/ 1e-6);
+}
+
+TEST(NhwcTransformerTests, ConvFloat_UsesNhwcOnlyWithKleidi) {
+  auto build_test_case = [&](ModelTestBuilder& builder) {
+    auto* input_arg = builder.MakeInput<float>({1, 8, 7, 7}, -1.0f, 1.0f);
+    auto* weight_arg = builder.MakeInitializer<float>({16, 8, 3, 3}, -1.0f, 1.0f);
+    auto* output_arg = builder.MakeOutput();
+
+    Node& conv_node = builder.AddConvNode(input_arg, weight_arg, output_arg);
+    conv_node.AddAttribute("pads", std::vector<int64_t>{1, 1, 1, 1});
+  };
+
+  auto check_nhwc_graph = [&](InferenceSessionWrapper& session) {
+    auto op_to_count = CountOpsInGraph(session.GetGraph());
+    const bool expect_nhwc = HasFloatNhwcNoTransposeSupport({1, 8, 7, 7}, {16, 8, 3, 3}, {1, 1, 1, 1});
+    const int expected_nhwc_fused_conv = expect_nhwc ? 1 : 0;
+    const int expected_transposes = expect_nhwc ? 2 : 0;
+    EXPECT_EQ(op_to_count["com.microsoft.NhwcFusedConv"], expected_nhwc_fused_conv);
+    EXPECT_EQ(op_to_count["Transpose"], expected_transposes);
+  };
+
+  TransformerTester(build_test_case,
+                    check_nhwc_graph,
+                    TransformerLevel::Level2,
+                    TransformerLevel::Level3,
+                    /*opset_version*/ 12,
+                    /*per_sample_tolerance*/ 1e-6,
+                    /*relative_per_sample_tolerance*/ 1e-6);
+}
+
+TEST(NhwcTransformerTests, ConvFloat_RespectsKleidiDisableConfig) {
+  if (!HasFloatNhwcNoTransposeSupport({1, 8, 7, 7}, {16, 8, 3, 3}, {1, 1, 1, 1})) {
+    GTEST_SKIP() << "Float NHWC KleidiAI path is not available on this configuration.";
+  }
+
+  auto build_test_case = [&](ModelTestBuilder& builder) {
+    auto* input_arg = builder.MakeInput<float>({1, 8, 7, 7}, -1.0f, 1.0f);
+    auto* weight_arg = builder.MakeInitializer<float>({16, 8, 3, 3}, -1.0f, 1.0f);
+    auto* output_arg = builder.MakeOutput();
+
+    Node& conv_node = builder.AddConvNode(input_arg, weight_arg, output_arg);
+    conv_node.AddAttribute("pads", std::vector<int64_t>{1, 1, 1, 1});
+  };
+
+  auto check_nhwc_graph = [&](InferenceSessionWrapper& session) {
+    auto op_to_count = CountOpsInGraph(session.GetGraph());
+    EXPECT_EQ(op_to_count["com.microsoft.NhwcFusedConv"], 0);
+    EXPECT_EQ(op_to_count["Transpose"], 0);
+  };
+
+  auto add_session_options = [](SessionOptions& session_options) {
+    const auto status = session_options.config_options.AddConfigEntry(kOrtSessionOptionsMlasDisableKleidiAi, "1");
+    ASSERT_TRUE(status.IsOK()) << status.ErrorMessage();
+  };
+
+  TransformerTester(build_test_case,
+                    check_nhwc_graph,
+                    TransformerLevel::Level2,
+                    TransformerLevel::Level3,
+                    /*opset_version*/ 12,
+                    /*per_sample_tolerance*/ 1e-6,
+                    /*relative_per_sample_tolerance*/ 1e-6,
+                    nullptr,
+                    add_session_options);
+}
+
+TEST(NhwcTransformerTests, FusedConvFloat_UsesNhwcOnlyWithKleidi) {
+  auto build_test_case = [&](ModelTestBuilder& builder) {
+    auto* input_arg = builder.MakeInput<float>({1, 8, 7, 7}, -1.0f, 1.0f);
+    auto* weight_arg = builder.MakeInitializer<float>({16, 8, 3, 3}, -1.0f, 1.0f);
+    auto* bias_arg = builder.MakeInitializer<float>({16}, -0.5f, 0.5f);
+    auto* output_arg = builder.MakeOutput();
+
+    Node& fused_conv_node = builder.AddNode("FusedConv", {input_arg, weight_arg, bias_arg}, {output_arg}, kMSDomain);
+    fused_conv_node.AddAttribute("activation", "Relu");
+    fused_conv_node.AddAttribute("pads", std::vector<int64_t>{1, 1, 1, 1});
+    fused_conv_node.AddAttribute("strides", std::vector<int64_t>{1, 1});
+    fused_conv_node.AddAttribute("kernel_shape", std::vector<int64_t>{3, 3});
+  };
+
+  auto check_nhwc_graph = [&](InferenceSessionWrapper& session) {
+    auto op_to_count = CountOpsInGraph(session.GetGraph());
+    const bool expect_nhwc = HasFloatNhwcNoTransposeSupport(
+        {1, 8, 7, 7}, {16, 8, 3, 3}, {1, 1, 1, 1}, {1, 1});
+    const int expected_nhwc_fused_conv = expect_nhwc ? 1 : 0;
+    const int expected_transposes = expect_nhwc ? 2 : 0;
+    EXPECT_EQ(op_to_count["com.microsoft.NhwcFusedConv"], expected_nhwc_fused_conv);
+    EXPECT_EQ(op_to_count["Transpose"], expected_transposes);
+  };
+
+  TransformerTester(build_test_case,
+                    check_nhwc_graph,
+                    TransformerLevel::Level2,
+                    TransformerLevel::Level3,
+                    /*opset_version*/ 12,
+                    /*per_sample_tolerance*/ 1e-6,
+                    /*relative_per_sample_tolerance*/ 1e-6);
+}
+
+TEST(NhwcTransformerTests, FusedConvWithSumFloat_SkipNhwc) {
+  auto build_test_case = [&](ModelTestBuilder& builder) {
+    auto* input_arg = builder.MakeInput<float>({1, 8, 7, 7}, -1.0f, 1.0f);
+    auto* weight_arg = builder.MakeInitializer<float>({16, 8, 3, 3}, -1.0f, 1.0f);
+    auto* bias_arg = builder.MakeInitializer<float>({16}, -0.5f, 0.5f);
+    auto* sum_arg = builder.MakeInput<float>({1, 16, 5, 5}, -1.0f, 1.0f);
+    auto* output_arg = builder.MakeOutput();
+
+    Node& fused_conv_node = builder.AddNode("FusedConv", {input_arg, weight_arg, bias_arg, sum_arg}, {output_arg}, kMSDomain);
+    fused_conv_node.AddAttribute("activation", "Relu");
+    fused_conv_node.AddAttribute("pads", std::vector<int64_t>{0, 0, 0, 0});
+    fused_conv_node.AddAttribute("strides", std::vector<int64_t>{1, 1});
+    fused_conv_node.AddAttribute("kernel_shape", std::vector<int64_t>{3, 3});
+  };
+
+  auto check_nhwc_graph = [&](InferenceSessionWrapper& session) {
+    auto op_to_count = CountOpsInGraph(session.GetGraph());
+    EXPECT_EQ(op_to_count["com.microsoft.nchwc.Conv"], 0);
+    const bool expect_nhwc = HasFloatNhwcNoTransposeSupport(
+        {1, 8, 7, 7}, {16, 8, 3, 3}, {0, 0, 0, 0}, {1, 1}, {}, 1, true);
+    const int expected_nhwc_fused_conv = expect_nhwc ? 1 : 0;
+    const int expected_transposes = expect_nhwc ? 3 : 0;
+    EXPECT_EQ(op_to_count["com.microsoft.NhwcFusedConv"], expected_nhwc_fused_conv);
+    EXPECT_EQ(op_to_count["Transpose"], expected_transposes);
+  };
+
+  TransformerTester(build_test_case,
+                    check_nhwc_graph,
+                    TransformerLevel::Level2,
+                    TransformerLevel::Level3,
+                    /*opset_version*/ 12,
+                    /*per_sample_tolerance*/ 1e-6,
+                    /*relative_per_sample_tolerance*/ 1e-6);
+}
+
+TEST(NhwcTransformerTests, ConvAutoPadFloat_SkipNhwc) {
+  auto build_test_case = [&](ModelTestBuilder& builder) {
+    auto* input_arg = builder.MakeInput<float>({1, 8, 6, 6}, -1.0f, 1.0f);
+    auto* weight_arg = builder.MakeInitializer<float>({16, 8, 3, 3}, -1.0f, 1.0f);
+    auto* output_arg = builder.MakeOutput();
+
+    Node& conv_node = builder.AddConvNode(input_arg, weight_arg, output_arg);
+    conv_node.AddAttribute("auto_pad", "SAME_UPPER");
+    conv_node.AddAttribute("strides", std::vector<int64_t>{2, 2});
+  };
+
+  auto check_nhwc_graph = [&](InferenceSessionWrapper& session) {
+    auto op_to_count = CountOpsInGraph(session.GetGraph());
+    const bool expect_nhwc = HasFloatNhwcNoTransposeSupport(
+        {1, 8, 6, 6}, {16, 8, 3, 3}, {}, {2, 2}, {}, 1, false, "SAME_UPPER");
+    const int expected_nhwc_fused_conv = expect_nhwc ? 1 : 0;
+    const int expected_transposes = expect_nhwc ? 2 : 0;
+    EXPECT_EQ(op_to_count["com.microsoft.NhwcFusedConv"], expected_nhwc_fused_conv);
+    EXPECT_EQ(op_to_count["Transpose"], expected_transposes);
+  };
+
+  TransformerTester(build_test_case,
+                    check_nhwc_graph,
+                    TransformerLevel::Level2,
+                    TransformerLevel::Level3,
+                    /*opset_version*/ 12,
+                    /*per_sample_tolerance*/ 1e-6,
+                    /*relative_per_sample_tolerance*/ 1e-6);
+}
+
 TEST(NhwcTransformerTests, ConvAveragePool) {
   DNNL_GTEST_SKIP();
 
@@ -598,6 +971,35 @@ TEST_F(NhwcTransformerTestsFp16, ConvFp16) {
   test_case({1, 22, 11, 13, 15}, {30, 22, 5, 3, 3});
 }
 
+TEST_F(NhwcTransformerTestsFp16, FusedConvWithSumFp16) {
+  auto build_test_case = [&](ModelTestBuilder& builder) {
+    auto* input_arg = MakeInputARangeFP16(builder, {1, 8, 7, 7}, MLFloat16(-1.0f), MLFloat16(1.0f));
+    auto* weight_arg = MakeInitializerARangeFP16(builder, {16, 8, 3, 3}, MLFloat16(-1.0f), MLFloat16(1.0f));
+    auto* bias_arg = MakeInitializerARangeFP16(builder, {16}, MLFloat16(-0.5f), MLFloat16(0.5f));
+    auto* sum_arg = MakeInputARangeFP16(builder, {1, 16, 5, 5}, MLFloat16(-1.0f), MLFloat16(1.0f));
+    auto* output_arg = builder.MakeOutput();
+
+    Node& fused_conv_node = builder.AddNode("FusedConv", {input_arg, weight_arg, bias_arg, sum_arg}, {output_arg}, kMSDomain);
+    fused_conv_node.AddAttribute("activation", "Relu");
+    fused_conv_node.AddAttribute("pads", std::vector<int64_t>{0, 0, 0, 0});
+    fused_conv_node.AddAttribute("strides", std::vector<int64_t>{1, 1});
+    fused_conv_node.AddAttribute("kernel_shape", std::vector<int64_t>{3, 3});
+  };
+
+  auto check_nhwc_graph = [&](InferenceSessionWrapper& session) {
+    auto op_to_count = CountOpsInGraph(session.GetGraph());
+    const int expected_nhwc_fused_conv = HasFp16NhwcFusedConvKernel() ? 1 : 0;
+    const int expected_transposes = HasFp16NhwcFusedConvKernel() ? 3 : 0;
+    EXPECT_EQ(op_to_count["com.microsoft.NhwcFusedConv"], expected_nhwc_fused_conv);
+    EXPECT_EQ(op_to_count["Transpose"], expected_transposes);
+  };
+
+  TransformerTester(build_test_case,
+                    check_nhwc_graph,
+                    TransformerLevel::Level2,
+                    TransformerLevel::Level3);
+}
+
 TEST_F(NhwcTransformerTestsFp16, ConvMaxPoolFp16) {
   auto test_case = [&](const std::vector<int64_t>& input_shape, const std::vector<int64_t>& weights_shape) {
     auto build_test_case = [&](ModelTestBuilder& builder) {
diff --git a/onnxruntime/test/optimizer/transpose_optimizer_test.cc b/onnxruntime/test/optimizer/transpose_optimizer_test.cc
index b57118bbb4ba3..080c382db5d93 100644
--- a/onnxruntime/test/optimizer/transpose_optimizer_test.cc
+++ b/onnxruntime/test/optimizer/transpose_optimizer_test.cc
@@ -4604,6 +4604,66 @@ TEST(TransposeOptimizerTests, QnnTransposeReshape) {
   }
 }
 
+// Verifies that layout transformation preserves an existing NHWC-native
+// NhwcFusedConv as-is instead of retargeting it or inserting Transpose nodes.
+TEST(TransposeOptimizerTests, LayoutTransformDoesNotRetargetNhwcFusedConv) {
+  std::unordered_map<std::string, int> domain_to_version{{kOnnxDomain, 13}, {kMSDomain, 1}};
+  Model model("LayoutTransformDoesNotRetargetNhwcFusedConv", false, ModelMetaData(), PathString(),
+              IOnnxRuntimeOpSchemaRegistryList(), domain_to_version, {},
+              DefaultLoggingManager().DefaultLogger());
+  Graph& graph = model.MainGraph();
+  ModelTestBuilder builder(graph);
+
+  auto* input_arg = builder.MakeInput<float>({1, 7, 7, 8}, -1.0f, 1.0f);
+  auto* weight_arg = builder.MakeInitializer<float>({16, 8, 3, 3}, -1.0f, 1.0f);
+  auto* bias_arg = builder.MakeInitializer<float>({16}, -0.5f, 0.5f);
+  auto* output_arg = builder.MakeOutput();
+
+  auto& nhwc_fused_conv = builder.AddNode("NhwcFusedConv", {input_arg, weight_arg, bias_arg}, {output_arg}, kMSDomain);
+  nhwc_fused_conv.AddAttribute("activation", "Relu");
+  nhwc_fused_conv.AddAttribute("pads", std::vector<int64_t>{1, 1, 1, 1});
+  nhwc_fused_conv.AddAttribute("strides", std::vector<int64_t>{1, 1});
+  nhwc_fused_conv.AddAttribute("kernel_shape", std::vector<int64_t>{3, 3});
+
+  builder.SetGraphOutputs();
+  ASSERT_STATUS_OK(graph.Resolve());
+
+  std::string model_data;
+  model.ToProto().SerializeToString(&model_data);
+
+  SessionOptions so;
+  using InternalTestingEP = internal_testing_ep::InternalTestingExecutionProvider;
+  const std::unordered_set<std::string> empty_set;
+  auto internal_testing_ep = std::make_unique<InternalTestingEP>(empty_set, empty_set, DataLayout::NHWC);
+  internal_testing_ep->EnableStaticKernels().TakeAllNodes();
+
+  InferenceSessionWrapper session{so, GetEnvironment()};
+  ASSERT_STATUS_OK(session.RegisterExecutionProvider(std::move(internal_testing_ep)));
+  ASSERT_STATUS_OK(session.Load(model_data.data(), static_cast<int>(model_data.size())));
+  ASSERT_STATUS_OK(session.Initialize());
+
+  const auto& optimized_graph = session.GetGraph();
+  const auto op_to_count = CountOpsInGraph(optimized_graph);
+  const auto get_op_count = [&op_to_count](std::string_view op_type) {
+    const auto it = op_to_count.find(std::string{op_type});
+    return it == op_to_count.end() ? 0 : it->second;
+  };
+
+  EXPECT_EQ(get_op_count("com.microsoft.NhwcFusedConv"), 1);
+  EXPECT_EQ(get_op_count("Transpose"), 0);
+
+  int nhwc_fused_conv_count = 0;
+  for (const auto& node : optimized_graph.Nodes()) {
+    if (node.OpType() == "NhwcFusedConv") {
+      ++nhwc_fused_conv_count;
+      EXPECT_EQ(node.Domain(), kMSDomain);
+      EXPECT_EQ(node.GetExecutionProviderType(), internal_testing_ep::kInternalTestingExecutionProvider);
+    }
+  }
+
+  EXPECT_EQ(nhwc_fused_conv_count, 1);
+}
+
 TEST(TransposeOptimizerTests, QnnTransposeReshapeQDQ) {
   Status status;
   auto model_uri = ORT_TSTR("testdata/layout_transform_reshape.qdq.onnx");
diff --git a/onnxruntime/test/providers/cpu/rnn/deep_cpu_lstm_op_test.cc b/onnxruntime/test/providers/cpu/rnn/deep_cpu_lstm_op_test.cc
index 3b7e93b8f7668..c23e843b6f143 100644
--- a/onnxruntime/test/providers/cpu/rnn/deep_cpu_lstm_op_test.cc
+++ b/onnxruntime/test/providers/cpu/rnn/deep_cpu_lstm_op_test.cc
@@ -1351,6 +1351,70 @@ TEST(LSTMTest, ONNXRuntime_TestLSTMZeroSeqInMiddle) {
 
 #ifndef ENABLE_TRAINING
 // Prepacking is disabled in full training build so no need to test the feature in a training build.
+TEST(LSTMTest, ONNXRuntime_TestLSTMForward_OpSet22_CUDA) {
+  auto cuda_ep = DefaultCudaExecutionProvider();
+  if (!cuda_ep) {
+    return;
+  }
+
+  constexpr int seq_len = 2, batch_size = 1;
+  constexpr int64_t input_size = 1, hidden_size = 2;
+  constexpr int num_directions = 1;
+
+  OpTester test("LSTM", 22);
+
+  test.AddAttribute<std::vector<string>>("activations", {"sigmoid", "tanh", "tanh"});
+  test.AddAttribute("direction", "forward");
+  test.AddAttribute("hidden_size", hidden_size);
+
+  // X shape: [seq_len, batch_size, input_size] = [2, 1, 1]
+  std::vector<float> X_data = {-0.455351f, -0.185934f};
+  std::vector<int64_t> X_dims = {seq_len, batch_size, input_size};
+
+  // W shape: [num_directions, 4*hidden_size, input_size] = [1, 8, 1]
+  std::vector<float> W_data = {-0.494659f, 0.0453352f,
+                               -0.487793f, 0.417264f,
+                               -0.0175329f, 0.489074f,
+                               -0.446013f, 0.414029f};
+  std::vector<int64_t> W_dims = {num_directions, 4 * hidden_size, input_size};
+
+  // R shape: [num_directions, 4*hidden_size, hidden_size] = [1, 8, 2]
+  std::vector<float> R_data = {0.146304f, -0.0243403f,
+                               -0.487793f, 0.417264f,
+                               -0.0175329f, 0.489074f,
+                               -0.446013f, 0.414029f,
+                               0.146304f, -0.0243403f,
+                               -0.487793f, 0.417264f,
+                               -0.0175329f, 0.489074f,
+                               -0.446013f, 0.414029f};
+  std::vector<int64_t> R_dims = {num_directions, 4 * hidden_size, hidden_size};
+
+  test.AddInput<float>("X", X_dims, X_data);
+  test.AddInput<float>("W", W_dims, W_data, true);
+  test.AddInput<float>("R", R_dims, R_data, true);
+
+  // B, sequence_lens, initial_h, initial_c, P - all optional, not provided
+  test.AddOptionalInputEdge<float>();  // B
+  test.AddOptionalInputEdge<int>();    // sequence_lens
+  test.AddOptionalInputEdge<float>();  // initial_h
+  test.AddOptionalInputEdge<float>();  // initial_c
+  test.AddOptionalInputEdge<float>();  // P
+
+  // Expected values computed via reference LSTM implementation with the same weights.
+  // Y (full sequence output) shape: [seq_len, num_directions, batch_size, hidden_size]
+  std::vector<int64_t> Y_dims = {seq_len, num_directions, batch_size, hidden_size};
+  test.AddOutput<float>("Y", Y_dims, {0.0616098f, -0.0416164f, 0.0455843f, -0.0476148f}, false, 1e-4f, 1e-4f);
+  // Y_h (final hidden state)
+  std::vector<int64_t> Y_h_dims = {num_directions, batch_size, hidden_size};
+  test.AddOutput<float>("Y_h", Y_h_dims, {0.0455843f, -0.0476148f}, false, 1e-4f, 1e-4f);
+  // Y_c
+  test.AddOptionalOutputEdge<float>();
+
+  std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
+  execution_providers.push_back(std::move(cuda_ep));
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers);
+}
+
 TEST(LSTMTest, SharedPrepackedWeights) {
   int64_t seq_length = 2;
   int batch_size = 2;
diff --git a/onnxruntime/test/testdata/transform/stft_negative_frame_length.onnx b/onnxruntime/test/testdata/transform/stft_negative_frame_length.onnx
new file mode 100644
index 0000000000000..6cfb53f2fd431
Binary files /dev/null and b/onnxruntime/test/testdata/transform/stft_negative_frame_length.onnx differ
diff --git a/onnxruntime/test/testdata/transform/stft_negative_frame_length.py b/onnxruntime/test/testdata/transform/stft_negative_frame_length.py
new file mode 100644
index 0000000000000..d8f00f866c211
--- /dev/null
+++ b/onnxruntime/test/testdata/transform/stft_negative_frame_length.py
@@ -0,0 +1,46 @@
+from onnx import TensorProto, helper, save
+
+
+def make_stft_model(frame_length_value, frame_step_value, has_window, filename):
+    batch = 1
+    signal_length = 16
+    dft_size = abs(frame_length_value) if frame_length_value != 0 else 4
+    onesided_bins = dft_size // 2 + 1
+    num_frames = max(1, (signal_length - dft_size) // abs(frame_step_value) + 1) if frame_step_value != 0 else 1
+
+    signal = helper.make_tensor_value_info("signal", TensorProto.FLOAT, [batch, signal_length, 1])
+    output = helper.make_tensor_value_info("output", TensorProto.FLOAT, [batch, num_frames, onesided_bins, 2])
+
+    frame_step_init = helper.make_tensor("frame_step", TensorProto.INT64, [], [frame_step_value])
+    frame_length_init = helper.make_tensor("frame_length", TensorProto.INT64, [], [frame_length_value])
+
+    window_input = "window" if has_window else ""
+    initializers = [frame_step_init, frame_length_init]
+    if has_window:
+        window_init = helper.make_tensor("window", TensorProto.FLOAT, [dft_size], [1.0] * dft_size)
+        initializers.append(window_init)
+
+    stft_node = helper.make_node(
+        "STFT",
+        inputs=["signal", "frame_step", window_input, "frame_length"],
+        outputs=["output"],
+        onesided=1,
+    )
+
+    graph = helper.make_graph(
+        [stft_node],
+        "stft_test",
+        [signal],
+        [output],
+        initializer=initializers,
+    )
+
+    model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 17)])
+    model.ir_version = 8
+    save(model, filename)
+
+
+if __name__ == "__main__":
+    make_stft_model(-2, 4, False, "stft_negative_frame_length.onnx")
+    make_stft_model(4, -2, False, "stft_negative_frame_step.onnx")
+    make_stft_model(4, 4, False, "stft_no_window.onnx")
diff --git a/onnxruntime/test/testdata/transform/stft_negative_frame_step.onnx b/onnxruntime/test/testdata/transform/stft_negative_frame_step.onnx
new file mode 100644
index 0000000000000..0782996ad09ee
Binary files /dev/null and b/onnxruntime/test/testdata/transform/stft_negative_frame_step.onnx differ
diff --git a/onnxruntime/test/testdata/transform/stft_no_window.onnx b/onnxruntime/test/testdata/transform/stft_no_window.onnx
new file mode 100644
index 0000000000000..5b26a0cdbbb1b
Binary files /dev/null and b/onnxruntime/test/testdata/transform/stft_no_window.onnx differ