kendryte · xhuohai · Oct 14, 2025 · Oct 14, 2025 · Oct 15, 2025 · Oct 15, 2025
diff --git a/modules/Nncase.Modules.NTT/CodeGen/CPU/CSourceConvertVisitor.cs b/modules/Nncase.Modules.NTT/CodeGen/CPU/CSourceConvertVisitor.cs
@@ -119,7 +119,7 @@ public void IndWrite(string? value)
 /// <summary>
 /// convert single prim function to c source.
 /// </summary>
-public abstract class CSourceConvertVisitor : ExprFunctor<CSymbol, Unit>
+public class CSourceConvertVisitor : ExprFunctor<CSymbol, Unit>
 {
     protected readonly Dictionary<BaseExpr, CSymbol> _exprMemo = new(ReferenceEqualityComparer.Instance);
 

diff --git a/modules/Nncase.Modules.NTT/CodeGen/CPU/KernelCSourceConvertVisitor.cs b/modules/Nncase.Modules.NTT/CodeGen/CPU/KernelCSourceConvertVisitor.cs
@@ -447,11 +447,6 @@ protected override CSymbol VisitCall(Call expr)
                 case TIR.NTT.Gather gather:
                     {
                         WriteWithProfiler($"gather({VisitBuffer(args[0], local: false).Name}, {VisitBuffer(args[1], local: true).Name}, {VisitBuffer(args[2], local: true).Name}, {gather.Axis}_dim);\n");
-                        if (args[0] is TIR.Buffer b && b.DistributedType?.AxisPolicies[gather.Axis] is SBPSplit s)
-                        {
-                            var reduceKind = "tar::reduce_kind::" + string.Join("_", Enumerable.Range(0, TargetOptions.HierarchyNames.Length).Select(i => (s.Axes.Contains(i) ? "r" : string.Empty) + TargetOptions.HierarchyNames[i]));
-                            WriteIndWithProfiler($"tac::tensor_reduce_sync<reduce_op::{ReduceOp.Sum.ToC()}, {reduceKind}>({VisitBuffer(args[2], local: true).Name}, {VisitBuffer(args[2], local: true).Name});\n");
-                        }
                     }
 
                     break;
@@ -536,8 +531,8 @@ protected override CSymbol VisitCall(Call expr)
                         {
                             // deprecated
                             var sbpPartial = (SBPPartial)grs.InType.AxisPolicies.Where(s => s is SBPPartial).Distinct().First();
-                            var reduceKind = "tar::reduce_kind::" + string.Join("_", grs.InType.AxisPolicies.Select((s, i) => (s is SBPPartial ? "r" : string.Empty) + TargetOptions.HierarchyNames[i]));
-                            WriteIndWithProfiler($"tac::tensor_reduce_sync<reduce_op::{sbpPartial.Op.ToC()}, {reduceKind}>({VisitBuffer(args[0], local: true).Name}, {VisitBuffer(args[1], local: true).Name});\n");
+                            var reduceKind = "tar::reduce_kind::" + string.Join("_", Enumerable.Range(0, TargetOptions.HierarchyNames.Length).Select(i => (sbpPartial.Axes.Contains(i) ? "r" : string.Empty) + TargetOptions.HierarchyNames[i]));
+                            WriteIndWithProfiler($"tac::tensor_reduce_sync<reduce_op::{sbpPartial.Op.ToC()}, {reduceKind}>({VisitBuffer(args[0], local: true).Name}, {VisitBuffer(args[1], local: false).Name});\n");
                         }
                         else
                         {

diff --git a/modules/Nncase.Modules.NTT/CodeGen/CPU/KernelUtility.cs b/modules/Nncase.Modules.NTT/CodeGen/CPU/KernelUtility.cs
@@ -35,7 +35,7 @@ public static string SBPToC(this SBP value)
     {
         if (value is SBPSplit s)
         {
-            return $"S<{string.Join(", ", s.Axes)}>()";
+            return $"S<{string.Join(", ", s.Axes)}>({new CSourceConvertVisitor().Visit(s.Granularity as BaseExpr ?? None.Default).Name})";
         }
         else
         {

diff --git a/modules/Nncase.Modules.NTT/CodeGen/CPU/Templates/topo_aware_runtime.cshtml b/modules/Nncase.Modules.NTT/CodeGen/CPU/Templates/topo_aware_runtime.cshtml
@@ -177,112 +177,149 @@ class tensor_reduce_sync_impl {
         // collect all tensors pointer for access tensor from other nodes.
         using TElem = typename TIn::element_type;
         using TOutBase = std::decay_t<TOut>;
+        static_assert(ShardedTensor<TOutBase>, "dest must be sharded tensor");
         constexpr size_t Rank = TIn::rank();
         constexpr auto group_hierarchy = group_hierarchy_getter<Kind>::group_hierarchy;
         auto cur_index = ntt::make_shape(@(cur_index));
         auto cur_index_g = index_global2group(cur_index);
         tar::src_ptr_tensor(cur_index) =
             reinterpret_cast<void *>(src.elements().data());
         tar::dest_ptr_tensor(cur_index) =
-            reinterpret_cast<void *>(dest.elements().data());
+            reinterpret_cast<void *>(dest.local().elements().data());
         reduce_group_sync();
 
         // according to the group size split the tensor.
         // todo should using better split strategy.
         constexpr auto group_size = ntt::fixed_dim_v<get_group_size()>;
-        const auto axis = [&] {
-            dim_t axis = -1;
-            loop<Rank>([&](auto i) {
-                if (axis == -1 && src.shape()[i] >= group_size) {
-                    axis = i;
-                }
-            });
-            if (axis == -1) {
-                axis = 0;
-            }
-            return axis;
-        }();
-
-        auto remain = src.shape()[axis] % (group_size);
-        auto frac = src.shape()[axis] / (group_size);
 
-        auto node_number_g = ntt::linear_offset(cur_index_g, group_hierarchy);
-
-        // reduce-scatter, communicate (group_size - 1) times
-        for (auto i = 0; i < group_size - 1; i++) 
-        {
-            auto new_shape = ntt::generate_shape<Rank>([&](auto j) {
-                if (j == axis) {
-                    return ntt::where(node_number_g == group_size - 1, frac + remain, frac);
-                } else {
-                    return (dim_t)src.shape()[j];
-                }
-            });
-            auto starts = ntt::generate_shape<Rank>([&](auto j) {
-                if (j == axis) {
-                    return node_number_g * frac;
-                } else {
-                    return (dim_t)0;
-                }
-            });
+        if (src.shape() != dest.local().shape()) {
+            using mesh_type = typename TOutBase::mesh_type;
+            const auto local_shard_index = mesh_type::local_index();
+            auto node_number_g = ntt::linear_offset(cur_index_g, group_hierarchy);
+            auto new_shape = dest.local().shape();
+            auto starts = dest.sharding().global_offset(dest.shape(), local_shard_index);
             auto viewed_src1_tensor = src.view(starts, new_shape);
-            auto viewed_dest_tensor = dest.view(starts, new_shape);
+            auto viewed_dest_tensor = dest.local();
 
-            auto next_index_g = ntt::unravel_index((node_number_g + i + 1) % group_size, group_hierarchy);
+            // reduce-scatter, communicate (group_size - 1) times
+            for (auto i = 0; i < group_size - 1; i++) 
+            {
+                auto next_index_g = ntt::unravel_index((node_number_g + i + 1) % group_size, group_hierarchy);
 
-            // keep the non-reduce axis invariant.
-            auto next_index = index_group2global(next_index_g, cur_index);
+                // keep the non-reduce axis invariant.
+                auto next_index = index_group2global(next_index_g, cur_index);
 
-            auto src2_tensor = ntt::make_tensor_view_from_address<TElem>(
-                (TElem *)tar::src_ptr_tensor(next_index), src.shape(),
-                src.strides());
-            auto viewed_src2_tensor = src2_tensor.view(starts, new_shape);
+                auto src2_tensor = ntt::make_tensor_view_from_address<TElem>(
+                    (TElem *)tar::src_ptr_tensor(next_index), src.shape(),
+                    src.strides());
+                auto viewed_src2_tensor = src2_tensor.view(starts, new_shape);
 
-            if (i == 0) {
-                reduce_impl(viewed_src1_tensor, viewed_src2_tensor,
-                            viewed_dest_tensor);
-            } else {
-                reduce_impl(viewed_dest_tensor, viewed_src2_tensor,
-                            viewed_dest_tensor);
+                if (i == 0) {
+                    reduce_impl(viewed_src1_tensor, viewed_src2_tensor,
+                                viewed_dest_tensor);
+                } else {
+                    reduce_impl(viewed_dest_tensor, viewed_src2_tensor,
+                                viewed_dest_tensor);
+                }
             }
-        }
-
-        reduce_group_sync();
 
-        // all gather
-        for (size_t i = 0; i < group_size - 1; i++) {
-            auto offset = (node_number_g + i + 1) % (group_size);
-            auto src_index_g = ntt::unravel_index(offset % group_size, group_hierarchy);
-            auto src_index = index_group2global(src_index_g, cur_index);
-
-            auto src_tensor = ntt::make_tensor_view_from_address<TElem>(
-                (TElem *)tar::dest_ptr_tensor(src_index), dest.shape(),
-                dest.strides());
-            auto starts = ntt::generate_shape<Rank>([&](auto j) {
-                if (j == axis) {
-                    return offset * frac;
-                } else {
-                    return (dim_t)0;
+            ntt::tensor_copy_wait<void>();
+            reduce_group_sync(ctx, group_target_value);
+        } else {
+            const auto axis = [&] {
+                dim_t axis = -1;
+                loop<Rank>([&](auto i) {
+                    if (axis == -1 && src.shape()[i] >= group_size) {
+                        axis = i;
+                    }
+                });
+                if (axis == -1) {
+                    axis = 0;
                 }
-            });
-            auto new_shape = ntt::generate_shape<Rank>([&](auto j) {
-                if (j == axis) {
-                    return ntt::where(offset == group_size - 1, frac + remain, frac);
+                return axis;
+            }();
+
+            auto remain = src.shape()[axis] % (group_size);
+            auto frac = src.shape()[axis] / (group_size);
+
+            auto node_number_g = ntt::linear_offset(cur_index_g, group_hierarchy);
+
+            // reduce-scatter, communicate (group_size - 1) times
+            for (auto i = 0; i < group_size - 1; i++) 
+            {
+                auto new_shape = ntt::generate_shape<Rank>([&](auto j) {
+                    if (j == axis) {
+                        return ntt::where(node_number_g == group_size - 1, frac + remain, frac);
+                    } else {
+                        return (dim_t)src.shape()[j];
+                    }
+                });
+                auto starts = ntt::generate_shape<Rank>([&](auto j) {
+                    if (j == axis) {
+                        return node_number_g * frac;
+                    } else {
+                        return (dim_t)0;
+                    }
+                });
+                auto viewed_src1_tensor = src.view(starts, new_shape);
+                auto viewed_dest_tensor = dest.local().view(starts, new_shape);
+
+                auto next_index_g = ntt::unravel_index((node_number_g + i + 1) % group_size, group_hierarchy);
+
+                // keep the non-reduce axis invariant.
+                auto next_index = index_group2global(next_index_g, cur_index);
+
+                auto src2_tensor = ntt::make_tensor_view_from_address<TElem>(
+                    (TElem *)tar::src_ptr_tensor(next_index), src.shape(),
+                    src.strides());
+                auto viewed_src2_tensor = src2_tensor.view(starts, new_shape);
+
+                if (i == 0) {
+                    reduce_impl(viewed_src1_tensor, viewed_src2_tensor,
+                                viewed_dest_tensor);
                 } else {
-                    return (dim_t)src.shape()[j];
+                    reduce_impl(viewed_dest_tensor, viewed_src2_tensor,
+                                viewed_dest_tensor);
                 }
-            });
-            auto viewed_src_tensor = src_tensor.view(starts, new_shape);
-            auto viewed_dest_tensor = dest.view(starts, new_shape);
-            ntt::tensor_copy_async(viewed_src_tensor, viewed_dest_tensor);
-        }
+            }
 
-        ntt::tensor_copy_wait<void>();
-        reduce_group_sync();
+            reduce_group_sync();
+
+            // all gather
+            for (size_t i = 0; i < group_size - 1; i++) {
+                auto offset = (node_number_g + i + 1) % (group_size);
+                auto src_index_g = ntt::unravel_index(offset % group_size, group_hierarchy);
+                auto src_index = index_group2global(src_index_g, cur_index);
+
+                auto src_tensor = ntt::make_tensor_view_from_address<TElem>(
+                    (TElem *)tar::dest_ptr_tensor(src_index), dest.local().shape(),
+                    dest.local().strides());
+                auto starts = ntt::generate_shape<Rank>([&](auto j) {
+                    if (j == axis) {
+                        return offset * frac;
+                    } else {
+                        return (dim_t)0;
+                    }
+                });
+                auto new_shape = ntt::generate_shape<Rank>([&](auto j) {
+                    if (j == axis) {
+                        return ntt::where(offset == group_size - 1, frac + remain, frac);
+                    } else {
+                        return (dim_t)src.shape()[j];
+                    }
+                });
+                auto viewed_src_tensor = src_tensor.view(starts, new_shape);
+                auto viewed_dest_tensor = dest.local().view(starts, new_shape);
+                ntt::tensor_copy_async(viewed_src_tensor, viewed_dest_tensor);
+            }
 
-        if (Op == ntt::reduce_op::mean) {
-            auto numerator = (element_or_scalar_t<TElem>)(size_t)group_size;
-            ntt::binary<ntt::ops::div>(dest, ntt::make_tensor_view_from_address(&numerator, ntt::fixed_shape_v<>), dest);
+            ntt::tensor_copy_wait<void>();
+            reduce_group_sync();
+
+            if (Op == ntt::reduce_op::mean) {
+                auto numerator = (element_or_scalar_t<TElem>)(size_t)group_size;
+                ntt::binary<ntt::ops::div>(dest.local(), ntt::make_tensor_view_from_address(&numerator, ntt::fixed_shape_v<>), dest.local());
+            }
         }
     }
 };

diff --git a/modules/Nncase.Modules.NTT/Evaluator/CustomOp/NTT/LayerNorm.cs b/modules/Nncase.Modules.NTT/Evaluator/CustomOp/NTT/LayerNorm.cs
@@ -2,6 +2,7 @@
 // Licensed under the Apache license. See LICENSE file in the project root for full license information.
 
 using System;
+using System.Diagnostics;
 using System.Linq;
 using System.Runtime.CompilerServices;
 using DryIoc.ImTools;
@@ -47,7 +48,7 @@ public IRType Visit(ITypeInferenceContext context, LayerNorm target)
         {
             return (input, scale, bias) switch
             {
-                (DistributedType a, DistributedType b, DistributedType c) => new DistributedType((TensorType)VisitTensorType(target, a.TensorType, b.TensorType, c.TensorType), target.OutSBPs, a.Placement),
+                (DistributedType a, DistributedType b, DistributedType c) => VisitDistributedType(target, a, b, c),
                 (TensorType a, TensorType b, TensorType c) => VisitTensorType(target, a, b, c),
                 _ => new InvalidType($"{input} {scale} {bias} not support"),
             };
@@ -68,17 +69,17 @@ private bool CheckCustomSBP(IRType input, IRType scale, IRType bias, LayerNorm l
     {
         if (input is DistributedType a && scale is DistributedType b && bias is DistributedType c)
         {
-            if (Enumerable.Range(0, a.TensorType.Shape.Rank).Any(i => a.AxisPolicies[i] != layerNorm.InSBPs[i]))
+            if (Enumerable.Range(0, a.TensorType.Shape.Rank).Any(i => !DistributedUtility.IsSamePolicy(a.AxisPolicies[i], layerNorm.InSBPs[i], false)))
             {
                 return false;
             }
 
-            if (Enumerable.Range(0, b.TensorType.Shape.Rank).Any(i => b.AxisPolicies[i] != layerNorm.ScaleSBPs[i]))
+            if (Enumerable.Range(0, b.TensorType.Shape.Rank).Any(i => !DistributedUtility.IsSamePolicy(b.AxisPolicies[i], layerNorm.ScaleSBPs[i], false)))
             {
                 return false;
             }
 
-            if (Enumerable.Range(0, c.TensorType.Shape.Rank).Any(i => c.AxisPolicies[i] != layerNorm.BiasSBPs[i]))
+            if (Enumerable.Range(0, c.TensorType.Shape.Rank).Any(i => !DistributedUtility.IsSamePolicy(c.AxisPolicies[i], layerNorm.BiasSBPs[i], false)))
             {
                 return false;
             }
@@ -114,4 +115,40 @@ private IRType VisitTensorType(LayerNorm target, TensorType input, TensorType sc
             return new TensorType(target.OutputDataType, input.Shape);
         }
     }
+
+    /// <summary>
+    /// Infers the distributed output type of the layer norm from its distributed operands.
+    /// </summary>
+    /// <param name="target">Layer norm op; supplies the vectorized axes.</param>
+    /// <param name="input">Distributed input type; its axis policies and placement seed the result.</param>
+    /// <param name="scale">Distributed scale type; only its tensor type is used.</param>
+    /// <param name="bias">Distributed bias type; only its tensor type is used.</param>
+    /// <returns>The distributed output type, or the tensor inference result when it is not a tensor type.</returns>
+    private IRType VisitDistributedType(LayerNorm target, DistributedType input, DistributedType scale, DistributedType bias)
+    {
+        // Hand back a non-tensor inference result instead of failing on a hard cast.
+        var inferred = VisitTensorType(target, input.TensorType, scale.TensorType, bias.TensorType);
+        if (inferred is not TensorType tensorType)
+        {
+            return inferred;
+        }
+
+        // Without any vectorized axis there is no split whose granularity needs rescaling.
+        var vectorizedAxis = target.VectorizedAxes.Any() ? target.VectorizedAxes[0] : -1;
+        var ndsbps = new SBP[tensorType.Shape.Rank];
+        for (var i = 0; i < ndsbps.Length; i++)
+        {
+            if (i == vectorizedAxis && input.AxisPolicies[i] is SBPSplit split)
+            {
+                // Rescale the granularity by input lanes / output lanes; a null granularity is kept as null.
+                ndsbps[i] = SBP.S(split.Axes, split.Granularity is null ? null : split.Granularity * ((VectorType)input.TensorType.DType).Lanes[0] / ((VectorType)tensorType.DType).Lanes[0]);
+            }
+            else
+            {
+                ndsbps[i] = input.AxisPolicies[i];
+            }
+        }
+
+        return new DistributedType(tensorType, ndsbps, input.Placement);
+    }
 }
diff --git a/modules/Nncase.Modules.NTT/Evaluator/CustomOp/NTT/Matmul.cs b/modules/Nncase.Modules.NTT/Evaluator/CustomOp/NTT/Matmul.cs
@@ -52,7 +52,7 @@ public IRType Visit(ITypeInferenceContext context, MatMul target)
         {
             return (lhs, rhs) switch
             {
-                (DistributedType a, DistributedType b) => new DistributedType((TensorType)VisitTensorType(target, a.TensorType, b.TensorType, true, dimInfo), target.OutSBPs, a.Placement),
+                (DistributedType a, DistributedType b) => VisitDistributedType(target, a, b, true, dimInfo),
                 (TensorType a, TensorType b) => VisitTensorType(target, a, b, true, dimInfo),
                 _ => new InvalidType($"{lhs} {rhs} not support"),
             };
@@ -78,12 +78,12 @@ private bool CheckCustomSBP(IRType lhs, IRType rhs, IRType extra, MatMul matmul)
 
         if (lhs is DistributedType a && rhs is DistributedType b)
         {
-            if (Enumerable.Range(0, a.TensorType.Shape.Rank).Any(i => a.AxisPolicies[i] != matmul.LhsSBPs[i]))
+            if (Enumerable.Range(0, a.TensorType.Shape.Rank).Any(i => !DistributedUtility.IsSamePolicy(a.AxisPolicies[i], matmul.LhsSBPs[i], false)))
             {
                 return false;
             }
 
-            if (Enumerable.Range(0, b.TensorType.Shape.Rank).Any(i => b.AxisPolicies[i] != matmul.RhsSBPs[i]))
+            if (Enumerable.Range(0, b.TensorType.Shape.Rank).Any(i => !DistributedUtility.IsSamePolicy(b.AxisPolicies[i], matmul.RhsSBPs[i], false)))
             {
                 return false;
             }
@@ -156,4 +156,40 @@ private IRType VisitTensorType(MatMul target, TensorType lhs, TensorType rhs, bo
 
         return new TensorType(dtype, front.Concat(end).ToArray());
     }
+
+    /// <summary>
+    /// Infers the distributed output type of the matmul from its distributed operands.
+    /// </summary>
+    /// <param name="target">MatMul op; supplies the transpose flags and the expected output SBPs.</param>
+    /// <param name="lhs">Distributed lhs type; provides the M policy and the placement.</param>
+    /// <param name="rhs">Distributed rhs type; provides the N policy.</param>
+    /// <param name="vectorizeK">Forwarded unchanged to tensor type inference.</param>
+    /// <param name="dimInfo">Dimension info; its Lm and Rn entries index the operand axis policies.</param>
+    /// <returns>The distributed output type, the tensor inference result when it is not a tensor type, or an invalid type when the derived policies differ from the expected output SBPs.</returns>
+    private IRType VisitDistributedType(MatMul target, DistributedType lhs, DistributedType rhs, bool vectorizeK, MatMulDimInfo dimInfo)
+    {
+        // Hand back a non-tensor inference result instead of failing on a hard cast.
+        var inferred = VisitTensorType(target, lhs.TensorType, rhs.TensorType, vectorizeK, dimInfo);
+        if (inferred is not TensorType tensorType)
+        {
+            return inferred;
+        }
+
+        // FIXME: support rank>=2, and only support vectorize N of output.
+        var policyN = rhs.AxisPolicies[dimInfo.Rn];
+        if (policyN is SBPSplit split && tensorType.DType is VectorType vectorType)
+        {
+            // Rescale the N granularity by the output lanes; skipped when the output dtype is not a vector.
+            policyN = SBP.S(split.Axes, split.Granularity is null ? null : split.Granularity / vectorType.Lanes[0]);
+        }
+
+        var policyM = lhs.AxisPolicies[dimInfo.Lm];
+        var ndsbps = (target.TransposeA || target.TransposeB) ? new[] { policyN, policyM } : new[] { policyM, policyN };
+        if (DistributedUtility.AreSamePolicies(ndsbps, target.OutSBPs, false))
+        {
+            return new DistributedType(tensorType, ndsbps, lhs.Placement);
+        }
+
+        return new InvalidType("Please Check SBP Scheme.");
+    }
 }
-Original file line number
+Diff line change
@@ Expand Up / @@ -35,7 +35,7 @@ public static string SBPToC(this SBP value) @@
         {
             if (value is SBPSplit s)
             {
-                return $"S<{string.Join(", ", s.Axes)}>()";
+                return $"S<{string.Join(", ", s.Axes)}>({new CSourceConvertVisitor().Visit(s.Granularity as BaseExpr ?? None.Default).Name})";
             }
             else
             {
@@ Expand Down @@