diff --git a/skills/competition/nineops-skills/.vscode/settings.json b/skills/competition/nineops-skills/.vscode/settings.json
new file mode 100644
index 0000000..9fed558
--- /dev/null
+++ b/skills/competition/nineops-skills/.vscode/settings.json
@@ -0,0 +1,58 @@
+{
+    "chat.advanced.cli.mcp.enabled": true,
+    "chat.cli.mcp.enabled": true,
+    "chat.mcp.access": "all",
+    "chat.tools.terminal.autoApprove": {
+        "/^bash\\b/": true,
+        "/^cat\\b/": true,
+        "/^cd\\b/": true,
+        "/^chmod\\b/": true,
+        "/^cp\\b/": true,
+        "/^curl\\b/": true,
+        "/^diff\\b/": true,
+        "/^echo\\b/": true,
+        "/^find\\b/": true,
+        "/^git\\b/": true,
+        "/^go\\b/": true,
+        "/^grep\\b/": true,
+        "/^head\\b/": true,
+        "/^ls\\b/": true,
+        "/^mkdir\\b/": true,
+        "/^mv\\b/": true,
+        "/^node\\b/": true,
+        "/^npm\\b/": true,
+        "/^pip/": true,
+        "/^printf\\b/": true,
+        "/^pwd\\b/": true,
+        "/^python/": true,
+        "/^rm\\b/": true,
+        "/^sed\\b/": true,
+        "/^sh\\b/": true,
+        "/^sort\\b/": true,
+        "/^tail\\b/": true,
+        "/^tee\\b/": true,
+        "/^test\\b/": true,
+        "/^touch\\b/": true,
+        "/^wc\\b/": true,
+        "/^wget\\b/": true,
+        "/^which\\b/": true
+    },
+    "chat.tools.terminal.blockDetectedFileWrites": false,
+    "chat.tools.terminal.ignoreDefaultAutoApproveRules": true,
+    "github.copilot.chat.additionalReadAccessPaths": [
+        "C:\\",
+        "D:\\",
+        "E:\\",
+        "F:\\"
+    ],
+    "github.copilot.enable": {
+        "*": false,
+        "markdown": false,
+        "plaintext": false
+    },
+    "github.copilot.nextEditSuggestions.enabled": false,
+    "github.copilot.nextEditSuggestions.fixes": false,
+    "python-envs.defaultEnvManager": "ms-python.python:conda",
+    "python-envs.defaultPackageManager": "ms-python.python:conda",
+    "python-envs.pythonProjects": []
+}
\ No newline at end of file
diff --git a/skills/competition/nineops-skills/.vscode/settings.json.copilot-hub-bak b/skills/competition/nineops-skills/.vscode/settings.json.copilot-hub-bak
new file mode 100644
index 0000000..c68c115
--- /dev/null
+++ b/skills/competition/nineops-skills/.vscode/settings.json.copilot-hub-bak
@@ -0,0 +1,5 @@
+{
+    "python-envs.defaultEnvManager": "ms-python.python:conda",
+    "python-envs.defaultPackageManager": "ms-python.python:conda",
+    "python-envs.pythonProjects": []
+}
\ No newline at end of file
diff --git a/skills/competition/nineops-skills/CLAUDE.md b/skills/competition/nineops-skills/CLAUDE.md
new file mode 100644
index 0000000..482c400
--- /dev/null
+++ b/skills/competition/nineops-skills/CLAUDE.md
@@ -0,0 +1,50 @@
+# Nineteethed DSL 算子开发 — Agent 快速参考
+
+> **工作流指引**：收到算子开发任务时，先通读 `skill/SKILL.md §0 工作流总览`，按「开发→测试→诊断→修复」四阶段执行。
+> 关键模板在 `skill/templates/`，故障排查参照 `skill/references/failure_diagnosis.md`。
+
+## 核心经验总结（来自 Add / ReLU / GELU 实现）
+
+### AST 跟踪陷阱（最重要的坑）
+application() 内的代码会被 AST 跟踪原样嵌入生成 Triton 代码，Triton 环境没有标准 Python 库。
+- **禁止** `math.*`、`torch.*`、`numpy.*` → 用字面量数值
+- **禁止**模块级变量引用（变量名被原样嵌入导致 NameError）
+- **禁止** `**` 运算符（Triton tensor 无 `__pow__`） → `x * x * x`
+- **禁止** `ntl.tanh`（不存在） → `(exp(t)-exp(-t))/(exp(t)+exp(-t))`
+- **允许** `ntl.*` 函数、字面量数值、四则运算
+
+### 非连续张量支持（关键修复）
+- **不要** `flatten()` → 破坏 strides，转置张量写入错位
+- **要** `tile(tuple(1 for _ in range(ndim-1)) + (block_size,))` → 保留 strides
+- `Tensor(ndim)` 的 ndim 必须与实际张量维度一致
+
+### Element-wise 通用 arrangement 模式
+```python
+def _element_wise_arrangement(*tensors, block_size):
+    ndim = max(tensor.ndim for tensor in tensors)
+    assert all(tensor.ndim == ndim or tensor.ndim == 0 for tensor in tensors)
+    tile_shape = tuple(1 for _ in range(ndim - 1)) + (block_size,)
+    return tuple(
+        tensor.tile(tile_shape) if tensor.ndim != 0 else tensor
+        for tensor in tensors
+    )
+```
+
+### GELU 实现要点
+- **近似版**: `0.5*x*(1+tanh(sqrt(2/pi)*(x+0.044715*x^3)))`
+  - `sqrt(2/pi)` = `0.7978845608028654`（字面量）
+  - `x^3` = `x * x * x`
+  - `tanh` = 手动用 `ntl.exp`
+  - 测试: `torch.nn.functional.gelu(x, approximate='tanh')`
+- **精确版**: `x * 0.5 * (1 + erf(x / sqrt(2)))`
+  - 使用 `ntl.erf`, `ntl.sqrt`
+  - 测试: `torch.nn.functional.gelu(x)`
+
+### 数据类型支持
+- fp32: atol=1e-5, rtol=1e-5
+- fp16: atol=1e-3, rtol=1e-3 (注意精度损失)
+- bf16: 类似 fp16
+
+### 广播操作
+- 通过 `expand_as` 创建 stride=0 视图
+- Triton 自动处理 HBM 广播
diff --git a/skills/competition/nineops-skills/README.md b/skills/competition/nineops-skills/README.md
new file mode 100644
index 0000000..4f8d3a3
--- /dev/null
+++ b/skills/competition/nineops-skills/README.md
@@ -0,0 +1,131 @@
+# .skill — ninetoothed DSL Agent Workspace
+
+> 本 `.skill` 工作区是 AI Agent 的 **技能包 (Skill Package)**，使 agent 能够高效实现、测试、基准分析和诊断基于 **ninetoothed DSL** 的 GPU 算子。所有文档、模板、脚本、示例均已内置，agent 可自主完成从实作到报告的全流程。
+
+## 概览
+
+| 目录 | 用途 |
+|------|------|
+| `references/` | DSL 模式、测试模式、Benchmark 模式、Repo 索引、AOT 指南、故障诊断 |
+| `scripts/` | 正确性测试、Benchmark、源码检查、日志收集的可执行脚本 |
+| `templates/` | Agent 任务报告、Benchmark 报告、故障诊断报告模板 |
+| `examples/` | 4 个完整示例项目（含源码 + benchmark） |
+| `tests/` | Agent 触发 prompt、自校验任务、期望输出参考 |
+
+## 快速开始
+
+### 1. 实现一个算子
+
+```bash
+# 1a. 参考 DSL 模式
+cat references/dsl_patterns.md
+
+# 1b. 参考已有示例（如 elementwise 加法）
+cat examples/elementwise_broadcast_add/run.py
+
+# 1c. 实现自己的 kernel
+```
+
+### 2. 运行正确性测试
+
+```bash
+scripts/run_correctness.sh examples/elementwise_broadcast_add
+```
+
+### 3. 运行 Benchmark
+
+```bash
+scripts/run_benchmark.sh examples/elementwise_broadcast_add
+```
+
+### 4. 查看生成源
+
+```bash
+scripts/inspect_generated_source.sh examples/elementwise_broadcast_add/run.py
+```
+
+### 5. 收集日志
+
+```bash
+python scripts/collect_task_log.py --dir . --output task_log.json
+```
+
+## 文件结构
+
+```
+.skill/
+├── README.md                        ← 本文档
+├── SKILL.md                         ← DSL 完整 API 参考
+├── references/
+│   ├── repo_index.md                ← ninetoothed 仓库结构索引
+│   ├── dsl_patterns.md              ← 7 种 DSL 实现模式
+│   ├── testing_patterns.md          ← 4 维度测试覆盖策略
+│   ├── benchmark_patterns.md        ← 8 元素 Benchmark 设计
+│   ├── generated_source_and_aot.md  ← Codegen 查看 + AOT 编译
+│   └── failure_diagnosis.md        ← 4 类故障诊断指南
+├── scripts/
+│   ├── validate_skill_package.py    ← 结构完整性检查
+│   ├── run_correctness.sh           ← 正确性测试运行器
+│   ├── run_benchmark.sh             ← Benchmark 运行器
+│   ├── inspect_generated_source.sh  ← 生成源码查看器
+│   └── collect_task_log.py          ← 任务日志收集器
+├── templates/
+│   ├── operator_task_report_template.md   ← 算子任务报告模板
+│   ├── benchmark_report_template.md       ← Benchmark 报告模板
+│   └── failure_diagnosis_template.md      ← 故障诊断模板
+├── examples/
+│   ├── elementwise_broadcast_add/   ← 加法 kernel (elementwise_1d)
+│   ├── reduction_softmax/           ← Softmax kernel (reduction_2d)
+│   ├── non_contiguous_stride_case/  ← 非连续 stride 测试
+│   └── performance_regression_case/ ← BLOCK_SIZE 退化诊断
+└── tests/
+    ├── trigger_prompts.md           ← Agent 触发 prompt
+    ├── selftest_tasks.md            ← 自我校验任务
+    └── expected_outputs.md          ← 期望输出参考
+```
+
+## Agent 工作流程
+
+当 agent 收到"实现一个 XX 算子"的请求时，典型工作流如下：
+
+```
+1. 理解需求 ──→ 打开 references/dsl_patterns.md，匹配模式
+                    │
+2. 查看模板 ──→ 打开 templates/operator_task_report_template.md
+                    │
+3. 参考示例 ──→ 查看 examples/ 下相同模式的实现
+                    │
+4. 实现代码 ──→ 编写 kernel.py + run.py + benchmark.py
+                    │
+5. 正确性测试 ──→ scripts/run_correctness.sh 验证
+                    │
+6. Benchmark  ──→ scripts/run_benchmark.sh 性能对比
+                    │
+7. 查看源码  ──→ scripts/inspect_generated_source.sh 检查
+                    │
+8. 故障诊断  ──→ (如遇错误) 参考 failure_diagnosis.md
+                    │
+9. 生成报告  ──→ 填写 operator_task_report_template 完成
+```
+
+## 环境要求
+
+- Python 3.10+
+- PyTorch 2.0+ (CUDA)
+- ninetoothed (git@github.com:QuantumIntelligence/ninetoothed.git)
+- NVIDIA GPU with CUDA support
+
+## 结构校验
+
+```bash
+python scripts/validate_skill_package.py
+```
+
+预期输出：
+```
+✅ .skill structure OK (所有 5 个目录和核心文件均存在)
+```
+
+## License
+
+Internal — Qiyuan Competition
diff --git a/skills/competition/nineops-skills/SKILL.md b/skills/competition/nineops-skills/SKILL.md
new file mode 100644
index 0000000..52afe64
--- /dev/null
+++ b/skills/competition/nineops-skills/SKILL.md
@@ -0,0 +1,1027 @@
+---
+name: skill
+description: 使用 ninetoothed DSL 开发 Triton 算子的完整指南，涵盖开发→测试→诊断→修复全流程。
+---
+
+---
+
+## 0. 工作流总览（开发 → 测试 → 诊断 → 修复）
+
+这是使用本 skill 包开发算子的完整生命周期。当收到开发算子的任务时，按以下四个阶段依次执行。
+
+---
+
+### Phase 1：开发算子
+
+**步骤**：
+1. **理解算子** — 分析算子的计算逻辑（element-wise / reduction / matmul / norm / attention 等）、输入输出关系、广播/归约操作
+2. **选择 DSL 模式** — 对照下文 §4 Arrangement 模式和 §5 Application 模式，确定对应模板
+3. **复用模板** — 从 `skill/templates/` 选择最接近的模板文件，复制到目标文件
+4. **编写 arrangement** — 根据模式定义数据布局（tile/expand/squeeze 等）
+5. **编写 application** — 使用 `ntl.*` API 编写块内计算逻辑（⚠️ 注意 AST 跟踪陷阱，见 §8）
+6. **声明 Tensor 与 Symbol** — `Tensor(ndim)` 的 ndim 与实际张量维度一致
+7. **创建 kernel** — 调用 `ninetoothed.make(arrangement, application, tensors)`
+8. **编写 torch 包装层** — 参考 §7 的模式，建议使用 flatten_wrapper（非连续安全）
+
+**参考材料**：
+- 模式对照：`skill/references/dsl_patterns.md`
+- 模板文件：`skill/templates/elementwise_1d.py`、`skill/templates/activation.py` 等
+
+---
+
+### Phase 2：测试算子
+
+**编写测试**：在算子所在文件中添加测试套件，覆盖以下场景：
+
+| 维度 | 场景 | 检查点 |
+|------|------|--------|
+| 基础正确性 | contiguous 张量，标准 shape | 与 `torch.*` 或 `F.*` 输出 allclose |
+| 半精度 | fp16 / bf16 输入 | atol=1e-3, rtol=1e-3 |
+| 广播 | expand_as 创建 stride=0 视图 | 正确广播 |
+| 非连续 | `.t()` 转置、`[::2]` 切片 | 数值正确，不 crash |
+
+**运行测试（Linux）**：
+
+```bash
+# Option A：直接运行（推荐开发阶段使用）
+python path/to/your_operator_file.py
+
+# Option B：使用 test 脚本
+bash skill/scripts/run_correctness.sh -f path/to/your_file.py
+
+# Option C：运行 skill 全部测试
+bash skill/scripts/run_correctness.sh
+```
+
+**参考材料**：
+- 测试模式：`skill/references/testing_patterns.md`
+- 验证脚本：`skill/scripts/validate_skill_package.py`
+
+---
+
+### Phase 3：诊断失败
+
+当测试报错或结果不正确时，按以下顺序排查：
+
+**1. 编译错误（Crash / CUDA Error / NameError）**
+
+常见症状与速查：
+
+| 报错信息 | 最可能原因 | 解决方案 |
+|----------|-----------|---------|
+| `NameError: name 'math' is not defined` | application 中引用了 `math.*` | 用字面量数值替代 |
+| `NameError: name 'XXX' is not defined` | 模块级变量被 AST 跟踪 | 将常量值内联到 application |
+| `AttributeError: __pow__` | 使用了 `**` 运算符 | 用 `x * x * x` 替代 |
+| `AttributeError: tanh` | 调用了 `ntl.tanh` | 用 `ntl.exp` 手动实现 |
+| `make()` 编译失败 | Tensor 声明/符号不匹配 | 检查 Tensor ndim 和 Symbol 传递 |
+
+**2. 正确性失败（数值不匹配）**
+
+- **检查 dtype** — 是否需要 `ntl.cast` 到 float32 计算？
+- **检查 BLOCK_SIZE** — 是否整除总元素数？不能整除时需要 mask
+- **检查非连续张量** — arrangement 是否用了 `flatten()`？→ 改为 preserve-ndim tile
+- **检查广播** — 广播维度是否正确 expand？
+
+**3. 诊断工具**
+
+```bash
+# 查看生成的 Triton 源码（用于诊断 AST 嵌入问题）
+bash skill/scripts/inspect_generated_source.sh your_operator_file.py
+
+# 查看日志
+python skill/scripts/collect_task_log.py --output diagnose_log/
+```
+
+**参考材料**：
+- 故障诊断指南：`skill/references/failure_diagnosis.md`
+- 诊断模板：`skill/templates/failure_diagnosis_template.md`
+- 生成源码检查：`skill/references/generated_source_and_aot.md`
+
+---
+
+### Phase 4：修复并回归
+
+1. 根据诊断结果修改代码
+2. **优先使用编辑工具** 进行定点修改（`replace_string_in_file` / `multi_replace_string_in_file`）
+3. 重新运行测试（回到 Phase 2）
+4. 确认全部测试通过后，填写算子任务报告：
+
+```bash
+# 参考模板记录实现过程
+cat skill/templates/operator_task_report_template.md
+```
+
+---
+
+### 全流程示例（以 ReLU 为例）
+
+```bash
+# 1. 开发 — 在 basic_operators.py 中实现 make_relu()
+#    参考 skill/templates/elementwise_1d.py 模板
+
+# 2. 测试 — 运行测试
+python basic_operators.py
+#    输出: ✅ ReLU 测试通过
+
+# 3. 诊断（失败时）
+#    报错: AssertionError: ReLU non-contiguous failed
+#    根因: arrangement 用 flatten() 破坏了 strides
+#    修复: tile_shape = (1,)*(ndim-1) + (block_size,)，使用 preserve-ndim tile
+
+# 4. 回归
+python basic_operators.py
+#    输出: 🎉 所有测试全部通过！
+```
+
+---
+
+## 1. 核心概念
+
+**注意**引入常用的九齿python包
+```python
+import ninetoothed
+import ninetoothed.language as ntl
+from ninetoothed import Symbol, Tensor, block_size
+
+```
+
+### 1.1 DSL 三层架构
+
+每个 ninetoothed 算子由三个部分组成：
+
+```
+┌─────────────────────────────────────────┐
+│   torch 包装层                           │
+│   (预处理/后处理、shape 调整、调用 kernel) │
+├─────────────────────────────────────────┤
+│   arrangement 函数                       │
+│   (数据布局、分块 tile、维度变换)          │
+├─────────────────────────────────────────┤
+│   application 函数                       │
+│   (逐块计算逻辑，使用 ntl 语言)            │
+└─────────────────────────────────────────┘
+```
+
+- **torch 包装层**：负责创建 output tensor、reshape/flatten、调用 `kernel()` 并传参
+- **arrangement**：描述每个 Tensor 如何被分成 tile 块，以及块间维度关系
+- **application**：定义每个 tile 内的计算逻辑（对标 Triton kernel 体）
+
+### 1.2 基本流程
+
+```python
+import ninetoothed
+from ninetoothed import Symbol, Tensor
+
+# 1. 定义符号（block 大小等）
+BLOCK_SIZE = Symbol("BLOCK_SIZE", constexpr=True)
+
+# 2. 定义数据布局
+def arrangement(input, output, BLOCK_SIZE=BLOCK_SIZE):
+    return input.tile((BLOCK_SIZE,)), output.tile((BLOCK_SIZE,))
+
+# 3. 定义计算逻辑
+def application(input, output):
+    output = input * 2  # noqa: F841
+
+# 4. 声明 tensor 元信息
+tensors = (Tensor(1), Tensor(1))
+
+# 5. 创建 kernel
+kernel = ninetoothed.make(arrangement, application, tensors)
+```
+
+---
+
+## 2. Symbol —— 符号参数系统
+
+### 2.1 符号类型
+
+```python
+from ninetoothed import Symbol, block_size
+
+# constexpr —— 编译时常量，在 make() 时确定
+BLOCK_SIZE = Symbol("BLOCK_SIZE", constexpr=True)
+
+# meta —— 元参数，Triton autotune 可自动搜索
+BLOCK_SIZE_M = block_size()                    # 等价于 Symbol("BLOCK_SIZE_M", meta=True)
+
+# constexpr + upper_bound —— 常量且有上界，用于窗口/卷积
+WINDOW_HEIGHT = Symbol("WINDOW_HEIGHT", constexpr=True, upper_bound=16)
+WINDOW_WIDTH  = Symbol("WINDOW_WIDTH", constexpr=True, upper_bound=16)
+
+# constexpr + upper_bound —— 用于 scale 等标量
+SCALE = Symbol("SCALE", constexpr=True, upper_bound=128)
+```
+
+### 2.2 符号传递方式
+
+```python
+# 方式1：在 kernel() 调用时传参（适用于 constexpr）
+kernel(input, other, output, BLOCK_SIZE=1024)
+
+# 方式2：由 autotune 自动搜索（适用于 meta/block_size）
+# 无需手动传值
+```
+
+### 2.3 符号命名约定
+
+| 符号名 | 用途 | 推荐类型 |
+|--------|------|----------|
+| `BLOCK_SIZE` | 1D 通用 tile 大小 | `constexpr=True` |
+| `BLOCK_SIZE_M` | 矩阵 M 维度 tile | `block_size()` |
+| `BLOCK_SIZE_N` | 矩阵 N 维度 tile | `block_size()` |
+| `BLOCK_SIZE_K` | 矩阵 K 维度 tile | `block_size()` |
+| `WINDOW_HEIGHT` | 池化/卷积窗口高度 | `constexpr=True, upper_bound=N` |
+| `WINDOW_WIDTH` | 池化/卷积窗口宽度 | `constexpr=True, upper_bound=N` |
+
+---
+
+## 3. Tensor —— 张量元信息声明
+
+### 3.1 Tensor 构造函数
+
+```python
+Tensor(ndim, other=, shape_options=)
+```
+
+| 参数 | 说明 |
+|------|------|
+| `ndim` | 张量维度数（int） |
+| `other=float("-inf")` | 边界填充值（用于 max 或 softmax 等归约操作） |
+| `shape_options=None / dict` | 形状选项，如 `{"constexpr": True}` 或 `{"constexpr": True, "upper_bound": 128}` |
+
+### 3.2 常见 Tensor 声明
+
+```python
+# 标量
+Tensor(0)                              # 0 维张量（eps, beta, alpha, scale）
+
+# 1D 张量
+Tensor(1)                              # 1 维，用于 element-wise
+
+# 2D 张量
+Tensor(2)                              # 2 维矩阵
+Tensor(2, other=float("-inf"))         # 2D + 负无穷边界（softmax）
+
+# 3D 张量
+Tensor(3)                              # 3D（bmm 等）
+
+# 4D 张量
+Tensor(4)                              # 4D（conv2d, rope, attention）
+Tensor(4, shape_options={"constexpr": True})  # 部分维度 constexpr
+
+# 批量 Tensor 声明
+tuple(Tensor(4, shape_options=shape_options) for _ in range(3))
+```
+
+### 3.3 Tensor 元组规范
+
+`tensors` 元组的顺序必须与 `arrangement` 和 `application` 的参数顺序**完全一致**。
+
+```python
+# add: 3 个 1D tensor
+tensors = tuple(Tensor(1) for _ in range(3))
+
+# softmax: 2D input (边界 -inf) + 2D output
+tensors = (Tensor(2, other=float("-inf")), Tensor(2))
+
+# mm: 3 个 2D tensor
+tensors = (Tensor(2), Tensor(2), Tensor(2))
+
+# fused_rms_norm: 两个 2D + 一个标量 + 一个 2D
+tensors = (Tensor(2), Tensor(2), Tensor(0), Tensor(2))
+```
+
+---
+
+## 4. Arrangement —— 数据布局模式大全
+
+### 4.1 基础 tile 模式
+
+#### 模式 A：简单 1D tile（add, silu, swiglu）
+
+所有参与张量沿最后一维均匀分块：
+
+```python
+def arrangement(input, other, output, BLOCK_SIZE=BLOCK_SIZE):
+    input_arranged  = input.tile((BLOCK_SIZE,))
+    other_arranged  = other.tile((BLOCK_SIZE,))
+    output_arranged = output.tile((BLOCK_SIZE,))
+    return input_arranged, other_arranged, output_arranged
+```
+
+#### 模式 A2：非连续安全的 Multi-ND tile（推荐用于 element-wise 算子）
+
+和模式 A 不同，本模式**不进行 flatten**，保留原始 strides，确保 `.T` / `.t()` 等非连续张量的读写正确：
+
+```python
+def _element_wise_arrangement(*tensors, block_size):
+    ndim = max(tensor.ndim for tensor in tensors)
+    assert all(tensor.ndim == ndim or tensor.ndim == 0 for tensor in tensors)
+    tile_shape = tuple(1 for _ in range(ndim - 1)) + (block_size,)
+    return tuple(
+        tensor.tile(tile_shape) if tensor.ndim != 0 else tensor
+        for tensor in tensors
+    )
+```
+
+| 区别 | 模式 A（flatten） | 模式 A2（非 flatten） |
+|------|-------------------|----------------------|
+| 张量声明 | `Tensor(1)` | `Tensor(ndim)`（与实际 ndim 一致） |
+| 非连续支持 | ❌ flatten 破坏 strides | ✅ 保留原始 strides |
+| 标量支持 | 需额外处理 | ✅ `tensor.ndim != 0` 自动跳过 |
+
+#### 模式 B：2D 行 tile（softmax, rms_norm）
+
+保留第一维（batch），分块第二维：
+
+```python
+def arrangement(input, output, BLOCK_SIZE=BLOCK_SIZE):
+    return input.tile((1, BLOCK_SIZE)), output.tile((1, BLOCK_SIZE))
+```
+
+#### 模式 C：2D 分块 Matmul（mm）
+
+核心技巧：先将 output 按 `(BLOCK_SIZE_M, BLOCK_SIZE_N)` 分块，再让 input 和 other 通过 `tile + expand + squeeze` 与其对齐。
+
+```python
+def arrangement(input, other, output,
+                BLOCK_SIZE_M=BLOCK_SIZE_M, BLOCK_SIZE_N=BLOCK_SIZE_N,
+                BLOCK_SIZE_K=BLOCK_SIZE_K):
+
+    output_arranged = output.tile((BLOCK_SIZE_M, BLOCK_SIZE_N))
+
+    # input: (M, K) -> (BLOCK_SIZE_M, BLOCK_SIZE_K) -> tile(1, -1) -> expand(-1, N_blocks)
+    input_arranged = input.tile((BLOCK_SIZE_M, BLOCK_SIZE_K))
+    input_arranged = input_arranged.tile((1, -1))
+    input_arranged = input_arranged.expand((-1, output_arranged.shape[1]))
+    input_arranged.dtype = input_arranged.dtype.squeeze(0)
+
+    # other: (K, N) -> (BLOCK_SIZE_K, BLOCK_SIZE_N) -> tile(-1, 1) -> expand(M_blocks, -1)
+    other_arranged = other.tile((BLOCK_SIZE_K, BLOCK_SIZE_N))
+    other_arranged = other_arranged.tile((-1, 1))
+    other_arranged = other_arranged.expand((output_arranged.shape[0], -1))
+    other_arranged.dtype = other_arranged.dtype.squeeze(1)
+
+    return input_arranged, other_arranged, output_arranged
+```
+
+#### 模式 D：3D 分块 Matmul（bmm）
+
+在 mm 前加一个 `tile((1, ...))` 保留 batch 维度：
+
+```python
+def arrangement(
+    input, other, output,
+    BLOCK_SIZE_M=BLOCK_SIZE_M, BLOCK_SIZE_N=BLOCK_SIZE_N,
+    BLOCK_SIZE_K=BLOCK_SIZE_K
+):
+    output_arranged = output.tile((1, BLOCK_SIZE_M, BLOCK_SIZE_N))
+    output_arranged.dtype = output_arranged.dtype.squeeze(0)
+
+    input_arranged = input.tile((1, BLOCK_SIZE_M, BLOCK_SIZE_K))
+    input_arranged = input_arranged.tile((1, 1, -1))
+    input_arranged = input_arranged.expand((-1, -1, output_arranged.shape[-1]))
+    input_arranged.dtype = input_arranged.dtype.squeeze((0, 1))
+    input_arranged.dtype.dtype = input_arranged.dtype.dtype.squeeze(0)
+
+    other_arranged = other.tile((1, BLOCK_SIZE_K, BLOCK_SIZE_N))
+    other_arranged = other_arranged.tile((1, -1, 1))
+    other_arranged = other_arranged.expand((-1, output_arranged.shape[-2], -1))
+    other_arranged.dtype = other_arranged.dtype.squeeze((0, 2))
+    other_arranged.dtype.dtype = other_arranged.dtype.dtype.squeeze(0)
+
+    return input_arranged, other_arranged, output_arranged
+```
+
+#### 模式 E：自定义 strides/dilation tile（RoPE）
+
+用于非连续内存访问，如 Rotary Position Embedding 中的交错/非交错模式：
+
+```python
+def arrangement(input, sin_table, cos_table, interleaved=True):
+    emb_dim = input.shape[-1]
+    tile_shape = (1, 1, 1, emb_dim // 2)
+
+    if interleaved:
+        strides = (-1, -1, -1, 1)
+        dilation = (1, 1, 1, 2)
+    else:
+        strides = None
+        dilation = None
+
+    input_arranged = input.tile(tile_shape, strides=strides, dilation=dilation)
+    input_arranged = input_arranged.tile((1, 1, 1, 2))
+    input_arranged.dtype = input_arranged.dtype.squeeze((0, 1, 2))
+    input_arranged.dtype.dtype = input_arranged.dtype.dtype.squeeze((0, 1, 2))
+
+    sin_table_arranged = sin_table.tile(tile_shape)
+    sin_table_arranged.dtype = sin_table_arranged.dtype.squeeze((0, 1, 2))
+
+    cos_table_arranged = cos_table.tile(tile_shape)
+    cos_table_arranged.dtype = cos_table_arranged.dtype.squeeze((0, 1, 2))
+
+    return input_arranged, sin_table_arranged, cos_table_arranged
+```
+
+#### 模式 F：窗口 + ravel + flatten（max_pool2d）
+
+先 tile 窗口，再 ravel+flatten 将窗口内元素展平，最后 tile 做 block：
+
+```python
+def arrangement(input, output):
+    input_arranged = input.tile((1, 1, WINDOW_HEIGHT, WINDOW_WIDTH))
+    input_arranged = input_arranged.ravel()
+    input_arranged = input_arranged.flatten(end_dim=4).flatten(start_dim=1)
+    input_arranged = input_arranged.tile((BLOCK_SIZE, -1))
+
+    output_arranged = output.tile((1, 1, 1, 1))
+    output_arranged = output_arranged.ravel()
+    output_arranged = output_arranged.flatten(end_dim=4).flatten(start_dim=1)
+    output_arranged = output_arranged.tile((BLOCK_SIZE, -1))
+    output_arranged.dtype = output_arranged.dtype.squeeze(1)
+
+    return input_arranged, output_arranged
+```
+
+#### 模式 G：复用作现有 arrangement（addmm）
+
+```python
+# addmm —— 复用 mm.arrangement，额外传 input/beta/alpha
+def arrangement(input, mat1, mat2, beta, alpha, output):
+    _, _, input_arranged = mm.arrangement(mat1, mat2, input)
+    mat1_arranged, mat2_arranged, output_arranged = mm.arrangement(mat1, mat2, output)
+    return input_arranged, mat1_arranged, mat2_arranged, beta, alpha, output_arranged
+```
+
+#### 模式 H：conv2d —— im2col 式 flatten + 复用 mm
+
+```python
+def arrangement(input, filter, output):
+    input_arranged = input.tile((1, *filter.shape[1:]), strides=(-1, -1, 1, 1))
+    input_arranged = input_arranged.squeeze(1)
+    input_arranged.dtype = input_arranged.dtype.squeeze(0)
+    input_arranged = input_arranged.ravel()
+    input_arranged = input_arranged.flatten(end_dim=3).flatten(start_dim=1)
+
+    filter_arranged = filter.flatten(start_dim=1)
+    filter_arranged = filter_arranged.permute((1, 0))
+
+    output_arranged = output.permute((0, 2, 3, 1)).flatten(end_dim=3)
+
+    return mm.arrangement(input_arranged, filter_arranged, output_arranged)
+```
+
+#### 模式 I：Attention —— online softmax
+
+```python
+def arrangement(
+    q, k, v, scale, o, BLOCK_SIZE_M=BLOCK_SIZE_M, BLOCK_SIZE_N=BLOCK_SIZE_N
+):
+    def arrange_q_or_o(input):
+        arranged = input.tile((1, 1, BLOCK_SIZE_M, -1))
+        arranged.dtype = arranged.dtype.squeeze((0, 1))
+        return arranged
+
+    def arrange_k_or_v(input):
+        arranged = input.tile((1, 1, BLOCK_SIZE_N, -1))
+        arranged = arranged.tile((1, 1, -1, -1))
+        arranged = arranged.expand((-1, -1, q_arranged.shape[-2], -1))
+        arranged.dtype = arranged.dtype.squeeze((0, 1, 3))
+        arranged.dtype.dtype = arranged.dtype.dtype.squeeze((0, 1))
+        return arranged
+
+    q_arranged = arrange_q_or_o(q)
+    return q_arranged, arrange_k_or_v(k), arrange_k_or_v(v), scale, arrange_q_or_o(o)
+```
+
+### 4.2 Arrangement 方法速查
+
+| 方法 | 作用 | 示例 |
+|------|------|------|
+| `.tile(shape)` | 按 shape 分块 | `tensor.tile((BLOCK_SIZE,))` |
+| `.tile(shape, strides=..., dilation=...)` | 自定义步幅/膨胀分块 | RoPE 交错访问 |
+| `.expand(shape)` | 广播扩展维度（类似 torch.expand） | mm 中 K 维对齐 |
+| `.squeeze(dim)` | 在 arrangement 的 dtype 上下文删除维度 | `dtype.squeeze(0)` |
+| `.ravel()` | 将连续内存 tile 展平为一维 | pool2d 窗口展平 |
+| `.flatten(start_dim, end_dim)` | 展平指定维度范围 | conv2d 的 im2col |
+| `.permute(axes)` | 重排维度顺序 | conv2d filter/output 变换 |
+
+### 4.3 `.dtype` 操作详解
+
+`arranged_tensor.dtype` 是一个代理对象，用来在符号层面描述数据类型和维度关系。
+
+```python
+# 删掉某个轴（该轴大小变为 1 时有效）
+arranged.dtype = arranged.dtype.squeeze(0)
+arranged.dtype = arranged.dtype.squeeze((0, 1))
+
+# 进一步操作深层 dtype（多重 tile/expand 后）
+arranged.dtype.dtype = arranged.dtype.dtype.squeeze(0)
+arranged.dtype.dtype = arranged.dtype.dtype.squeeze((0, 1, 2))
+```
+
+---
+
+## 5. Application —— 计算逻辑模式大全
+
+### 5.1 标量算术模式
+
+```python
+# add —— 逐元素加法
+def application(input, other, output):
+    output = input + other  # noqa: F841
+```
+
+### 5.2 激活函数模式
+
+```python
+# silu: x * sigmoid(x)
+def application(input, output):
+    input_loaded = input
+    output = input_loaded * ntl.sigmoid(ntl.cast(input_loaded, ntl.float32))  # noqa: F841
+
+# swiglu: a * (b * sigmoid(b))
+def application(a, b, c):
+    b_loaded = b
+    gate = b_loaded * ntl.sigmoid(ntl.cast(b_loaded, ntl.float32))
+    c = a * gate  # noqa: F841
+```
+
+# GELU approximate（tanh 近似，用 ntl.exp 手动实现 tanh）
+```python
+def application(input, output):
+    # tanh 近似: 0.5*x*(1+tanh(sqrt(2/pi)*(x+0.044715*x^3)))
+    # 所有数值必须用字面量！不能引用 math.pi 或模块级变量（AST 跟踪陷阱）
+    t = 0.7978845608028654 * (input + 0.044715 * input * input * input)
+    exp_t = ntl.exp(t)
+    exp_neg_t = ntl.exp(-t)
+    output = 0.5 * input * (1.0 + (exp_t - exp_neg_t) / (exp_t + exp_neg_t))  # noqa: F841
+
+# GELU exact（标准 erf 公式）
+```python
+def application(input, output):
+    output = input * 0.5 * (1.0 + ntl.erf(input / ntl.sqrt(2.0)))  # noqa: F841
+```
+
+**GELU 注意事项**：
+- `x ** 3` 不可用（Triton tensor 不支持 `__pow__`） → 用 `x * x * x`
+- `ntl.sqrt(2.0 / math.pi)` 不可用 → 用字面量 `0.7978845608028654`
+- `ntl.tanh` 不可用 → 用 `(exp(t)-exp(-t))/(exp(t)+exp(-t))`
+- 测试对比：`torch.nn.functional.gelu(x, approximate='tanh')`（近似版）和 `torch.nn.functional.gelu(x)`（精确版）
+
+### 5.3 归约模式
+
+```python
+# softmax: online softmax（行级归约）
+def application(input, output):
+    input_loaded = input
+    row_minus_max = input_loaded - ntl.max(input_loaded)
+    numerator = ntl.exp(row_minus_max)
+    denominator = ntl.sum(numerator)
+    output = numerator / denominator  # noqa: F841
+
+# max_pool2d: 窗口内 max
+def application(input, output):
+    output = ntl.max(input, axis=1)  # noqa: F841
+```
+
+### 5.4 矩阵乘模式
+
+```python
+# mm —— 累加 dot product
+def application(input, other, output):
+    accumulator = ntl.zeros(output.shape, dtype=ntl.float32)
+    for k in range(input.shape[0]):
+        accumulator += ntl.dot(input[k], other[k])
+    output = accumulator
+
+# bmm —— 直接复用 mm.application
+application = mm.application
+```
+
+### 5.5 归一化模式
+
+```python
+# rms_norm: x / sqrt(mean(x^2) + eps)
+def application(input, eps, output):
+    input_fp32 = ntl.cast(input, ntl.float32)
+    output = input_fp32 * ntl.rsqrt(  # noqa: F841
+        ntl.sum(input_fp32 * input_fp32) / input.shape[-1] + eps
+    )
+
+# fused_rms_norm: (x / sqrt(mean(x^2) + eps)) * w
+def application(x, w, eps, y):
+    x_fp32 = ntl.cast(x, ntl.float32)
+    y = x_fp32 * ntl.rsqrt(ntl.sum(x_fp32 * x_fp32) / x.shape[-1] + eps) * w  # noqa: F841
+```
+
+### 5.6 Attention 模式（Online Softmax Flash Attention）
+
+```python
+def application(q, k, v, scale, o):
+    # 变换 Q 使其与 K 的乘积可表示为 exp2 形式
+    q_loaded = (q * scale * 1.44269504089).to(q.dtype)  # log2(e)
+    acc = ntl.zeros((q.shape[-2], q.shape[-1]), dtype=ntl.float32)
+    l_i = ntl.full((q.shape[-2],), 1, dtype=ntl.float32)
+    m_i = ntl.full((q.shape[-2],), float("-inf"), dtype=ntl.float32)
+
+    for i in range(k.shape[0]):
+        qk = ntl.dot(q_loaded, ntl.trans(k[i]))
+        qk = ntl.where(k[i].offsets(-2) < k.source.shape[-2], qk, float("-inf"))
+        m_ij = ntl.maximum(m_i, ntl.max(qk, 1))
+        p = ntl.exp2(qk - m_ij[:, None])
+        l_ij = ntl.sum(p, 1)
+        alpha = ntl.exp2(m_i - m_ij)
+        acc = acc * alpha[:, None] + ntl.dot(p.to(v.dtype.dtype), v[i])
+        m_i = m_ij
+        l_i = l_i * alpha + l_ij
+
+    acc /= l_i[:, None]
+    o = acc.to(o.dtype)  # noqa: F841
+```
+
+### 5.7 RoPE 模式（索引级写）
+
+```python
+def application(input, sin_table, cos_table):
+    sin_table_loaded = sin_table
+    cos_table_loaded = cos_table
+    input_0 = input[0]
+    input_1 = input[1]
+    input[0] = input_0 * cos_table_loaded - input_1 * sin_table_loaded
+    input[1] = input_0 * sin_table_loaded + input_1 * cos_table_loaded
+```
+
+### 5.8 Composite 模式（复用现有 application）
+
+```python
+# addmm —— 先用 mm 算 matmul，再加 bias+scale
+def application(input, mat1, mat2, beta, alpha, output):
+    mm.application(mat1, mat2, output)
+    output = beta * input + alpha * output
+```
+
+### 5.9 application 注意事项
+
+- **末尾必须用 `# noqa: F841`** 标注赋值，因为 Triton 在汇编阶段才会实际使用变量
+- `x.shape`、`x.dtype` 在 application 中是符号表达式，不是具体数值/类型
+- `x.source` 可访问到原始未 tile 的张量元信息
+- `.offsets(dim)` 返回当前块在指定维度的起始偏移量
+- 所有运算都是**符号化**的，最终由 Triton JIT 编译为 GPU 代码
+
+---
+
+## 6. Kernel 创建与调用
+
+### 6.1 标准创建
+
+```python
+kernel = ninetoothed.make(arrangement, application, tensors)
+```
+
+### 6.2 多 kernel 分支（RoPE）
+
+```python
+interleaved_kernel = ninetoothed.make(
+    functools.partial(arrangement, interleaved=True), application, inputs
+)
+non_interleaved_kernel = ninetoothed.make(
+    functools.partial(arrangement, interleaved=False), application, inputs
+)
+
+def kernel(input, sin_table, cos_table, interleaved=True):
+    return (interleaved_kernel if interleaved else non_interleaved_kernel)(
+        input, sin_table, cos_table
+    )
+```
+
+### 6.3 调用 kernel
+
+```python
+# constexpr 符号作为 kwargs 传入
+kernel(input, other, output, BLOCK_SIZE=1024)
+
+# meta/block_size 符号无需传入，由 autotune 自动搜索
+kernel(mat1, mat2, output)
+```
+
+---
+
+## 7. Torch 包装层模式
+
+### 7.1 Element-wise（add, silu）
+
+```python
+def add(input, other):
+    output = torch.empty_like(input)
+    ops.ninetoothed.kernels.add.kernel(input, other, output, BLOCK_SIZE=1024)
+    return output
+```
+
+### 7.2 Flatten 后调用（silu, swiglu）
+
+```python
+def silu(input):
+    input_flat = input.flatten()
+    output_flat = torch.empty_like(input_flat)
+    ops.ninetoothed.kernels.silu.kernel(input_flat, output_flat, BLOCK_SIZE=1024)
+    return output_flat.view_as(input)
+```
+
+### 7.3 Matmul（mm, bmm, addmm）
+
+```python
+def mm(input, other):
+    output_shape = (input.shape[0], other.shape[1])
+    output = torch.empty(output_shape, dtype=input.dtype, device=input.device)
+    ops.ninetoothed.kernels.mm.kernel(input, other, output)
+    return output
+```
+
+### 7.4 Norm（rms_norm, fused_rms_norm）
+
+```python
+def fused_rms_norm(x, w, eps=None):
+    if eps is None:
+        eps = torch.finfo(x.dtype).eps()
+    x_2d = x.view(-1, x.shape[-1])
+    w_2d = w.expand_as(x_2d)
+    y_2d = torch.empty_like(x_2d)
+    ops.ninetoothed.kernels.fused_rms_norm.kernel(x_2d, w_2d, eps, y_2d, BLOCK_SIZE=x.shape[-1])
+    return y_2d.view(x.shape)
+```
+
+### 7.5 Conv2d
+
+```python
+def conv2d(input, filter):
+    n, _, h, w = input.shape
+    k, _, r, s = filter.shape
+    p, q = h - r + 1, w - s + 1
+    output = torch.empty((n, k, p, q), dtype=input.dtype, device=input.device)
+    ops.ninetoothed.kernels.conv2d.kernel(input, filter, output)
+    return output
+```
+
+### 7.6 Attention
+
+```python
+def scaled_dot_product_attention(q, k, v, scale=None):
+    if scale is None:
+        scale = 1 / math.sqrt(q.shape[-1])
+    o = torch.empty_like(q)
+    ops.ninetoothed.kernels.scaled_dot_product_attention.kernel(q, k, v, scale, o)
+    return o
+```
+
+### 7.7 Pooling
+
+```python
+def max_pool2d(input, window_shape):
+    n, c, h, w = input.shape
+    r, s = window_shape
+    p = math.ceil((h - r) / r + 1)
+    q = math.ceil((w - s) / s + 1)
+    output = torch.empty(n, c, p, q, dtype=input.dtype, device=input.device)
+    ops.ninetoothed.kernels.max_pool2d.kernel(input, output, WINDOW_HEIGHT=r, WINDOW_WIDTH=s)
+    return output
+```
+
+---
+
+## 8. AST 跟踪陷阱（Nineteethed DSL 特有）
+
+Nineteethed 使用 Python AST 跟踪来生成 Triton 代码。`application()` 函数中**出现的所有 Python 变量名都会被原样嵌入生成的 Triton 代码**，而 Triton 编译环境中没有标准 Python 模块。
+
+### 8.1 常见错误模式
+
+```python
+# ❌ 错误：math 模块在 Triton 中不存在
+def application(input, output):
+    t = ntl.sqrt(2.0 / math.pi) * input  # NameError: name 'math' is not defined
+
+# ❌ 错误：模块级变量也被嵌入
+_SQRT_2_OVER_PI = math.sqrt(2.0 / math.pi)
+def application(input, output):
+    t = _SQRT_2_OVER_PI * input  # NameError: name '_SQRT_2_OVER_PI' is not defined
+
+# ❌ 错误：__pow__ 在 Triton tensor 上不存在
+def application(input, output):
+    t = input ** 3  # AttributeError: 'tensor' object has no attribute '__pow__'
+
+# ❌ 错误：ntl.tanh 不存在
+def application(input, output):
+    output = ntl.tanh(input)  # AttributeError
+```
+
+### 8.2 正确做法
+
+```python
+# ✅ 正确：使用字面量数值，inline 所有常量
+def application(input, output):
+    t = 0.7978845608028654 * (input + 0.044715 * input * input * input)
+    exp_t = ntl.exp(t)
+    exp_neg_t = ntl.exp(-t)
+    output = 0.5 * input * (1.0 + (exp_t - exp_neg_t) / (exp_t + exp_neg_t))  # noqa: F841
+```
+
+### 8.3 规则总结
+
+| 规则 | 说明 |
+|------|------|
+| ✅ 允许 | 字面量数值（`1.0`, `0.5`）、`ntl.*` API、基础运算符（`+`, `-`, `*`, `/`） |
+| ❌ 禁止 | `math.*`、`torch.*`、模块级 Python 变量、`**` 运算符、NumPy 函数 |
+| 🔧 替代方案 | 字面量 inline、`x * x * x` 替代 `x ** 3`、`ntl.exp` 组合替代 `ntl.tanh` |
+
+---
+
+## 9. ntl 语言速查
+
+```python
+import ninetoothed.language as ntl
+```
+
+### 8.1 类型转换
+
+| API | 说明 |
+|-----|------|
+| `ntl.cast(x, ntl.float32)` | 显式类型转换 |
+| `x.to(new_dtype)` | 隐式类型转换 |
+| `ntl.float32` | float32 类型常量 |
+
+### 8.2 数学运算
+
+| API | 说明 |
+|-----|------|
+| `ntl.sigmoid(x)` | Sigmoid 激活函数 |
+| `ntl.exp(x)` | 自然指数 |
+| `ntl.exp2(x)` | 2 的幂 |
+| `ntl.rsqrt(x)` | 1/sqrt(x) |
+| `ntl.max(x)` | 最大值（单输入） |
+| `ntl.max(x, axis=N)` | 沿指定轴最大值 |
+| `ntl.maximum(a, b)` | 逐元素最大值（二元） |
+| `ntl.sum(x)` | 求和 |
+| `ntl.sum(x, axis=N)` | 沿轴求和 |
+| `ntl.dot(a, b)` | 矩阵乘法（二维块级） |
+| `ntl.trans(x)` | 转置 |
+| `ntl.where(cond, x, y)` | 条件选择 |
+
+### 8.3 初始化/创建
+
+| API | 说明 |
+|-----|------|
+| `ntl.zeros(shape, dtype)` | 全零（在 application 内初始化累加器） |
+| `ntl.full(shape, value, dtype)` | 填充指定值 |
+
+### 8.4 张量元信息
+
+| 属性 | 说明 |
+|------|------|
+| `x.shape` | tile 之后的局部形状 |
+| `x.dtype` | tile 之后的数据类型 |
+| `x.source` | 原始（未 tile）张量元信息 |
+| `x.offsets(dim)` | 当前块在 dim 维度的起始偏移 |
+
+---
+
+## 10. 常见模式速查
+
+### 10.1 1D Element-wise 模式
+
+```python
+BLOCK_SIZE = Symbol("BLOCK_SIZE", constexpr=True)
+def arrangement(input, output, BLOCK_SIZE=BLOCK_SIZE):
+    return input.tile((BLOCK_SIZE,)), output.tile((BLOCK_SIZE,))
+def application(input, output):
+    output = <FORMULA>  # noqa: F841
+tensors = (Tensor(1), Tensor(1))
+kernel = ninetoothed.make(arrangement, application, tensors)
+```
+
+### 10.2 2D 行归约模式
+
+```python
+BLOCK_SIZE = Symbol("BLOCK_SIZE", constexpr=True)
+def arrangement(input, output, BLOCK_SIZE=BLOCK_SIZE):
+    return input.tile((1, BLOCK_SIZE)), output.tile((1, BLOCK_SIZE))
+def application(input, output):
+    output = <REDUCTION>  # noqa: F841
+tensors = (Tensor(2, other=float("-inf")), Tensor(2))
+kernel = ninetoothed.make(arrangement, application, tensors)
+```
+
+### 10.3 Matmul 模式
+
+```python
+BLOCK_SIZE_M = block_size()
+BLOCK_SIZE_N = block_size()
+BLOCK_SIZE_K = block_size()
+# ... mm arrangement + 循环 dot
+tensors = (Tensor(2), Tensor(2), Tensor(2))
+kernel = ninetoothed.make(arrangement, application, tensors)
+```
+
+### 9.4 复用模式
+
+```python
+# 复用 application
+from somewhere import application as base_application
+tensors = ...
+kernel = ninetoothed.make(arrangement, base_application, tensors)
+
+# 复用 arrangement + application
+from somewhere import arrangement as base_arrangement, application as base_application
+tensors = ...
+kernel = ninetoothed.make(base_arrangement, base_application, tensors)
+```
+
+### 9.5 包装层 flatten 模式
+
+```python
+def op(input):
+    flat = input.flatten()
+    out_flat = torch.empty_like(flat)
+    kernel(flat, out_flat, BLOCK_SIZE=1024)
+    return out_flat.view_as(input)
+```
+
+### 9.6 标量参数传递
+
+```python
+# Tensor(0) 标量在 arrangement 中直接返回
+def arrangement(x, eps, y):
+    return x.tile((1, BLOCK_SIZE)), eps, y.tile((1, BLOCK_SIZE))
+
+# kernel 调用时直接传值
+kernel(input, eps_value, output, BLOCK_SIZE=1024)
+```
+
+---
+
+## 11. 完整的算子开发工作流
+
+### 步骤 1：创建 kernel 文件
+
+```
+ops/ninetoothed/kernels/my_op.py
+```
+
+### 步骤 2：实现 arrangement + application
+
+```python
+import ninetoothed
+import ninetoothed.language as ntl
+from ninetoothed import Symbol, Tensor
+
+BLOCK_SIZE = Symbol("BLOCK_SIZE", constexpr=True)
+
+def arrangement(input, output, BLOCK_SIZE=BLOCK_SIZE):
+    return input.tile((BLOCK_SIZE,)), output.tile((BLOCK_SIZE,))
+
+def application(input, output):
+    output = ntl.sigmoid(input)  # noqa: F841
+
+tensors = (Tensor(1), Tensor(1))
+kernel = ninetoothed.make(arrangement, application, tensors)
+```
+
+### 步骤 3：添加 torch 包装
+
+在 `ops/ninetoothed/torch.py` 中添加：
+
+```python
+def my_op(input):
+    flat = input.flatten()
+    out_flat = torch.empty_like(flat)
+    ops.ninetoothed.kernels.my_op.kernel(flat, out_flat, BLOCK_SIZE=1024)
+    return out_flat.view_as(input)
+```
+
+### 步骤 4：测试
+
+```python
+import torch
+from ops.ninetoothed.torch import my_op
+
+x = torch.randn(4, 128, device="cuda")
+result = my_op(x)
+expected = torch.sigmoid(x)
+assert torch.allclose(result, expected, atol=1e-5)
+print("PASS")
+```
+
+---
+
+## 12. 调试技巧
+
+1. **检查 tile 形状**：在 arrangement 中打印 `x.shape` 观察分块是否符合预期
+2. **dtype 操作链**：多层 `tile/expand` 后可能需要 `squeeze` 多次才能恢复正确的 dtype 结构
+3. **noqa: F841**：application 中每个赋值语句都需要标注，否则 Python 字节码分析会警告
+4. **`x.source`**：当需要引用原始张量信息时使用，例如 attention 中的 mask 边界检查
+5. **`offsets(dim)`**：用于在 application 中判断当前块是否越界（如 attention 中 `k[i].offsets(-2) < k.source.shape[-2]`）
+6. **BLOCK_SIZE 选择策略**：
+   - element-wise：256~1024
+   - matmul：M/N 64~128，K 32~64 （通常由 autotune 自动搜索）
+   - softmax/reduction：取 `input.shape[-1]` 作为 BLOCK_SIZE（覆盖整行）
+7. **测试时注意非连续张量**：如果包装层没有显式 `.contiguous()`，需确保 arrangement 正确处理 stride
diff --git a/skills/competition/nineops-skills/examples/elementwise_broadcast_add/README.md b/skills/competition/nineops-skills/examples/elementwise_broadcast_add/README.md
new file mode 100644
index 0000000..8332393
--- /dev/null
+++ b/skills/competition/nineops-skills/examples/elementwise_broadcast_add/README.md
@@ -0,0 +1,33 @@
+# Element-wise Broadcast Add — 示例场景
+
+## 目标
+
+实现一个支持广播的 element-wise add kernel，作为 `elementwise_1d` 模式的展示。
+
+## 任务描述
+
+实现 `ninetoothed_add` 函数，接受两个 tensor，支持广播语义，使用 `elementwise_1d` 模板。
+
+## 关键点
+
+1. **Broadcast** — 第二个输入可以比第一个输入小，需要自动 broadcast
+2. **Mask 处理** — 边界处理
+3. **BLOCK_SIZE** — 使用 1024 或 autotune
+
+## 运行
+
+```bash
+python examples/elementwise_broadcast_add/run.py        # 测试正确性
+python examples/elementwise_broadcast_add/benchmark.py  # 基准测试
+```
+
+## 预期结果
+
+- 所有 correctness 测试通过（fp32/fp16, 各种 shape, 各种 broadcast 场景）
+- Benchmark 至少达到 PyTorch CUDA 80% 的性能
+
+## Torture Test 提示
+
+- scalar broadcast: `y.shape=(1,)` with `x.shape=(N,)`
+- strided broadcast: `x.shape=(128, 64)`, `y.shape=(64,)` broadcasting across dim=-1
+- non-contiguous: `y = y.as_strided(...)` with non-standard strides
diff --git a/skills/competition/nineops-skills/examples/elementwise_broadcast_add/benchmark.py b/skills/competition/nineops-skills/examples/elementwise_broadcast_add/benchmark.py
new file mode 100644
index 0000000..900a7cf
--- /dev/null
+++ b/skills/competition/nineops-skills/examples/elementwise_broadcast_add/benchmark.py
@@ -0,0 +1,132 @@
+"""
+elementwise_broadcast_add/benchmark.py
+Broadcast add kernel 的 benchmark。
+
+对照要素:
+  1. 算子: element-wise add with broadcast
+  2. 输入规模: 多种 shape 和 dtype 组合
+  3. 硬件: 自动检测
+  4. 布局: BLOCK_SIZE=1024
+  5. Baseline: torch.add (CUDA)
+  6. Ninetoothed: 本实现
+  7. 实现差异: tile 布局选择
+  8. Fallback: N/A
+"""
+
+import torch
+from time import perf_counter
+
+# 尝试导入 kernel，失败则 fallback 到 demo
+try:
+    from run import make_broadcast_add_elementwise_1d
+except ImportError:
+    import sys
+    sys.path.insert(0, ".")
+    from run import make_broadcast_add_elementwise_1d
+
+
+def benchmark(shape_a, shape_b, dtype, label, warmup=10, repeats=100):
+    """Benchmark kernel vs torch add."""
+    x = torch.randn(shape_a, device="cuda", dtype=dtype)
+    y = torch.randn(shape_b, device="cuda", dtype=dtype)
+    out = torch.empty(shape_a if len(shape_a) >= len(shape_b) else shape_b,
+                      device="cuda", dtype=dtype)
+    kernel = make_broadcast_add_elementwise_1d()
+    blk = 1024 if max(out.shape) <= 1024 else 2048
+
+    # 预热
+    for _ in range(warmup):
+        kernel(x, y, out, BLOCK_SIZE=blk)
+        x + y
+    torch.cuda.synchronize()
+
+    # 测量 kernel
+    start = perf_counter()
+    for _ in range(repeats):
+        kernel(x, y, out, BLOCK_SIZE=blk)
+    torch.cuda.synchronize()
+    kernel_ms = (perf_counter() - start) / repeats * 1000
+
+    # 测量 torch
+    start = perf_counter()
+    for _ in range(repeats):
+        x + y
+    torch.cuda.synchronize()
+    torch_ms = (perf_counter() - start) / repeats * 1000
+
+    speedup = torch_ms / kernel_ms if kernel_ms > 0 else 0
+
+    print(f"| {label:35s} | {kernel_ms:8.3f} | {torch_ms:8.3f} | {speedup:5.2f}x |")
+
+    return {"label": label, "kernel_ms": kernel_ms, "torch_ms": torch_ms, "speedup": speedup}
+
+
+def main():
+    print("=" * 75)
+    print("Broadcast Add Benchmark")
+    print("=" * 75)
+    gpu_name = torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU"
+    print(f"GPU: {gpu_name}")
+    print(f"CUDA: {torch.version.cuda}")
+    print()
+    print(f"| {'Config':35s} | {'本实现(ms)':>8s} | {'PyTorch(ms)':>8s} | {'Speedup':>6s} |")
+    print(f"|{'-'*35}|{'-'*10}|{'-'*10}|{'-'*8}|")
+
+    results = []
+
+    # FP32, various sizes
+    sizes = [(1024,), (4096,), (65536,), (131072,)]
+    for s in sizes:
+        results.append(benchmark(s, s, torch.float32, f"({s[0]},) fp32"))
+
+    # FP16
+    results.append(benchmark((4096,), (4096,), torch.float16, "(4096,) fp16"))
+    results.append(benchmark((65536,), (65536,), torch.float16, "(65536,) fp16"))
+
+    # BF16 (if available)
+    if hasattr(torch, 'bfloat16'):
+        try:
+            results.append(benchmark((4096,), (4096,), torch.bfloat16, "(4096,) bf16"))
+        except Exception:
+            print(f"| {'(4096,) bf16':35s} | {'N/A':>8s} | {'N/A':>8s} | {'N/A':>5s} |")
+
+    # 广播场景
+    results.append(benchmark((4096,), (1,), torch.float32, "(4096,) + (1,) scalar"))
+    results.append(benchmark((256, 768), (768,), torch.float32, "(256,768) + (768,) vec"))
+    results.append(benchmark((4, 128, 256), (256,), torch.float32, "(4,128,256) + (256,)"))
+    results.append(benchmark((4, 128, 256), (1,), torch.float32, "(4,128,256) + (1,)"))
+
+    # 非连续场景
+    x = torch.randn(768, 256, device="cuda")
+    y = torch.randn(256, device="cuda")
+    # Transposed
+    x_t = x.t().contiguous().t()  # force non-contiguous
+    kernel = make_broadcast_add_elementwise_1d()
+    out = torch.empty(256, 768, device="cuda")
+
+    warmup, repeats = 10, 100
+    for _ in range(warmup):
+        kernel(x_t, y, out, BLOCK_SIZE=768)
+        x_t + y
+    torch.cuda.synchronize()
+
+    start = perf_counter()
+    for _ in range(repeats):
+        kernel(x_t, y, out, BLOCK_SIZE=768)
+    torch.cuda.synchronize()
+    k_ms = (perf_counter() - start) / repeats * 1000
+
+    start = perf_counter()
+    for _ in range(repeats):
+        x_t + y
+    torch.cuda.synchronize()
+    t_ms = (perf_counter() - start) / repeats * 1000
+
+    print(f"| {'(256,768).T + (768,)':35s} | {k_ms:8.3f} | {t_ms:8.3f} | {t_ms/k_ms:5.2f}x |")
+
+    print()
+    print("Done.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/skills/competition/nineops-skills/examples/elementwise_broadcast_add/run.py b/skills/competition/nineops-skills/examples/elementwise_broadcast_add/run.py
new file mode 100644
index 0000000..a351a83
--- /dev/null
+++ b/skills/competition/nineops-skills/examples/elementwise_broadcast_add/run.py
@@ -0,0 +1,134 @@
+"""
+elementwise_broadcast_add/run.py
+Broadcast add kernel: elementwise_1d 模式的标准实现。
+
+测试覆盖:
+  - fp32/fp16/bf16
+  - contiguous / non-contiguous (strided)
+  - broadcast scenarios (scalar, vector, 3D)
+  - shape 不能整除 BLOCK_SIZE 的边界情况
+"""
+
+import torch
+import ninetoothed
+import ninetoothed.language as ntl
+from ninetoothed import Symbol, Tensor
+
+# ── 方法 A: 使用 elementwise_1d 模板 ──────────────────────────────
+
+def make_broadcast_add_elementwise_1d():
+    """
+    Elementwise-1D 模式: 所有 tensor 沿最后一维做 tile((BLOCK_SIZE,))。
+    """
+    BLOCK_SIZE = Symbol("BLOCK_SIZE", constexpr=True)
+
+    def arrangement(input, other, output, BLOCK_SIZE=BLOCK_SIZE):
+        return (
+            input.tile((BLOCK_SIZE,)),
+            other.tile((BLOCK_SIZE,)),
+            output.tile((BLOCK_SIZE,)),
+        )
+
+    def application(input, other, output):
+        output = input + other  # noqa: F841
+
+    return ninetoothed.make(
+        arrangement,
+        application,
+        tensors=(Tensor(1), Tensor(1), Tensor(1)),
+    )
+
+# ── 方法 B: 手动 broadcast 展开（用于理解和对比）───────────────────
+
+def make_broadcast_add_manual():
+    BLOCK_SIZE = Symbol("BLOCK_SIZE", constexpr=True)
+
+    def arrangement(input, other, output, BLOCK_SIZE=BLOCK_SIZE):
+        return (
+            input.tile((BLOCK_SIZE,)),
+            other.tile((BLOCK_SIZE,)),
+            output.tile((BLOCK_SIZE,)),
+        )
+
+    def application(input, other, output):
+        output = input + other  # noqa: F841
+
+    return ninetoothed.make(
+        arrangement,
+        application,
+        tensors=(Tensor(1), Tensor(1), Tensor(1)),
+    )
+
+
+# ── 使用示例 ──────────────────────────────────────────────────────
+
+def demo():
+    print("=" * 50)
+    print("Broadcast Add — Elementwise 1D 示例")
+    print("=" * 50)
+
+    # 创建 kernel
+    kernel = make_broadcast_add_elementwise_1d()
+    print("✅ Kernel created")
+
+    # 测试 1: 基本加法
+    x = torch.randn(4096, device="cuda")
+    y = torch.randn(4096, device="cuda")
+    out = torch.empty(4096, device="cuda")
+    kernel(x, y, out, BLOCK_SIZE=1024)
+    expected = x + y
+    assert torch.allclose(out, expected, atol=1e-5, rtol=1e-5), "Basic add failed"
+    print("✅ Basic add: contiguous, same shape — PASS")
+
+    # 测试 2: Scalar broadcast
+    y_scalar = torch.randn(1, device="cuda")
+    out = torch.empty(4096, device="cuda")
+    kernel(x, y_scalar, out, BLOCK_SIZE=1024)
+    expected = x + y_scalar
+    assert torch.allclose(out, expected, atol=1e-5, rtol=1e-5), "Scalar broadcast failed"
+    print("✅ Scalar broadcast: (4096,) + (1,) — PASS")
+
+    # 测试 3: Vector broadcast (2D + 1D)
+    x2d = torch.randn(128, 256, device="cuda")
+    y1d = torch.randn(256, device="cuda")
+    kernel_2d = make_broadcast_add_elementwise_1d()
+    out = torch.empty(128, 256, device="cuda")
+    kernel_2d(x2d, y1d, out, BLOCK_SIZE=256)
+    expected = x2d + y1d
+    assert torch.allclose(out, expected, atol=1e-5, rtol=1e-5), "2D+1D broadcast failed"
+    print("✅ 2D+1D broadcast: (128, 256) + (256,) — PASS")
+
+    # 测试 4: Non-contiguous (transposed)
+    x_t = torch.randn(256, 128, device="cuda").t()  # shape (128, 256)
+    y_t = torch.randn(128, device="cuda")
+    kernel_nc = make_broadcast_add_elementwise_1d()
+    out = torch.empty(128, 256, device="cuda")
+    kernel_nc(x_t, y_t, out, BLOCK_SIZE=256)
+    expected = x_t + y_t
+    assert torch.allclose(out, expected, atol=1e-5, rtol=1e-5), "Non-contiguous broadcast failed"
+    print("✅ Non-contiguous: (128, 256).T + (128,) — PASS")
+
+    # 测试 5: 边界情况 — shape 不能被 BLOCK_SIZE 整除
+    x_small = torch.randn(100, device="cuda")
+    y_small = torch.randn(1, device="cuda")
+    out = torch.empty(100, device="cuda")
+    kernel(x_small, y_small, out, BLOCK_SIZE=128)
+    expected = x_small + y_small
+    assert torch.allclose(out, expected, atol=1e-5, rtol=1e-5), "Uneven shape failed"
+    print("✅ Uneven shape: (100,) + (1,) — PASS")
+
+    # 测试 6: FP16
+    x_fp16 = torch.randn(4096, device="cuda", dtype=torch.float16)
+    y_fp16 = torch.randn(4096, device="cuda", dtype=torch.float16)
+    out = torch.empty(4096, device="cuda", dtype=torch.float16)
+    kernel(x_fp16, y_fp16, out, BLOCK_SIZE=1024)
+    expected = x_fp16 + y_fp16
+    assert torch.allclose(out, expected, atol=1e-3, rtol=1e-3), "FP16 failed"
+    print("✅ FP16: contiguous — PASS")
+
+    print()
+    print("🎉 所有测试通过！")
+
+
+if __name__ == "__main__":
+    demo()
diff --git a/skills/competition/nineops-skills/examples/non_contiguous_stride_case/README.md b/skills/competition/nineops-skills/examples/non_contiguous_stride_case/README.md
new file mode 100644
index 0000000..5e3c927
--- /dev/null
+++ b/skills/competition/nineops-skills/examples/non_contiguous_stride_case/README.md
@@ -0,0 +1,42 @@
+# Non-Contiguous Stride Case — 示例场景
+
+## 目标
+
+测试和验证 ninetoothed 在 non-contiguous 输入下的行为，作为 "stride 正确处理" 的展示。
+
+## 任务描述
+
+ninotoothed 在 non-contiguous tensor 上是否能正确计算 stride 和偏移？本场景通过 add kernel 测试多种 strided 模式。
+
+## 覆盖的 stride 变体
+
+| 变体 | 创建方式 | 预期 |
+|------|----------|------|
+| contiguous | `torch.randn(M, N)` | ✅ 基准 |
+| transposed | `.t()` | ✅ stride 交换 |
+| sliced | `x[::2, :]` | ✅ offset 步进 |
+| view | `.view()` | ✅ 连续视图 |
+| expanded | `.expand()` | ✅ 广播扩展 |
+| permuted | `.permute(1, 0)` | ✅ 维度重排 |
+| as_strided | `.as_strided(...)` | ✅ 自定义 stride |
+
+## 核心问题
+
+对于 non-contiguous tensor，ninetoothed 的 underlying ptr 计算是否为：
+
+```
+ptr + row * stride_row + col * stride_col
+```
+
+而不是：
+
+```
+ptr + row * cols + col   // 仅用于 contiguous
+```
+
+## 运行
+
+```bash
+python examples/non_contiguous_stride_case/run.py
+python examples/non_contiguous_stride_case/benchmark.py
+```
diff --git a/skills/competition/nineops-skills/examples/non_contiguous_stride_case/benchmark.py b/skills/competition/nineops-skills/examples/non_contiguous_stride_case/benchmark.py
new file mode 100644
index 0000000..c36848f
--- /dev/null
+++ b/skills/competition/nineops-skills/examples/non_contiguous_stride_case/benchmark.py
@@ -0,0 +1,78 @@
+"""
+non_contiguous_stride_case/benchmark.py
+Non-contiguous tensor 的 benchmark: 对比 contiguous vs strided 加法的性能差异。
+"""
+
+import torch
+from time import perf_counter
+
+try:
+    from run import make_add_kernel, make_2d_add_kernel
+except ImportError:
+    import sys
+    sys.path.insert(0, ".")
+    from run import make_add_kernel, make_2d_add_kernel
+
+
+def benchmark(name, x, y=None, warmup=10, repeats=100):
+    if y is None:
+        y = torch.randn(x.shape[-1], device="cuda")
+    kernel = make_add_kernel() if x.ndim == 1 else make_2d_add_kernel()
+    out = torch.empty_like(x)
+    BLOCK_SIZE = 1024
+
+    for _ in range(warmup):
+        kernel(x, y, out, BLOCK_SIZE=BLOCK_SIZE)
+        x + y
+    torch.cuda.synchronize()
+
+    start = perf_counter()
+    for _ in range(repeats):
+        kernel(x, y, out, BLOCK_SIZE=BLOCK_SIZE)
+    torch.cuda.synchronize()
+    kernel_ms = (perf_counter() - start) / repeats * 1000
+
+    start = perf_counter()
+    for _ in range(repeats):
+        x + y
+    torch.cuda.synchronize()
+    torch_ms = (perf_counter() - start) / repeats * 1000
+
+    speedup = torch_ms / kernel_ms if kernel_ms > 0 else 0
+    print(f"| {name:35s} | {kernel_ms:8.3f} | {torch_ms:8.3f} | {speedup:5.2f}x |")
+
+    return {"name": name, "kernel_ms": kernel_ms, "torch_ms": torch_ms, "speedup": speedup}
+
+
+def main():
+    print("=" * 75)
+    print("Non-Contiguous Add Benchmark")
+    print("=" * 75)
+    gpu = torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU"
+    print(f"GPU: {gpu} | CUDA: {torch.version.cuda}")
+    print()
+    print(f"| {'Config':35s} | {'本实现(ms)':>8s} | {'PyTorch(ms)':>8s} | {'Speedup':>6s} |")
+    print(f"|{'-'*35}|{'-'*10}|{'-'*10}|{'-'*8}|")
+
+    M, N = 256, 1024
+
+    baseline = torch.randn(M, N, device="cuda")
+    y = torch.randn(N, device="cuda")
+
+    benchmark(f"contiguous ({M}, {N})", baseline, y)
+    benchmark(f"transposed ({N}, {M}).t()", torch.randn(N, M, device="cuda").t(), y)
+    benchmark(f"sliced rows [::2]", torch.randn(M*2, N, device="cuda")[::2, :], y)
+    benchmark(f"sliced cols [::3]", torch.randn(M, N*3, device="cuda")[:, ::3], y)
+    benchmark(f"both sliced", torch.randn(M*2, N*3, device="cuda")[::2, ::3], y)
+
+    # 广播场景
+    y_scalar = torch.randn(1, device="cuda")
+    benchmark(f"contiguous + scalar (1,)", baseline, y_scalar)
+    benchmark(f"transposed + scalar (1,)", torch.randn(N, M, device="cuda").t(), y_scalar)
+
+    print()
+    print("Done.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/skills/competition/nineops-skills/examples/non_contiguous_stride_case/run.py b/skills/competition/nineops-skills/examples/non_contiguous_stride_case/run.py
new file mode 100644
index 0000000..735afba
--- /dev/null
+++ b/skills/competition/nineops-skills/examples/non_contiguous_stride_case/run.py
@@ -0,0 +1,161 @@
+"""
+non_contiguous_stride_case/run.py
+测试 ninetoothed 在 non-contiguous tensor 上的 stride 处理。
+
+使用 add kernel（最简单的 elementwise）来聚焦 stride 行为测试。
+"""
+
+import torch
+import ninetoothed
+import ninetoothed.language as ntl
+from ninetoothed import Symbol, Tensor
+
+
+def make_add_kernel():
+    """简单的 elementwise add kernel，用于测试 stride。"""
+    BLOCK_SIZE = Symbol("BLOCK_SIZE", constexpr=True)
+
+    def arrangement(input, other, output, BLOCK_SIZE=BLOCK_SIZE):
+        return (
+            input.tile((BLOCK_SIZE,)),
+            other.tile((BLOCK_SIZE,)),
+            output.tile((BLOCK_SIZE,)),
+        )
+
+    def application(input, other, output):
+        output = input + other  # noqa: F841
+
+    return ninetoothed.make(
+        arrangement,
+        application,
+        tensors=(Tensor(1), Tensor(1), Tensor(1)),
+    )
+
+
+def make_2d_add_kernel():
+    """2D version, tile per row."""
+    BLOCK_SIZE = Symbol("BLOCK_SIZE", constexpr=True)
+
+    def arrangement(input, other, output, BLOCK_SIZE=BLOCK_SIZE):
+        return (
+            input.tile((1, BLOCK_SIZE)),
+            other.tile((1, BLOCK_SIZE)),
+            output.tile((1, BLOCK_SIZE)),
+        )
+
+    def application(input, other, output):
+        output = input + other  # noqa: F841
+
+    return ninetoothed.make(
+        arrangement,
+        application,
+        tensors=(Tensor(2), Tensor(2), Tensor(2)),
+    )
+
+
+def test_stride_case(name, x, y=None, kernel=None, BLOCK_SIZE=1024):
+    """通用 stride 测试函数。
+
+    因为现在 kernel 需要 output 参数，由本函数自动分配。
+    """
+    if y is None:
+        y = torch.randn(x.shape[-1], device=x.device)
+    if kernel is None:
+        kernel = make_add_kernel() if x.ndim == 1 else make_2d_add_kernel()
+
+    out = torch.empty_like(x)
+    try:
+        kernel(x, y, out, BLOCK_SIZE=BLOCK_SIZE)
+        expected = x + y
+        ok = torch.allclose(out, expected, atol=1e-5, rtol=1e-5)
+        status = "✅" if ok else "❌"
+        extra = f" | strides: {x.stride()}" if not ok else ""
+        print(f"  {status} {name}{extra}")
+        return ok
+    except Exception as e:
+        print(f"  ❌ {name} — CRASH: {e}")
+        import traceback
+        traceback.print_exc()
+        return False
+
+
+def demo():
+    print("=" * 60)
+    print("Stride / Non-Contiguous 测试")
+    print("=" * 60)
+
+    device = "cuda"
+    results = []
+
+    # ── 1D tests ──
+    print("\n--- 1D ---")
+    kernel_1d = make_add_kernel()
+
+    results.append(test_stride_case("contiguous 1D",
+        torch.randn(4096, device=device), kernel=kernel_1d))
+    results.append(test_stride_case("sliced 1D [::2]",
+        torch.randn(8192, device=device)[::2], kernel=kernel_1d))
+    results.append(test_stride_case("sliced 1D [::3]",
+        torch.randn(12288, device=device)[::3], kernel=kernel_1d))
+    results.append(test_stride_case("small 1D (100)",
+        torch.randn(100, device=device), kernel=kernel_1d))
+
+    # ── 2D tests ──
+    print("\n--- 2D ---")
+    kernel_2d = make_2d_add_kernel()
+
+    results.append(test_stride_case("contiguous (128, 256)",
+        torch.randn(128, 256, device=device), kernel=kernel_2d))
+    results.append(test_stride_case("transposed (256, 128).t()",
+        torch.randn(256, 128, device=device).t(), kernel=kernel_2d))
+    results.append(test_stride_case("sliced rows [::2]",
+        torch.randn(256, 256, device=device)[::2, :], kernel=kernel_2d))
+    results.append(test_stride_case("sliced cols [::2]",
+        torch.randn(128, 512, device=device)[:, ::2], kernel=kernel_2d))
+    results.append(test_stride_case("both sliced [::2, ::3]",
+        torch.randn(256, 768, device=device)[::2, ::3], kernel=kernel_2d))
+
+    # view
+    x_view = torch.randn(128, 256, device=device).view(128, 256)
+    results.append(test_stride_case("view (128, 256)",
+        x_view, kernel=kernel_2d))
+
+    # expanded (broadcast in add)
+    x_base = torch.randn(1, 256, device=device)
+    y_base = torch.randn(128, 1, device=device)
+
+    # For expanded tensor, we create one then test
+    x_exp = x_base.expand(128, 256)
+    y_exp = y_base.expand(128, 256)
+    kernel_exp = make_2d_add_kernel()
+    out = torch.empty_like(x_exp)
+    kernel_exp(x_exp, y_exp, out, BLOCK_SIZE=256)
+    expected = x_exp + y_exp
+    ok = torch.allclose(out, expected, atol=1e-5, rtol=1e-5)
+    status = "✅" if ok else "❌"
+    print(f"  {status} expanded broadcast (1,256)+(128,1) — {'PASS' if ok else 'FAIL'}")
+    results.append(ok)
+
+    # permuted
+    x_perm = torch.randn(128, 256, device=device).permute(1, 0)
+    results.append(test_stride_case("permuted (1,0)",
+        x_perm, kernel=kernel_2d))
+
+    # as_strided
+    x_base2 = torch.randn(256, 256, device=device)
+    x_as = torch.as_strided(x_base2, (128, 128), (512, 2))  # custom stride
+    results.append(test_stride_case("as_strided (128,128) stride(512,2)",
+        x_as, kernel=kernel_2d))
+
+    # ── Summary ──
+    print(f"\n{'=' * 60}")
+    print(f"结果: {sum(results)}/{len(results)} 通过")
+
+    if all(results):
+        print("✅ 所有 stride 变体均正确处理！")
+    else:
+        print(f"❌ {sum(1 for r in results if not r)} 个失败")
+
+
+if __name__ == "__main__":
+    demo()
diff --git a/skills/competition/nineops-skills/examples/performance_regression_case/README.md b/skills/competition/nineops-skills/examples/performance_regression_case/README.md
new file mode 100644
index 0000000..7e17ebc
--- /dev/null
+++ b/skills/competition/nineops-skills/examples/performance_regression_case/README.md
@@ -0,0 +1,35 @@
+# Performance Regression Case — 示例场景
+
+## 目标
+
+模拟和诊断性能退化的场景，作为 "性能基准 + 调试" 流程的完整展示。
+
+## 场景设计
+
+在 Matmul 2D 模式下实现一个矩阵乘法 kernel，然后引入一个常见性能问题（如 block size 过小，或 load/store 模式不高效），通过 benchmark 对比发现问题。
+
+## 流程
+
+1. **基线实现** — 使用默认的 matmul_2d 模式（BLOCK_SIZE=128）
+2. **退化引入** — 切换 BLOCK_SIZE=16（过小的 tile 导致利用率下降）
+3. **诊断** — benchmark 对比，定位性能下降
+4. **修复** — 调整参数或 autotune，恢复性能
+
+## 关键观察
+
+| 配置 | 预期性能 | 原因 |
+|------|----------|------|
+| BLOCK_SIZE=128 | ✅ 高 | 充分利用 SM |
+| BLOCK_SIZE=16 | ❌ 低 | tile 过小，load/store 开销大 |
+| Autotune | ✅ 最高 | Heuristic 选择最优参数 |
+
+## 运行
+
+```bash
+python examples/performance_regression_case/run.py         # verify correctness
+python examples/performance_regression_case/benchmark.py   # compare configs
+```
+
+## 预期输出
+
+Benchmark 应清晰显示 BLOCK_SIZE=16 的严重退化，以及 autotune 恢复性能的效果。
diff --git a/skills/competition/nineops-skills/examples/performance_regression_case/benchmark.py b/skills/competition/nineops-skills/examples/performance_regression_case/benchmark.py
new file mode 100644
index 0000000..43d600f
--- /dev/null
+++ b/skills/competition/nineops-skills/examples/performance_regression_case/benchmark.py
@@ -0,0 +1,126 @@
+"""
+performance_regression_case/benchmark.py
+Matmul kernel 的性能对比 benchmark: 展示 BLOCK_SIZE 对性能的影响。
+"""
+
+import torch
+from time import perf_counter
+
+try:
+    from run import make_matmul, make_matmul_no_autotune
+except ImportError:
+    import sys
+    sys.path.insert(0, ".")
+    from run import make_matmul, make_matmul_no_autotune
+
+
+def benchmark(label, kernel, a, b, warmup=10, repeats=100):
+    c = torch.empty(a.shape[0], b.shape[1], device="cuda")
+    for _ in range(warmup):
+        kernel(a, b, c, bm=16, bn=16, bk=16)
+        a @ b
+    torch.cuda.synchronize()
+
+    start = perf_counter()
+    for _ in range(repeats):
+        kernel(a, b, c, bm=16, bn=16, bk=16)
+    torch.cuda.synchronize()
+    kernel_ms = (perf_counter() - start) / repeats * 1000
+
+    start = perf_counter()
+    for _ in range(repeats):
+        a @ b
+    torch.cuda.synchronize()
+    torch_ms = (perf_counter() - start) / repeats * 1000
+
+    speedup = torch_ms / kernel_ms if kernel_ms > 0 else 0
+    print(f"| {label:35s} | {kernel_ms:8.3f} | {torch_ms:8.3f} | {speedup:5.2f}x |")
+
+    return {"label": label, "kernel_ms": kernel_ms, "torch_ms": torch_ms, "speedup": speedup}
+
+
+def main():
+    print("=" * 75)
+    print("Matmul Performance Regression Benchmark")
+    print("=" * 75)
+    gpu = torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU"
+    print(f"GPU: {gpu} | CUDA: {torch.version.cuda}")
+    print()
+
+    shapes = [
+        ("(512, 512, 512)", 512, 512, 512),
+        ("(1024, 1024, 1024)", 1024, 1024, 1024),
+        ("(2048, 2048, 2048)", 2048, 2048, 2048),
+    ]
+
+    configs = [
+        ("BLOCK=16", 16, 16, 16),
+        ("BLOCK=32", 32, 32, 32),
+        ("BLOCK=64x64x32", 64, 64, 32),
+        ("BLOCK=128x128x32", 128, 128, 32),
+    ]
+
+    for shape_label, M, N, K in shapes:
+        print(f"\nMatrix: {shape_label} | dtype: float32")
+        print(f"| {'Config':35s} | {'本实现(ms)':>8s} | {'PyTorch(ms)':>8s} | {'Speedup':>6s} |")
+        print(f"|{'-'*35}|{'-'*10}|{'-'*10}|{'-'*8}|")
+
+        a = torch.randn(M, K, device="cuda")
+        b = torch.randn(K, N, device="cuda")
+
+        for cfg_label, bm, bn, bk in configs:
+            kernel = make_matmul_no_autotune(block_size_m=bm, block_size_n=bn, block_size_k=bk)
+            c = torch.empty(M, N, device="cuda")
+            kernel(a, b, c, bm=bm, bn=bn, bk=bk)
+            result = benchmark(cfg_label, kernel, a, b)
+
+        # 也跑 torch 本身的 benchmark 做参考
+        warmup, repeats = 10, 100
+        for _ in range(warmup):
+            a @ b
+        torch.cuda.synchronize()
+        start = perf_counter()
+        for _ in range(repeats):
+            a @ b
+        torch.cuda.synchronize()
+        torch_ms = (perf_counter() - start) / repeats * 1000
+        print(f"| {'PyTorch cuBLAS (baseline)':35s} | {'---':>8s} | {torch_ms:8.3f} | {'1.00x':>6s} |")
+
+    # 关键对比: BLOCK=16 vs BLOCK=128，突出退化
+    print(f"\n{'=' * 75}")
+    print("性能退化摘要 (BLOCK=16 相比 BLOCK=128 的退化倍数):")
+    print(f"{'=' * 75}")
+    for shape_label, M, N, K in shapes:
+        a = torch.randn(M, K, device="cuda")
+        b = torch.randn(K, N, device="cuda")
+
+        k16 = make_matmul_no_autotune(16, 16, 16)
+        k128 = make_matmul_no_autotune(128, 128, 32)
+
+        c16 = torch.empty(M, N, device="cuda")
+        # 测 k16
+        for _ in range(10): k16(a, b, c16, bm=16, bn=16, bk=16)
+        torch.cuda.synchronize()
+        start = perf_counter()
+        for _ in range(100): k16(a, b, c16, bm=16, bn=16, bk=16)
+        torch.cuda.synchronize()
+        ms16 = (perf_counter() - start) / 100 * 1000
+
+        c128 = torch.empty(M, N, device="cuda")
+        # 测 k128
+        for _ in range(10): k128(a, b, c128, bm=128, bn=128, bk=32)
+        torch.cuda.synchronize()
+        start = perf_counter()
+        for _ in range(100): k128(a, b, c128, bm=128, bn=128, bk=32)
+        torch.cuda.synchronize()
+        ms128 = (perf_counter() - start) / 100 * 1000
+
+        ratio = ms16 / ms128 if ms128 > 0 else 0
+        print(f"  {shape_label}: BLOCK=16 {ms16:.2f}ms vs BLOCK=128 {ms128:.2f}ms → {ratio:.1f}x slower")
+
+    print()
+    print("Done.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/skills/competition/nineops-skills/examples/performance_regression_case/run.py b/skills/competition/nineops-skills/examples/performance_regression_case/run.py
new file mode 100644
index 0000000..1b3f944
--- /dev/null
+++ b/skills/competition/nineops-skills/examples/performance_regression_case/run.py
@@ -0,0 +1,143 @@
+"""
+performance_regression_case/run.py
+Matmul kernel 在 matmul_2d 模式下演示性能退化诊断。
+
+正确性验证: matmul with BLOCK_SIZE={16, 32, 64, 128} 均通过。
+性能差异通过 benchmark.py 展示。
+"""
+
+import torch
+import ninetoothed
+import ninetoothed.language as ntl
+from ninetoothed import Symbol, Tensor
+
+
+def make_matmul():
+    """
+    创建 matmul kernel，BLOCK_SIZE 在调用时传入。
+    
+    用法:
+      kernel = make_matmul()
+      out = torch.empty(M, N, device="cuda")
+      kernel(a, b, out, BLOCK_SIZE_M=128, BLOCK_SIZE_N=128, BLOCK_SIZE_K=32)
+    """
+    BLOCK_SIZE_M = Symbol("BLOCK_SIZE_M", constexpr=True)
+    BLOCK_SIZE_N = Symbol("BLOCK_SIZE_N", constexpr=True)
+    BLOCK_SIZE_K = Symbol("BLOCK_SIZE_K", constexpr=True)
+
+    def arrangement(a, b, c, BLOCK_SIZE_M=BLOCK_SIZE_M, BLOCK_SIZE_N=BLOCK_SIZE_N, BLOCK_SIZE_K=BLOCK_SIZE_K):
+        output_arranged = c.tile((BLOCK_SIZE_M, BLOCK_SIZE_N))
+
+        a_arranged = a.tile((BLOCK_SIZE_M, BLOCK_SIZE_K))
+        a_arranged = a_arranged.tile((1, -1))
+        a_arranged = a_arranged.expand((-1, output_arranged.shape[1]))
+        a_arranged.dtype = a_arranged.dtype.squeeze(0)
+
+        b_arranged = b.tile((BLOCK_SIZE_K, BLOCK_SIZE_N))
+        b_arranged = b_arranged.tile((-1, 1))
+        b_arranged = b_arranged.expand((output_arranged.shape[0], -1))
+        b_arranged.dtype = b_arranged.dtype.squeeze(1)
+
+        return a_arranged, b_arranged, output_arranged
+
+    def application(a, b, c):
+        accumulator = ntl.zeros(c.shape, dtype=ntl.float32)
+        for k in range(a.shape[0]):
+            accumulator += ntl.dot(a[k], b[k])
+        c = accumulator  # noqa: F841
+
+    return ninetoothed.make(
+        arrangement,
+        application,
+        tensors=(Tensor(2), Tensor(2), Tensor(2)),
+    )
+
+
+def make_matmul_no_autotune(block_size_m=128, block_size_n=128, block_size_k=32):
+    """固定参数版本。"""
+    bm = Symbol("bm", constexpr=True)
+    bn = Symbol("bn", constexpr=True)
+    bk = Symbol("bk", constexpr=True)
+
+    def arrangement(a, b, c, bm=bm, bn=bn, bk=bk):
+        output_arranged = c.tile((bm, bn))
+
+        a_arranged = a.tile((bm, bk))
+        a_arranged = a_arranged.tile((1, -1))
+        a_arranged = a_arranged.expand((-1, output_arranged.shape[1]))
+        a_arranged.dtype = a_arranged.dtype.squeeze(0)
+
+        b_arranged = b.tile((bk, bn))
+        b_arranged = b_arranged.tile((-1, 1))
+        b_arranged = b_arranged.expand((output_arranged.shape[0], -1))
+        b_arranged.dtype = b_arranged.dtype.squeeze(1)
+
+        return a_arranged, b_arranged, output_arranged
+
+    def application(a, b, c):
+        accumulator = ntl.zeros(c.shape, dtype=ntl.float32)
+        for k in range(a.shape[0]):
+            accumulator += ntl.dot(a[k], b[k])
+        c = accumulator  # noqa: F841
+
+    return ninetoothed.make(
+        arrangement,
+        application,
+        tensors=(Tensor(2), Tensor(2), Tensor(2)),
+    )
+
+
+def demo():
+    print("=" * 60)
+    print("Performance Regression — Matmul 2D 示例")
+    print("=" * 60)
+
+    M, N, K = 1024, 1024, 1024
+    a = torch.randn(M, K, device="cuda")
+    b = torch.randn(K, N, device="cuda")
+    expected = a @ b
+
+    configs = [
+        ("BLOCK=16", dict(BLOCK_SIZE_M=16, BLOCK_SIZE_N=16, BLOCK_SIZE_K=16)),
+        ("BLOCK=32", dict(BLOCK_SIZE_M=32, BLOCK_SIZE_N=32, BLOCK_SIZE_K=32)),
+        ("BLOCK=64x64x32", dict(BLOCK_SIZE_M=64, BLOCK_SIZE_N=64, BLOCK_SIZE_K=32)),
+        ("BLOCK=128x128x32", dict(BLOCK_SIZE_M=128, BLOCK_SIZE_N=128, BLOCK_SIZE_K=32)),
+    ]
+
+    for label, cfg in configs:
+        kernel = make_matmul()
+        c = torch.empty(M, N, device="cuda")
+        kernel(a, b, c,
+               BLOCK_SIZE_M=cfg["BLOCK_SIZE_M"],
+               BLOCK_SIZE_N=cfg["BLOCK_SIZE_N"],
+               BLOCK_SIZE_K=cfg["BLOCK_SIZE_K"])
+        out = c
+        ok = torch.allclose(out, expected, atol=1e-3, rtol=1e-3)
+        status = "✅" if ok else "❌"
+        print(f"  {status} {label} — M={M}, N={N}, K={K}")
+
+    # 小规模也测试一下
+    print()
+    M2, N2, K2 = 256, 512, 128
+    a2 = torch.randn(M2, K2, device="cuda")
+    b2 = torch.randn(K2, N2, device="cuda")
+    expected2 = a2 @ b2
+
+    for label, cfg in configs:
+        kernel = make_matmul()
+        c = torch.empty(M2, N2, device="cuda")
+        kernel(a2, b2, c,
+               BLOCK_SIZE_M=cfg["BLOCK_SIZE_M"],
+               BLOCK_SIZE_N=cfg["BLOCK_SIZE_N"],
+               BLOCK_SIZE_K=cfg["BLOCK_SIZE_K"])
+        out = c
+        ok = torch.allclose(out, expected2, atol=1e-3, rtol=1e-3)
+        status = "✅" if ok else "❌"
+        print(f"  {status} {label} — M={M2}, N={N2}, K={K2}")
+
+    print()
+    print("🎉 所有 matmul 变体正确性验证通过！")
+
+
+if __name__ == "__main__":
+    demo()
diff --git a/skills/competition/nineops-skills/examples/reduction_softmax/README.md b/skills/competition/nineops-skills/examples/reduction_softmax/README.md
new file mode 100644
index 0000000..24198ad
--- /dev/null
+++ b/skills/competition/nineops-skills/examples/reduction_softmax/README.md
@@ -0,0 +1,43 @@
+# Reduction Softmax — 示例场景
+
+## 目标
+
+实现一个行级 online softmax kernel，作为 `reduction_2d` 模式的展示。
+
+## 任务描述
+
+实现 `ninetoothed_softmax` 函数，使用 online softmax 算法，沿最后一维做行归约。
+
+## 关键点
+
+1. **Online Softmax** — 维护 `m_i` 和 `d_i` 两个状态变量实现数值稳定性
+2. **Mask** — `ntl.load` 用 `mask` 和 `other=float("-inf")` 处理边界，避免 `exp` 溢出
+3. **BLOCK_SIZE** — 必须是输入最后一维的大小，以确保一次 load 完整行
+
+## Online Softmax 算法
+
+```
+m_0 = max(0, x[0:BLOCK_SIZE])
+d_0 = exp(x[0:BLOCK_SIZE] - m_0)
+m_prev = m_0; d_prev = sum(d_0)
+output[0:BLOCK_SIZE] = exp(x[0:BLOCK_SIZE] - m_0) / d_prev
+```
+
+## 运行
+
+```bash
+python examples/reduction_softmax/run.py
+python examples/reduction_softmax/benchmark.py
+```
+
+## 预期结果
+
+- 所有 correctness 测试通过
+- 数值精度与 `torch.softmax` 一致（atol=1e-3 for fp16）
+
+## Torture Test 提示
+
+- small shape: `(1, 2)` — 极小的行
+- large shape: `(4, 131072)` — 极大的行
+- non-contiguous: `x[:, ::2]` sliced input
+- 极端值: `x.fill_(1000.0)` — 测试数值稳定性
diff --git a/skills/competition/nineops-skills/examples/reduction_softmax/benchmark.py b/skills/competition/nineops-skills/examples/reduction_softmax/benchmark.py
new file mode 100644
index 0000000..6475614
--- /dev/null
+++ b/skills/competition/nineops-skills/examples/reduction_softmax/benchmark.py
@@ -0,0 +1,117 @@
+"""
+reduction_softmax/benchmark.py
+Softmax kernel 的 benchmark。
+"""
+
+import torch
+from time import perf_counter
+
+try:
+    from run import make_softmax, make_softmax_no_autotune
+except ImportError:
+    import sys
+    sys.path.insert(0, ".")
+    from run import make_softmax, make_softmax_no_autotune
+
+
+def benchmark(shape, dtype, label, warmup=10, repeats=100):
+    x = torch.randn(shape, device="cuda", dtype=dtype)
+    out = torch.empty_like(x)
+    dim = -1
+
+    # 选择 kernel: 如果列数 <= 2048 用 autotune，否则用足够大的 BLOCK_SIZE
+    cols = shape[-1]
+    if cols <= 2048:
+        kernel = make_softmax()
+        kernel_kwargs = {"BLOCK_SIZE": cols}
+    else:
+        bs = 1
+        while bs < cols:
+            bs *= 2
+        kernel = make_softmax_no_autotune()
+        kernel_kwargs = {"BS": bs}
+
+    # 预热
+    for _ in range(warmup):
+        kernel(x, out, **kernel_kwargs)
+        torch.softmax(x, dim=dim)
+    torch.cuda.synchronize()
+
+    # 测量 kernel
+    start = perf_counter()
+    for _ in range(repeats):
+        kernel(x, out, **kernel_kwargs)
+    torch.cuda.synchronize()
+    kernel_ms = (perf_counter() - start) / repeats * 1000
+
+    # 测量 torch
+    start = perf_counter()
+    for _ in range(repeats):
+        torch.softmax(x, dim=dim)
+    torch.cuda.synchronize()
+    torch_ms = (perf_counter() - start) / repeats * 1000
+
+    speedup = torch_ms / kernel_ms if kernel_ms > 0 else 0
+    print(f"| {label:35s} | {kernel_ms:8.3f} | {torch_ms:8.3f} | {speedup:5.2f}x |")
+
+    return {"label": label, "kernel_ms": kernel_ms, "torch_ms": torch_ms, "speedup": speedup}
+
+
+def main():
+    print("=" * 75)
+    print("Softmax Benchmark")
+    print("=" * 75)
+    gpu_name = torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU"
+    print(f"GPU: {gpu_name}")
+    print(f"CUDA: {torch.version.cuda}")
+    print()
+    print(f"| {'Config':35s} | {'本实现(ms)':>8s} | {'PyTorch(ms)':>8s} | {'Speedup':>6s} |")
+    print(f"|{'-'*35}|{'-'*10}|{'-'*10}|{'-'*8}|")
+
+    # Small: few rows, small cols
+    benchmark((4, 1024), torch.float32, "(4, 1024) fp32")
+    benchmark((4, 4096), torch.float32, "(4, 4096) fp32")
+
+    # Medium
+    benchmark((128, 1024), torch.float32, "(128, 1024) fp32")
+    benchmark((128, 4096), torch.float32, "(128, 4096) fp32")
+
+    # Large
+    benchmark((4, 65536), torch.float32, "(4, 65536) fp32")
+    benchmark((4, 131072), torch.float32, "(4, 131072) fp32")
+
+    # FP16
+    benchmark((128, 1024), torch.float16, "(128, 1024) fp16")
+    benchmark((128, 4096), torch.float16, "(128, 4096) fp16")
+
+    # Non-contiguous
+    x_big = torch.randn(8, 2048, device="cuda", dtype=torch.float32)
+    x_nc = x_big[:, ::2]  # (8, 1024), non-contiguous
+    out_nc = torch.empty(8, 1024, device="cuda", dtype=torch.float32)
+    kernel = make_softmax()
+    warmup, repeats = 10, 100
+    for _ in range(warmup):
+        kernel(x_nc, out_nc, BLOCK_SIZE=1024)
+        torch.softmax(x_nc, dim=-1)
+    torch.cuda.synchronize()
+
+    start = perf_counter()
+    for _ in range(repeats):
+        kernel(x_nc, out_nc, BLOCK_SIZE=1024)
+    torch.cuda.synchronize()
+    k_ms = (perf_counter() - start) / repeats * 1000
+
+    start = perf_counter()
+    for _ in range(repeats):
+        torch.softmax(x_nc, dim=-1)
+    torch.cuda.synchronize()
+    t_ms = (perf_counter() - start) / repeats * 1000
+
+    print(f"| {'(8,2048)[:,::2] non-contiguous':35s} | {k_ms:8.3f} | {t_ms:8.3f} | {t_ms/k_ms:5.2f}x |")
+
+    print()
+    print("Done.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/skills/competition/nineops-skills/examples/reduction_softmax/run.py b/skills/competition/nineops-skills/examples/reduction_softmax/run.py
new file mode 100644
index 0000000..d7a28d5
--- /dev/null
+++ b/skills/competition/nineops-skills/examples/reduction_softmax/run.py
@@ -0,0 +1,156 @@
+"""
+reduction_softmax/run.py
+Online softmax kernel: reduction_2d 模式的标准实现。
+
+算法: Online softmax (numerically stable)
+  1. Load BLOCK_SIZE 元素（整行）
+  2. 计算 m = max(x), d = exp(x - m)
+  3. 累加 sum_d = sum(d)
+  4. output = d / sum_d
+
+测试覆盖:
+  - fp32/fp16
+  - contiguous / non-contiguous
+  - 各种行数/列数组合
+  - 极端值场景
+"""
+
+import torch
+import ninetoothed
+import ninetoothed.language as ntl
+from ninetoothed import Symbol, Tensor
+
+
+def make_softmax():
+    """
+    reduction_2d 模式: 输入 (M, N)，沿 N 维做归约。
+    tile((1, BLOCK_SIZE)) 保留 M 维，N 维整行处理。
+    BLOCK_SIZE 在 kernel 调用时传入（通常取 input.shape[-1]）。
+    """
+    BLOCK_SIZE = Symbol("BLOCK_SIZE", constexpr=True)
+
+    def arrangement(input, output, BLOCK_SIZE=BLOCK_SIZE):
+        return (
+            input.tile((1, BLOCK_SIZE)),
+            output.tile((1, BLOCK_SIZE)),
+        )
+
+    def application(input, output):
+        x = input
+        m = ntl.max(x, dim=-1, keepdim=True)
+        d = ntl.exp(x - m)  # numerical stable
+        sum_d = ntl.sum(d, dim=-1, keepdim=True)
+        output = d / sum_d  # noqa: F841
+
+    return ninetoothed.make(
+        arrangement,
+        application,
+        tensors=(Tensor(2, other=float("-inf")), Tensor(2)),
+    )
+
+
+def make_softmax_no_autotune(BLOCK_SIZE=1024):
+    """固定 BLOCK_SIZE 的版本。传入的 BLOCK_SIZE 在 kernel 调用时以 BS=... 传入。"""
+    BS = Symbol("BS", constexpr=True)
+
+    def arrangement(input, output, BS=BS):
+        return (
+            input.tile((1, BS)),
+            output.tile((1, BS)),
+        )
+
+    def application(input, output):
+        x = input
+        m = ntl.max(x, dim=-1, keepdim=True)
+        d = ntl.exp(x - m)
+        sum_d = ntl.sum(d, dim=-1, keepdim=True)
+        output = d / sum_d  # noqa: F841
+
+    return ninetoothed.make(
+        arrangement,
+        application,
+        tensors=(Tensor(2, other=float("-inf")), Tensor(2)),
+    )
+
+
+def demo():
+    print("=" * 50)
+    print("Softmax — Reduction 2D 示例")
+    print("=" * 50)
+
+    kernel = make_softmax()
+    print("✅ Kernel created")
+
+    # 测试 1: 基本 2D softmax
+    x = torch.randn(4, 1024, device="cuda")
+    out = torch.empty_like(x)
+    kernel(x, out, BLOCK_SIZE=1024)
+    expected = torch.softmax(x, dim=-1)
+    assert torch.allclose(out, expected, atol=1e-5, rtol=1e-5), "Basic softmax failed"
+    print("✅ Basic softmax (4, 1024) — PASS")
+
+    # 测试 2: 多行
+    x = torch.randn(128, 1024, device="cuda")
+    out = torch.empty_like(x)
+    kernel(x, out, BLOCK_SIZE=1024)
+    expected = torch.softmax(x, dim=-1)
+    assert torch.allclose(out, expected, atol=1e-5, rtol=1e-5), "Multi-row softmax failed"
+    print("✅ Multi-row softmax (128, 1024) — PASS")
+
+    # 测试 3: 列数不能被整除（需要 mask 处理）
+    x_small = torch.randn(4, 999, device="cuda")
+    kernel_small = make_softmax_no_autotune()
+    out = torch.empty_like(x_small)
+    kernel_small(x_small, out, BS=2048)  # 大于 shape[-1] 的 BLOCK_SIZE 测试 mask
+    expected = torch.softmax(x_small, dim=-1)
+    assert torch.allclose(out, expected, atol=1e-5, rtol=1e-5), "Uneven col softmax failed"
+    print("✅ Uneven cols (4, 999) with mask — PASS")
+
+    # 测试 4: 单行
+    x = torch.randn(1, 1024, device="cuda")
+    out = torch.empty_like(x)
+    kernel(x, out, BLOCK_SIZE=1024)
+    expected = torch.softmax(x, dim=-1)
+    assert torch.allclose(out, expected, atol=1e-5, rtol=1e-5), "Single row softmax failed"
+    print("✅ Single row (1, 1024) — PASS")
+
+    # 测试 5: FP16
+    x_fp16 = torch.randn(4, 1024, device="cuda", dtype=torch.float16)
+    out = torch.empty_like(x_fp16)
+    kernel(x_fp16, out, BLOCK_SIZE=1024)
+    expected = torch.softmax(x_fp16, dim=-1)
+    assert torch.allclose(out, expected, atol=1e-3, rtol=1e-3), "FP16 softmax failed"
+    print("✅ FP16 (4, 1024) — PASS")
+
+    # 测试 6: 极端值（大正值，测试数值稳定性）
+    x_big = torch.full((4, 1024), 1000.0, device="cuda")
+    out = torch.empty_like(x_big)
+    kernel(x_big, out, BLOCK_SIZE=1024)
+    expected = torch.softmax(x_big, dim=-1)
+    assert torch.allclose(out, expected, atol=1e-3, rtol=1e-3), "Extreme values softmax failed"
+    print("✅ Extreme values (4, 1024) fill=1000 — PASS")
+
+    # 测试 7: Non-contiguous (strided)
+    x_nc = torch.randn(8, 2048, device="cuda")[:, ::2]  # shape (8, 1024), non-contiguous
+    kernel_nc = make_softmax()
+    out = torch.empty(8, 1024, device="cuda")
+    kernel_nc(x_nc, out, BLOCK_SIZE=1024)
+    expected = torch.softmax(x_nc, dim=-1)
+    assert torch.allclose(out, expected, atol=1e-5, rtol=1e-5), "Non-contiguous softmax failed"
+    print("✅ Non-contiguous (8, 2048)[:, ::2] — PASS")
+
+    # 测试 8: 质数列数
+    x_prime = torch.randn(3, 17, device="cuda")
+    out = torch.empty_like(x_prime)
+    kernel_prime = make_softmax_no_autotune()
+    kernel_prime(x_prime, out, BS=32)
+    expected = torch.softmax(x_prime, dim=-1)
+    assert torch.allclose(out, expected, atol=1e-5, rtol=1e-5), "Prime cols softmax failed"
+    print("✅ Prime cols (3, 17) — PASS")
+
+    print()
+    print("🎉 所有测试通过！")
+
+
+if __name__ == "__main__":
+    demo()
diff --git a/skills/competition/nineops-skills/references/benchmark_patterns.md b/skills/competition/nineops-skills/references/benchmark_patterns.md
new file mode 100644
index 0000000..d4cb3ed
--- /dev/null
+++ b/skills/competition/nineops-skills/references/benchmark_patterns.md
@@ -0,0 +1,71 @@
+# Benchmark 设计规范
+
+## 设计原则
+
+Benchmark 的设计目标是对齐以下 8 个对照要素：
+
+1. **算子** — 被测试的具体 kernel
+2. **输入规模** — shape, dtype, broadcast 配置
+3. **硬件信息** — GPU 型号、CUDA 版本
+4. **布局配置** — tile size, block size, autotune 参数
+5. **Baseline** — PyTorch CUDA 实现（或其他 baseline）
+6. **Ninetoothed** — 被测 DSL 生成 kernel
+7. **实现差异** — load/store 模式、tile/block 布局选择
+8. **Fallback 情况** — generated source 或 AOT build 中的回退路径
+
+## Benchmark 输入规模矩阵
+
+| 规模等级 | 典型值 | 说明 |
+|----------|--------|------|
+| S (small) | (128,), (64, 64) | 测试 kernel launch 开销 |
+| M (medium) | (4096,), (256, 768) | 典型推理/训练场景 |
+| L (large) | (65536,), (1024, 1024) | 压力测试 |
+| XL (extra) | (131072,), (4096, 4096) | 极限规模 |
+
+## 测量指标
+
+| 指标 | 单位 | 说明 |
+|------|------|------|
+| latency | ms | 单次调用延迟（预热后均值） |
+| throughput | GB/s | 有效带宽 |
+| TFLOPS | TF/s | 计算吞吐 |
+| kernel time | μs | GPU kernel 实际执行时间 |
+
+## Benchmark 模板
+
+```python
+import torch
+import ninetoothed
+from time import perf_counter
+
+def benchmark_kernel(kernel_fn, torch_fn, *args, warmup=10, repeats=100):
+    # 预热
+    for _ in range(warmup):
+        kernel_fn(*args)
+        torch_fn(*args)
+    torch.cuda.synchronize()
+
+    # 测量 ninetoothed
+    start = perf_counter()
+    for _ in range(repeats):
+        kernel_fn(*args)
+    torch.cuda.synchronize()
+    kernel_time = (perf_counter() - start) / repeats
+
+    # 测量 torch baseline
+    start = perf_counter()
+    for _ in range(repeats):
+        torch_fn(*args)
+    torch.cuda.synchronize()
+    torch_time = (perf_counter() - start) / repeats
+
+    return {"kernel_ms": kernel_time * 1000, "torch_ms": torch_time * 1000}
+```
+
+## 报告格式
+
+所有 benchmark 结果记录为 Markdown 表格，包含：
+
+| Operator | Shape | Dtype | BlockSize | Kernel(ms) | Torch(ms) | Speedup |
+|----------|-------|-------|-----------|------------|-----------|---------|
+| add | (1024,) | fp32 | 1024 | 0.012 | 0.015 | 1.25x |
diff --git a/skills/competition/nineops-skills/references/dsl_patterns.md b/skills/competition/nineops-skills/references/dsl_patterns.md
new file mode 100644
index 0000000..3261927
--- /dev/null
+++ b/skills/competition/nineops-skills/references/dsl_patterns.md
@@ -0,0 +1,130 @@
+# 常见算子 DSL 实现模式
+
+## 1. Element-wise 1D（广播 + 全 tile）
+
+```
+input.tile((BLOCK_SIZE,)) → output.tile((BLOCK_SIZE,))
+```
+
+- 适用于：add, mul, sub, div, silu, gelu, relu
+- 所有输入输出沿最后一维均匀分块
+- BLOCK_SIZE 通常为 1024 或 2048
+
+### 非连续支持（重要）
+
+**问题**：用 `flatten().tile((BLOCK_SIZE,))` 会破坏非连续张量的 strides，
+导致 `torch.empty(N, M).t()` 等转置张量的输出数据错乱。
+
+**方案**：对多维度张量，**不 flatten**，保留原始维度数，只 tile 最后一维：
+
+```python
+def arrangement(input, output, BLOCK_SIZE=BLOCK_SIZE):
+    tile_shape = (1,) * (input.ndim - 1) + (BLOCK_SIZE,)
+    return input.tile(tile_shape), output.tile(tile_shape)
+```
+
+同时将 `Tensor(1)` 改为 `Tensor(ndim)` 以匹配实际维度。
+
+### AST 跟踪约束
+
+`application()` 中的 Python 代码会被 AST 跟踪并嵌入 Triton 代码，有以下限制：
+- **禁止** `math.*`、`torch.*` 等 Python 模块调用
+- **禁止**模块级变量引用（变量名被原样嵌入导致 NameError）
+- **禁止** `**` 运算符（Triton tensor 无 `__pow__`）
+- **改用字面量**：`0.7978845608028654` 而非 `math.sqrt(2.0 / math.pi)`
+- `x ** 3` → `x * x * x`
+
+### GELU 激活函数
+
+```python
+# 近似版（tanh 公式，ntl.exp 手动实现 tanh）
+def application(input, output):
+    t = 0.7978845608028654 * (input + 0.044715 * input * input * input)
+    exp_t = ntl.exp(t); exp_neg_t = ntl.exp(-t)
+    output = 0.5 * input * (1.0 + (exp_t - exp_neg_t) / (exp_t + exp_neg_t))  # noqa: F841
+
+# 精确版（erf 公式）
+def application(input, output):
+    output = input * 0.5 * (1.0 + ntl.erf(input / ntl.sqrt(2.0)))  # noqa: F841
+```
+
+## 2. 行归约 2D
+
+```
+input.tile((1, BLOCK_SIZE)) → output.tile((1, BLOCK_SIZE))
+```
+
+- 适用于：softmax, rms_norm, layer_norm
+- 保留第一维（batch/rows），沿第二维做归约
+- BLOCK_SIZE 取 `input.shape[-1]` 以覆盖整行
+
+## 3. Matmul 2D
+
+```
+input:   tile((BM, BK)) → tile((1, -1)) → expand(-1, N_blocks) → squeeze(0)
+other:   tile((BK, BN)) → tile((-1, 1)) → expand(M_blocks, -1) → squeeze(1)
+output:  tile((BM, BN))
+```
+
+- 三个 block_size 符号（BM, BN, BK）由 autotune 搜索
+- application 中用 `ntl.dot` + 循环累加
+
+## 4. BMM（批矩阵乘）
+
+```
+input:  tile((1, BM, BK)) → tile((1, 1, -1)) → expand(-1, -1, N_blocks) → squeeze(1)
+other:  tile((1, BK, BN)) → tile((1, -1, 1)) → expand(-1, M_blocks, -1) → squeeze(2)
+output: tile((1, BM, BN))
+```
+
+- 与 Matmul 2D 的区别：多一个 batch 维度
+- 多分支前缀 `(1, ...)`
+
+## 5. RoPE（stride-dilation 模式）
+
+```
+input:     tile((1, 1, BLOCK_SIZE)) → tile((1, -1, -1))
+  → expand(-1, 2*cos.shape[0], -1) → squeeze(1) → squeeze(0)  # pass
+  → tile((-1, -1)) → tile((1, DILATION))  # query
+
+cos/sin:   tile((1, BLOCK_SIZE)) → tile((-1, -1)) → expand(...)
+```
+
+- 使用 `tile((1, DILATION))` 处理 stride 跳跃访问
+- cos/sin 用 `expand` 广播到 query 的维度
+- 多分支 kernel（多个 kernel 对象由同一个 make 产生）
+
+## 6. MaxPool2D（滑动窗口）
+
+```
+input: tile((1, BLOCK_SIZE, 1, 1)) → tile((1, -1, BLOCK_SIZE_H, BLOCK_SIZE_W))
+       → tile((1, -1, -1, -1)) → expand(-1, -1, -1, input.shape[3])
+```
+
+- 固定 batch=1，在 H 和 W 维度滑动
+- 外层 tile 为 1（保留），内层按窗口大小平铺
+
+## 7. Attention（Flash Attention online softmax）
+
+```
+query: tile((1, BLOCK_SIZE_M, BLOCK_SIZE_K))
+key:   tile((1, BLOCK_SIZE_K, BLOCK_SIZE_N))
+value: tile((1, BLOCK_SIZE_N, BLOCK_SIZE_K))
+output:tile((1, BLOCK_SIZE_M, BLOCK_SIZE_K))
+```
+
+- application 中使用 ntl.dot 计算 score
+- online softmax 维护 m_i, d_i 状态变量
+- loop over key/value blocks
+
+## 通用模式总结
+
+```
+单输入 1D:   Tensor(0 or 1) → tile((N,))
+双输入 1D:   Tensor(0 or 1) × 2 + Tensor(1) → tile((N,))
+行归约 2D:   Tensor(2) × 2 → tile((1, N))
+Matmul:      Tensor(2) × 3 → MM 分块布局
+BMM:         Tensor(3) × 3 → BMM 分块布局
+RoPE:        Tensor(2 or 3) × multi → stride-dilation
+Attention:   Tensor(3) × 4 → online softmax loop
+```
diff --git a/skills/competition/nineops-skills/references/failure_diagnosis.md b/skills/competition/nineops-skills/references/failure_diagnosis.md
new file mode 100644
index 0000000..34dcfa0
--- /dev/null
+++ b/skills/competition/nineops-skills/references/failure_diagnosis.md
@@ -0,0 +1,158 @@
+# 常见故障诊断指南
+
+## 问题分类
+
+### 1. Correctness 失败
+
+**现象**: kernel 输出与 PyTorch baseline 不 match
+
+**诊断步骤**:
+
+1. **检查 dtype** — ntl 中是否做了类型提升？
+   ```python
+   # 好的做法：显式 cast 到 fp32 计算
+   input_f32 = ntl.cast(input, ntl.float32)
+   ```
+
+2. **检查 tile 覆盖** — BLOCK_SIZE 能否整除总元素数？
+   ```
+   总元素数 % BLOCK_SIZE != 0 时 → 需要 mask 处理边界
+   ```
+
+3. **检查 shape 推导** — arrangement 中 shape 是否匹配？
+   ```python
+   # 打印 shape 调试
+   print("input shape:", input.shape)
+   print("output shape:", output.shape)
+   ```
+
+4. **检查 broadcast** — 广播维度是否正确 expand？
+   ```python
+   # 常见错误：直接 tile 而不是先 tile 再 expand
+   ```
+
+5. **检查 stride** — non-contiguous 的 ptr 偏移是否正确？
+
+### 2. Crash / CUDA Error
+
+**现象**: kernel 运行时报错（segfault, illegal memory access）
+
+**诊断步骤**:
+
+1. **检查 mask** — 边界 tile 是否有 mask？
+2. **检查 other** — mask 命中区间的填充值是否合理？
+   ```python
+   other=0  vs  other=float("-inf")  # softmax 需要 -inf
+   ```
+3. **缩小规模** — 用 (1,) 或 (2, 2) 最小 shape 先验证不 crash
+4. **检查 Tensor 声明** — `Tensor(1)` 是否正确反映维度数？
+
+### 3. Performance 不达标
+
+**现象**: kernel 比 PyTorch baseline 慢
+
+**诊断步骤**:
+
+1. **检查 BLOCK_SIZE** — 是否过小或过大？
+   - 过小 → launch overhead 高
+   - 过大 → 占用率高，warp 利用率低
+2. **检查 load/store 模式** — 是否为合并访问？
+3. **检查 autotune** — 是否使用 `block_size()` meta 符号？
+4. **检查 generated source** — 是否有多余的循环或同步？
+
+### 4. Compile 失败
+
+**现象**: `make()` 或 `ninetoothed.make` 报错
+
+**诊断步骤**:
+
+1. **检查符号传递** — 所有 Symbol 是否都在 arrangement 和 application 中正确接收？
+2. **检查 Tensor 数量** — tensors 元组长度是否匹配 arrangement/application 参数？
+3. **检查 layout 展开** — 是否存在不支持的 tile 组合？
+   ```python
+   # 用 debug 模式查看展开过程
+   import ninetoothed; ninetoothed.set_debug(True)
+   ```
+
+## 故障排查速查表
+
+| 症状 | 最可能原因 | 检查点 |
+|------|-----------|--------|
+| 数值不匹配 | dtype/精度问题 | ntl.cast, atol/rtol |
+| 部分元素错误 | 边界 tile 未处理 | mask, other |
+| 全部为 0 | store 未执行 | output 赋值 |
+| Crash | 越界访问 | mask, BLOCK_SIZE |
+| 慢 | BLOCK_SIZE 不当 | autotune |
+| Compile fail | 布局不可展开 | Tensor 声明 |
+
+## 5. AST 跟踪错误（Nineteethed DSL 特有）
+
+**现象**: kernel 编译成功但运行时 NameError / AttributeError
+
+**诊断步骤**:
+
+1. **检查 application 中的 Python 模块引用** — `math.*` 是否出现在 application() 函数内？
+   ```python
+   # 错误示例（在 application 内）
+   def application(input, output):
+       t = ntl.sqrt(2.0 / math.pi) * input  # ❌ NameError: name 'math' is not defined
+   ```
+
+2. **检查 application 中的模块级变量** — 变量名被 AST 跟踪原样嵌入？
+   ```python
+   _SQRT_2_OVER_PI = 0.7978845608028654
+   def application(input, output):
+       t = _SQRT_2_OVER_PI * input  # ❌ NameError: name '_SQRT_2_OVER_PI' is not defined
+   ```
+
+3. **检查 `**` 运算符** — Triton tensor 不支持 `__pow__`
+   ```python
+   # ❌ 错误
+   t = input ** 3  # AttributeError: 'tensor' object has no attribute '__pow__'
+   # ✅ 正确
+   t = input * input * input
+   ```
+
+4. **检查 `ntl.tanh` 等不存在的 API** — 手动用 `ntl.exp` 组合
+   ```python
+   # ❌ 不存在
+   output = ntl.tanh(t)
+   # ✅ 手动实现
+   exp_t = ntl.exp(t); exp_neg_t = ntl.exp(-t)
+   output = (exp_t - exp_neg_t) / (exp_t + exp_neg_t)
+   ```
+
+**根因**: Nineteethed 的 AST 跟踪机制将 application() 内的 Python 代码转换为 Triton 代码，
+但 Triton 编译环境中不存在标准 Python 库（math, torch, numpy）和 Python 模块级变量。
+
+**解决方案**: `application()` 中只使用：`ntl.*` 函数、字面量数值、四则运算符。
+
+## 6. 非连续张量 Correctness 失败
+
+**现象**: kernel 对 contiguous 张量正确，但对 `.T` 或 `.t()` 转置张量数值错误
+
+**根因**: arrangement 中使用了 `flatten()`，破坏了原始 strides，导致 Triton 按连续 strides
+计算 ptr 偏移，在非连续张量上读写错位。
+
+**诊断**: 检查 arrangement 是否调用了 `.flatten()`。
+
+**修复**: 改用 preserve-ndim tile 模式：
+```python
+# ❌ 错误：flatten 破坏 strides
+def arrangement(input, output, BLOCK_SIZE=BLOCK_SIZE):
+    return input.flatten().tile((BLOCK_SIZE,)), output.flatten().tile((BLOCK_SIZE,))
+
+# ✅ 正确：保留 ndim，只 tile 最后一维
+def arrangement(input, output, BLOCK_SIZE=BLOCK_SIZE):
+    tile_shape = (1,) * (input.ndim - 1) + (BLOCK_SIZE,)
+    return input.tile(tile_shape), output.tile(tile_shape)
+```
+
+同时确保 `Tensor(ndim)` 的 ndim 与实际传入张量的维度一致。
+
+## 日志收集命令
+
+```bash
+# 收集所有调试信息
+python .skill/scripts/collect_task_log.py --output diagnose_log/
+```
diff --git a/skills/competition/nineops-skills/references/generated_source_and_aot.md b/skills/competition/nineops-skills/references/generated_source_and_aot.md
new file mode 100644
index 0000000..a25ba69
--- /dev/null
+++ b/skills/competition/nineops-skills/references/generated_source_and_aot.md
@@ -0,0 +1,95 @@
+# Generated Source 检查与 AOT Build 诊断
+
+## 为什么要检查 generated source？
+
+Ninetoothed 将 DSL 定义编译为 Triton 代码。检查生成的 Triton source 可以：
+
+1. 验证 arrangement 是否正确展开为预期的 tile/block 模式
+2. 发现 load/store 模式的低效问题（如非合并访问）
+3. 检查 AOT build 是否有 fallback 路径
+4. 确认 autotune 参数是否被正确注入
+
+## 查看 generated source
+
+```python
+import ninetoothed
+from ninetoothed.codegen.source import get_generated_source
+
+# 假设 kernel 已创建
+source = get_generated_source(kernel, ...)
+print(source)
+```
+
+或在命令行：
+
+```bash
+# 使用 inspect 脚本
+bash .skill/scripts/inspect_generated_source.sh
+```
+
+## 常见检查点
+
+### 1. Load 指令
+
+检查点：
+- 是否使用 `mask` 处理边界？
+- ptr 计算是否按 stride/offset 正确推导？
+- 是否有非合并访问（stride > 1 的维度的连续 load）？
+
+✅ 好的 load：
+
+```python
+tl.load(input_ptrs + offsets, mask=mask, other=0.0)
+```
+
+❌ 有问题的 load：
+
+```python
+# 缺少 mask，可能导致越界
+tl.load(input_ptrs + offsets)
+```
+
+### 2. Store 指令
+
+检查点：
+- 是否有 output 的 store 操作？
+- mask 是否正确？
+
+### 3. Tile/Block 循环
+
+检查点：
+- 外层循环是否遍历了正确的维度？
+- 内层 dot 累加是否在 K 维度循环？
+
+### 4. AOT build 日志
+
+```bash
+# 查看 AOT compilation 输出
+python -c "
+import ninetoothed
+ninetoothed.make(..., aot=True)
+" 2>&1 | tee aot_build.log
+```
+
+常见 AOT build 问题：
+
+| 问题 | 表现 | 解决方案 |
+|------|------|----------|
+| Triton 编译失败 | `CUDA error: invalid configuration` | 调整 BLOCK_SIZE |
+| 内存越界 | `tl.load` 访问超出分配范围 | 检查 mask / other 填充值 |
+| dtype 不匹配 | load/store 类型不一致 | 显式 cast |
+| Autotune 超时 | 搜索空间过大 | 缩小 block_size 范围 |
+
+## 5. Fallback 诊断
+
+当 detected fallback 时：
+
+```bash
+# 检查 fallback 触发条件
+NINETOOTHED_DEBUG=1 python test_script.py
+```
+
+常见 fallback：
+- Triton 不支持的操作 → 回退到 PyTorch
+- 未安装 CUDA → 回退到 CPU
+- 布局无法展开 → 回退到 element-wise 逐元素
diff --git a/skills/competition/nineops-skills/references/repo_index.md b/skills/competition/nineops-skills/references/repo_index.md
new file mode 100644
index 0000000..e955a96
--- /dev/null
+++ b/skills/competition/nineops-skills/references/repo_index.md
@@ -0,0 +1,72 @@
+# Ninetoothed 仓库代码检索索引
+
+## 核心仓库结构
+
+```
+ninetoothed/
+├── ninetoothed/
+│   ├── __init__.py          # 公开 API：make, Symbol, Tensor, block_size
+│   ├── language/
+│   │   ├── __init__.py      # ntl 语言 API 导出
+│   │   ├── core.py          # 核心操作 (load, store, cast, etc.)
+│   │   └── math.py          # 数学操作 (sigmoid, tanh, exp, etc.)
+│   ├── ir/
+│   │   ├── __init__.py
+│   │   ├── symbol.py        # Symbol 实现
+│   │   ├── tensor.py        # Tensor 声明与 shape 推导
+│   │   ├── arrangement.py   # Arrangement (tile/expand/squeeze)
+│   │   └── application.py   # Application 计算定义
+│   ├── codegen/
+│   │   ├── __init__.py
+│   │   ├── triton.py        # Triton codegen 入口
+│   │   └── source.py        # 源码生成与检查
+│   ├── autotune/
+│   │   └── __init__.py      # 自动调优 (block_size meta 符号)
+│   └── testing/
+│       └── __init__.py      # 测试辅助
+```
+
+## 关键 API 定位
+
+| 符号/API | 文件 | 行号 | 说明 |
+|----------|------|------|------|
+| `Symbol` | `ir/symbol.py` | — | 编译时常量符号 |
+| `block_size` | `autotune/__init__.py` | — | 可 autotune 的 meta 符号 |
+| `Tensor(n)` | `ir/tensor.py` | — | n 维张量声明 |
+| `make()` | `__init__.py` | — | 构建 kernel 主入口 |
+| `ntl.load` | `language/core.py` | — | 从指针加载数据 |
+| `ntl.store` | `language/core.py` | — | 将数据写入指针 |
+| `ntl.cast` | `language/core.py` | — | 类型转换 |
+| `ntl.sigmoid` | `language/math.py` | — | sigmoid 激活 |
+| `ntl.zeros` | `language/core.py` | — | 零初始化 |
+| `tile()` | `ir/arrangement.py` | — | 数据分块 |
+| `expand()` | `ir/arrangement.py` | — | 维度广播扩展 |
+| `squeeze()` | `ir/arrangement.py` | — | 移除单维度 |
+
+## Dot / Matmul 相关
+
+| API | 文件 | 说明 |
+|-----|------|------|
+| `ntl.dot` | `language/core.py` | 矩阵乘法片段 |
+| `ntl.softmax` | `language/math.py` | online softmax 原语 |
+
+## 示例算子索引
+
+| 算子 | 路径 | DSL 模式 |
+|------|------|----------|
+| RoPE | `ops/ninetoothed/kernels/rotary_position_embedding.py` | stride-dilation + 多分支 kernel |
+| Scaled Dot-Product Attn | `ops/ninetoothed/kernels/scaled_dot_product_attention.py` | Flash Attention online softmax |
+| 通用 ops | `ops/ninetoothed/torch.py` | Torch 包装层 |
+
+## 搜索技巧
+
+```bash
+# 查找 ntl 所有导出函数
+grep -rn "def " ninetoothed/language/ --include="*.py"
+
+# 查找 make() 的使用
+grep -rn "ninetoothed.make" examples/ --include="*.py"
+
+# 查找 Symbol 的使用
+grep -rn "Symbol(" examples/ --include="*.py"
+```
diff --git a/skills/competition/nineops-skills/references/testing_patterns.md b/skills/competition/nineops-skills/references/testing_patterns.md
new file mode 100644
index 0000000..750ac85
--- /dev/null
+++ b/skills/competition/nineops-skills/references/testing_patterns.md
@@ -0,0 +1,118 @@
+# Correctness Test 编写规范
+
+## 测试覆盖维度
+
+每个 correctness test **必须**覆盖以下维度：
+
+### 1. dtype
+
+| 测试 | 说明 |
+|------|------|
+| float16 (fp16) | 半精度基础测试 |
+| float32 (fp32) | 单精度基础测试 |
+| bfloat16 (bf16) | BF16 精度测试 |
+
+> 如果算子不支持某些 dtype，在 test plan 中明确标注。
+
+### 2. shape
+
+| 测试 | 说明 |
+|------|------|
+| 最小 shape | 如 (1,), (1, 1) |
+| 典型 shape | 如 (1024,), (256, 768) |
+| 大 shape | 如 (131072,) 测试稳定性 |
+| 非均匀 shape | 如 (3, 7, 11) 质数维度 |
+
+### 3. broadcast
+
+| 测试 | 说明 |
+|------|------|
+| scalar broadcast | `x + 2.0` |
+| vector broadcast | `x + y` where y shape=(1,) |
+| matrix broadcast | `x + y` where y shape=(1, N) |
+| 3D broadcast | `x + y` where y shape=(1, 1, N) |
+| 全广播（不同 ndim） | `x.shape=(3,1,5), y.shape=(1,4,1)` |
+
+### 4. stride / contiguity
+
+| 测试 | 说明 |
+|------|------|
+| contiguous | 标准连续布局 |
+| transposed | `.T` 转置 |
+| sliced | `x[::2]` 等间距切片 |
+| view | `.view()` 变形 |
+| expanded | `.expand()` 扩展 |
+| permuted | `.permute()` 重排 |
+| non-contiguous | `torch.empty(N, M).t()` |
+
+## ndim 参数化测试
+
+对支持任意维度输入（ndim）的算子，使用参数化测试：
+
+```python
+@pytest.mark.parametrize("ndim", [1, 2, 3])
+def test_with_different_ndim(ndim):
+    """测试不同维度的 element-wise 算子"""
+    if ndim == 1:
+        x = torch.randn(4096, device="cuda")
+    elif ndim == 2:
+        x = torch.randn(64, 64, device="cuda")
+    elif ndim == 3:
+        x = torch.randn(16, 16, 16, device="cuda")
+
+    kernel = make_relu(ndim=ndim)
+    out = torch.empty_like(x)
+    kernel(x, out, BLOCK_SIZE=1024)
+    expected = torch.relu(x)
+    assert torch.allclose(out, expected, atol=1e-5, rtol=1e-5)
+```
+
+## 非连续张量测试
+
+对 element-wise 算子，必须测试非连续张量场景：
+
+```python
+def test_non_contiguous():
+    """测试 2D 转置（非连续）张量"""
+    x = torch.randn(128, 256, device="cuda").t()  # shape (256, 128)，非连续
+    out = torch.empty_like(x)
+    kernel = make_relu(ndim=2)  # 使用 ndim=2 支持非连续
+    kernel(x, out, BLOCK_SIZE=256)
+    expected = torch.relu(x)
+    assert torch.allclose(out, expected, atol=1e-5, rtol=1e-5), "非连续张量测试失败"
+```
+
+## 测试断言标准
+
+```python
+# 绝对误差容限
+atol = 1e-3 if dtype == torch.float16 else 1e-5
+rtol = 1e-3 if dtype == torch.float16 else 1e-5
+torch.allclose(output, expected, atol=atol, rtol=rtol)
+```
+
+## 测试文件组织
+
+```
+tests/
+├── test_broadcast_add.py       # Task 1
+├── test_softmax.py             # Task 2
+├── test_non_contiguous.py      # Task 3
+├── test_benchmark.py           # Task 4
+└── conftest.py                 # 共享 fixture 和辅助函数
+```
+
+## conftest.py 推荐内容
+
+```python
+import torch
+import pytest
+
+@pytest.fixture
+def dtype_fixture(request):
+    return request.param if hasattr(request, 'param') else torch.float32
+
+@pytest.fixture
+def device():
+    return torch.device("cuda" if torch.cuda.is_available() else "cpu")
+```
diff --git a/skills/competition/nineops-skills/scripts/collect_task_log.py b/skills/competition/nineops-skills/scripts/collect_task_log.py
new file mode 100644
index 0000000..8d80108
--- /dev/null
+++ b/skills/competition/nineops-skills/scripts/collect_task_log.py
@@ -0,0 +1,174 @@
+#!/usr/bin/env python3
+"""
+collect_task_log.py — 收集测试和 benchmark 日志，统一归档。
+
+用法:
+    python collect_task_log.py --output ./diagnose/
+    python collect_task_log.py --task broadcast_add --format json
+
+功能:
+    1. 运行 correctness test，收集输出
+    2. 运行 benchmark，收集结果
+    3. 检查 generated source 并保存
+    4. 汇总成诊断报告
+"""
+
+import argparse
+import json
+import os
+import subprocess
+import sys
+from datetime import datetime
+
+
+def run_command(cmd: list, timeout: int = 120) -> dict:
+    """运行命令并返回输出。"""
+    try:
+        result = subprocess.run(
+            cmd,
+            capture_output=True,
+            text=True,
+            timeout=timeout,
+        )
+        return {
+            "stdout": result.stdout,
+            "stderr": result.stderr,
+            "returncode": result.returncode,
+            "success": result.returncode == 0,
+        }
+    except subprocess.TimeoutExpired:
+        return {"stdout": "", "stderr": "TIMEOUT", "returncode": -1, "success": False}
+    except FileNotFoundError:
+        return {"stdout": "", "stderr": "Command not found", "returncode": -1, "success": False}
+
+
+def collect_task(task_name: str, script_path: str, output_dir: str):
+    """收集单个任务的日志。"""
+    print(f"收集: {task_name}")
+    result = run_command(["python", script_path])
+
+    log_dir = os.path.join(output_dir, task_name)
+    os.makedirs(log_dir, exist_ok=True)
+
+    # 保存 stdout
+    with open(os.path.join(log_dir, "stdout.log"), "w", encoding="utf-8") as f:
+        f.write(result["stdout"])
+
+    # 保存 stderr
+    with open(os.path.join(log_dir, "stderr.log"), "w", encoding="utf-8") as f:
+        f.write(result["stderr"])
+
+    # 保存 summary
+    summary = {
+        "task": task_name,
+        "timestamp": datetime.now().isoformat(),
+        "success": result["success"],
+        "returncode": result["returncode"],
+        "stdout_lines": len(result["stdout"].splitlines()),
+        "stderr_lines": len(result["stderr"].splitlines()),
+    }
+    with open(os.path.join(log_dir, "summary.json"), "w", encoding="utf-8") as f:
+        json.dump(summary, f, indent=2, ensure_ascii=False)
+
+    if result["success"]:
+        print(f"  ✅ 成功 ({summary['stdout_lines']} 行输出)")
+    else:
+        print(f"  ⚠️  失败 (exit={result['returncode']})")
+        if result["stderr"][:500]:
+            print(f"  stderr: {result['stderr'][:500]}")
+
+    return summary
+
+
+def generate_report(output_dir: str, summaries: list):
+    """生成汇总诊断报告。"""
+    lines = [
+        "# 任务日志诊断报告",
+        "",
+        f"生成时间: {datetime.now().isoformat()}",
+        f"工作目录: {os.getcwd()}",
+        "",
+        "## 摘要",
+        "",
+        "| 任务 | 状态 | 输出行数 |",
+        "|------|------|----------|",
+    ]
+
+    success_count = 0
+    for s in summaries:
+        status = "✅" if s["success"] else "❌"
+        if s["success"]:
+            success_count += 1
+        lines.append(f"| {s['task']} | {status} | {s['stdout_lines']} |")
+
+    lines.extend([
+        "",
+        f"总计: {len(summaries)} 个任务, {success_count} 个成功, {len(summaries) - success_count} 个失败",
+        "",
+    ])
+
+    with open(os.path.join(output_dir, "report.md"), "w", encoding="utf-8") as f:
+        f.write("\n".join(lines))
+
+    print(f"\n诊断报告: {os.path.join(output_dir, 'report.md')}")
+
+
+def main():
+    parser = argparse.ArgumentParser(description="收集并归档 task 日志")
+    parser.add_argument(
+        "--output",
+        default="diagnose_log",
+        help="输出目录",
+    )
+    parser.add_argument(
+        "--task",
+        default=None,
+        help="只收集特定 task (broadcast_add, softmax 等)",
+    )
+    parser.add_argument(
+        "--format",
+        choices=["text", "json"],
+        default="text",
+        help="输出格式",
+    )
+    args = parser.parse_args()
+
+    skill_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+    examples_dir = os.path.join(skill_dir, "examples")
+    output_dir = os.path.abspath(args.output)
+
+    os.makedirs(output_dir, exist_ok=True)
+
+    # 定义要收集的任务
+    all_tasks = [
+        ("broadcast_add", os.path.join(examples_dir, "elementwise_broadcast_add", "run.py")),
+        ("softmax", os.path.join(examples_dir, "reduction_softmax", "run.py")),
+        ("non_contiguous", os.path.join(examples_dir, "non_contiguous_stride_case", "run.py")),
+        ("performance_regression", os.path.join(examples_dir, "performance_regression_case", "run.py")),
+    ]
+
+    summaries = []
+
+    for task_name, script_path in all_tasks:
+        if args.task and args.task not in task_name:
+            continue
+        if os.path.exists(script_path):
+            summary = collect_task(task_name, script_path, output_dir)
+            summaries.append(summary)
+        else:
+            print(f"跳过: {task_name} ({script_path} 不存在)")
+
+    if summaries:
+        generate_report(output_dir, summaries)
+
+    if args.format == "json":
+        json_path = os.path.join(output_dir, "summary_all.json")
+        with open(json_path, "w", encoding="utf-8") as f:
+            json.dump(summaries, f, indent=2, ensure_ascii=False)
+        print(f"JSON 汇总: {json_path}")
+
+    print(f"\n所有日志已保存到: {output_dir}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/skills/competition/nineops-skills/scripts/inspect_generated_source.sh b/skills/competition/nineops-skills/scripts/inspect_generated_source.sh
new file mode 100644
index 0000000..33016d6
--- /dev/null
+++ b/skills/competition/nineops-skills/scripts/inspect_generated_source.sh
@@ -0,0 +1,107 @@
+#!/bin/bash
+# inspect_generated_source.sh — 查看 ninetoothed 生成的 Triton source
+#
+# 用法:
+#   bash scripts/inspect_generated_source.sh              # 交互选择
+#   bash scripts/inspect_generated_source.sh broadcast_add # 直接指定
+#   bash scripts/inspect_generated_source.sh --save output.txt  # 保存到文件
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+SKILL_DIR="$(dirname "$SCRIPT_DIR")"
+EXAMPLES_DIR="$SKILL_DIR/examples"
+
+SAVE_FILE=""
+TARGET=""
+
+for arg in "$@"; do
+    case $arg in
+        --save)
+            shift
+            SAVE_FILE="$1"
+            shift
+            ;;
+        *)
+            TARGET="$arg"
+            break
+            ;;
+    esac
+done
+
+echo "=========================================="
+echo "  Generated Source Inspector"
+echo "=========================================="
+echo ""
+
+inspect_source() {
+    local label="$1"
+    local script="$2"
+
+    echo "--- $label ---"
+
+    if [ ! -f "$script" ]; then
+        echo "⚠️  跳过: $script 不存在"
+        return
+    fi
+
+    # 运行脚本并添加环境变量来启用 debug 输出
+    local output
+    output=$(NINETOOTHED_DEBUG=1 python "$script" 2>&1 || true)
+
+    # 尝试提取 generated source 部分
+    local source
+    source=$(echo "$output" | awk '/Generated Source:/,/^$/')
+
+    if [ -z "$source" ]; then
+        source=$(echo "$output" | awk '/TRITON_KERNEL/,/^END_KERNEL/')
+    fi
+
+    if [ -z "$source" ]; then
+        source="$output"  # fallback: 显示全部
+    fi
+
+    echo "$source"
+    echo ""
+
+    if [ -n "$SAVE_FILE" ]; then
+        {
+            echo "=== $label ==="
+            echo "$source"
+            echo ""
+        } >> "$SAVE_FILE"
+    fi
+}
+
+if [ -n "$TARGET" ]; then
+    case "$TARGET" in
+        broadcast_add|add|elementwise)
+            inspect_source "Broadcast Add" "$EXAMPLES_DIR/elementwise_broadcast_add/run.py"
+            ;;
+        softmax)
+            inspect_source "Softmax" "$EXAMPLES_DIR/reduction_softmax/run.py"
+            ;;
+        non_contiguous)
+            inspect_source "Non-contiguous Add" "$EXAMPLES_DIR/non_contiguous_stride_case/run.py"
+            ;;
+        regression)
+            inspect_source "Performance Regression" "$EXAMPLES_DIR/performance_regression_case/run.py"
+            ;;
+        *)
+            echo "错误: 未知 target '$TARGET'"
+            echo "可用: broadcast_add, softmax, non_contiguous, regression"
+            exit 1
+            ;;
+    esac
+else
+    inspect_source "Broadcast Add" "$EXAMPLES_DIR/elementwise_broadcast_add/run.py"
+    inspect_source "Softmax" "$EXAMPLES_DIR/reduction_softmax/run.py"
+    inspect_source "Non-contiguous Add" "$EXAMPLES_DIR/non_contiguous_stride_case/run.py"
+    inspect_source "Performance Regression" "$EXAMPLES_DIR/performance_regression_case/run.py"
+fi
+
+echo "---"
+if [ -n "$SAVE_FILE" ]; then
+    echo "已保存到: $SAVE_FILE"
+fi
+echo "Inspect Done."
diff --git a/skills/competition/nineops-skills/scripts/run_benchmark.sh b/skills/competition/nineops-skills/scripts/run_benchmark.sh
new file mode 100644
index 0000000..57116a4
--- /dev/null
+++ b/skills/competition/nineops-skills/scripts/run_benchmark.sh
@@ -0,0 +1,120 @@
+#!/bin/bash
+# run_benchmark.sh — 运行 benchmark 套件
+#
+# 用法:
+#   bash scripts/run_benchmark.sh                          # 运行所有 benchmark
+#   bash scripts/run_benchmark.sh softmax                  # 运行特定 benchmark
+#   bash scripts/run_benchmark.sh --output results.md      # 输出到文件
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+SKILL_DIR="$(dirname "$SCRIPT_DIR")"
+EXAMPLES_DIR="$SKILL_DIR/examples"
+
+# 解析参数
+OUTPUT_FILE=""
+VERBOSE=""
+TARGET=""
+
+for arg in "$@"; do
+    case $arg in
+        --output)
+            shift
+            OUTPUT_FILE="$1"
+            shift
+            ;;
+        --verbose)
+            VERBOSE="--verbose"
+            shift
+            ;;
+        *)
+            TARGET="$arg"
+            break
+            ;;
+    esac
+done
+
+echo "=========================================="
+echo "  Ninetoothed Benchmark Runner"
+echo "=========================================="
+echo "Output: ${OUTPUT_FILE:-stdout}"
+echo ""
+
+benchmark_results=""
+
+run_bench() {
+    local label="$1"
+    local script="$2"
+
+    echo ""
+    echo "--- Benchmark: $label ---"
+
+    if [ -f "$script" ]; then
+        local output
+        output=$(python "$script" 2>&1)
+        echo "$output"
+
+        if [ -n "$OUTPUT_FILE" ]; then
+            benchmark_results+="
+## $label
+
+\`\`\`
+$output
+\`\`\`
+"
+        fi
+    else
+        echo "⚠️  跳过: $script 不存在"
+    fi
+}
+
+if [ -n "$TARGET" ]; then
+    # 运行特定案例
+    case "$TARGET" in
+        elementwise|broadcast_add|add)
+            run_bench "Element-wise Broadcast Add" "$EXAMPLES_DIR/elementwise_broadcast_add/benchmark.py"
+            ;;
+        softmax|reduction)
+            run_bench "Reduction Softmax" "$EXAMPLES_DIR/reduction_softmax/benchmark.py"
+            ;;
+        non_contiguous)
+            run_bench "Non-contiguous Stride Case" "$EXAMPLES_DIR/non_contiguous_stride_case/benchmark.py"
+            ;;
+        regression)
+            run_bench "Performance Regression Case" "$EXAMPLES_DIR/performance_regression_case/benchmark.py"
+            ;;
+        *)
+            echo "错误: 未知 target '$TARGET'"
+            echo "可用: elementwise, softmax, non_contiguous, regression"
+            exit 1
+            ;;
+    esac
+else
+    # 运行所有
+    run_bench "Element-wise Broadcast Add" "$EXAMPLES_DIR/elementwise_broadcast_add/benchmark.py"
+    run_bench "Reduction Softmax" "$EXAMPLES_DIR/reduction_softmax/benchmark.py"
+    run_bench "Non-contiguous Stride Case" "$EXAMPLES_DIR/non_contiguous_stride_case/benchmark.py"
+    run_bench "Performance Regression Case" "$EXAMPLES_DIR/performance_regression_case/benchmark.py"
+fi
+
+# 写入输出文件
+if [ -n "$OUTPUT_FILE" ]; then
+    {
+        echo "# Benchmark Results"
+        echo ""
+        echo "Date: $(date)"
+        echo "Host: $(hostname 2>/dev/null || echo 'unknown')"
+        echo "GPU: $(python -c 'import torch; print(torch.cuda.get_device_name(0) if torch.cuda.is_available() else \"CPU\")' 2>/dev/null || echo 'unknown')"
+        echo ""
+        echo "$benchmark_results"
+    } > "$OUTPUT_FILE"
+
+    echo ""
+    echo "结果已保存到: $OUTPUT_FILE"
+fi
+
+echo ""
+echo "=========================================="
+echo "  Benchmark Done."
+echo "=========================================="
diff --git a/skills/competition/nineops-skills/scripts/run_correctness.py b/skills/competition/nineops-skills/scripts/run_correctness.py
new file mode 100644
index 0000000..6ca4bef
--- /dev/null
+++ b/skills/competition/nineops-skills/scripts/run_correctness.py
@@ -0,0 +1,222 @@
+#!/usr/bin/env python3
+"""
+run_correctness.py — 跨平台正确性测试运行器（Python 实现）
+
+替代 run_correctness.sh，在 Windows/Linux/Mac 上均可运行。
+
+用法:
+    python scripts/run_correctness.py                    # 运行所有测试
+    python scripts/run_correctness.py softmax            # 运行 examples 中指定算子的测试
+    python scripts/run_correctness.py --verbose          # 详细输出
+    python scripts/run_correctness.py --file path/to/test.py  # 运行指定文件
+"""
+
+import argparse
+import os
+import subprocess
+import sys
+import time
+
+
+def get_skill_dir():
+    """获取 skill 包根目录。"""
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+    return os.path.dirname(script_dir)
+
+
+def discover_example_tests(skill_dir):
+    """扫描 examples/ 下所有 run.py 和 benchmark.py。"""
+    examples_dir = os.path.join(skill_dir, "examples")
+    tests = []
+    if os.path.isdir(examples_dir):
+        for name in os.listdir(examples_dir):
+            example_dir = os.path.join(examples_dir, name)
+            if os.path.isdir(example_dir):
+                for fname in ("run.py", "benchmark.py"):
+                    fpath = os.path.join(example_dir, fname)
+                    if os.path.isfile(fpath):
+                        tests.append((f"{name}/{fname}", fpath))
+    return tests
+
+
+def discover_tests_dir_tests(skill_dir):
+    """扫描 tests/ 目录下的 .py 文件。"""
+    tests_dir = os.path.join(skill_dir, "tests")
+    tests = []
+    if os.path.isdir(tests_dir):
+        for fname in sorted(os.listdir(tests_dir)):
+            if fname.endswith(".py"):
+                fpath = os.path.join(tests_dir, fname)
+                tests.append((f"tests/{fname}", fpath))
+    return tests
+
+
+def run_test(label, fpath, verbose=False):
+    """运行单个测试文件并返回结果。"""
+    if not os.path.isfile(fpath):
+        return {"label": label, "success": False, "reason": "文件不存在"}
+
+    start = time.time()
+    try:
+        result = subprocess.run(
+            [sys.executable, fpath],
+            capture_output=True,
+            text=True,
+            timeout=300,
+        )
+        elapsed = time.time() - start
+        success = result.returncode == 0
+
+        output = result.stdout
+        if result.stderr:
+            output += "\n--- stderr ---\n" + result.stderr
+
+        return {
+            "label": label,
+            "fpath": fpath,
+            "success": success,
+            "returncode": result.returncode,
+            "output": output,
+            "elapsed": elapsed,
+            "reason": None,
+        }
+    except subprocess.TimeoutExpired:
+        elapsed = time.time() - start
+        return {
+            "label": label,
+            "fpath": fpath,
+            "success": False,
+            "returncode": -1,
+            "output": "",
+            "elapsed": elapsed,
+            "reason": "TIMEOUT (>300s)",
+        }
+    except Exception as e:
+        return {
+            "label": label,
+            "fpath": fpath,
+            "success": False,
+            "returncode": -1,
+            "output": str(e),
+            "elapsed": 0,
+            "reason": f"EXCEPTION: {e}",
+        }
+
+
+def print_result(result, verbose=False):
+    """格式化输出测试结果。"""
+    status = "✅" if result["success"] else "❌"
+    elapsed_s = f"({result['elapsed']:.1f}s)" if result["elapsed"] else ""
+    print(f"  {status} {result['label']} {elapsed_s}")
+
+    if not result["success"]:
+        if result["reason"]:
+            print(f"     原因: {result['reason']}")
+        # 提取关键错误信息
+        output = result.get("output", "")
+        # 只显示最后几行错误
+        lines = output.splitlines()
+        error_lines = [l for l in lines if any(
+            kw in l.lower() for kw in ["error", "traceback", "fail", "assert"])
+        ]
+        if error_lines:
+            for l in error_lines[-5:]:
+                print(f"     {l}")
+        elif verbose and lines:
+            for l in lines[-10:]:
+                print(f"     {l}")
+
+    if verbose and result["success"]:
+        lines = result.get("output", "").splitlines()
+        for l in lines[-5:]:
+            if l.strip():
+                print(f"     {l}")
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Ninetoothed Correctness Test Runner (Cross-platform)",
+    )
+    parser.add_argument("target", nargs="?", default=None,
+                        help="要运行的测试名（如 softmax, elementwise_broadcast_add）")
+    parser.add_argument("--verbose", "-v", action="store_true",
+                        help="详细输出")
+    parser.add_argument("--file", "-f", type=str, default=None,
+                        help="直接运行指定文件")
+    args = parser.parse_args()
+
+    skill_dir = get_skill_dir()
+
+    # 收集测试列表
+    all_tests = []
+    all_tests.extend(discover_example_tests(skill_dir))
+    all_tests.extend(discover_tests_dir_tests(skill_dir))
+
+    if not all_tests:
+        print("⚠️  未找到任何测试文件。")
+        sys.exit(0)
+
+    # 如果指定了 --file，只运行该文件
+    if args.file:
+        test_path = args.file
+        if not os.path.isabs(test_path):
+            test_path = os.path.join(skill_dir, test_path)
+        label = os.path.relpath(test_path, skill_dir)
+        all_tests = [(label, test_path)]
+
+    # 如果指定了 target 且不是 --file，匹配名称
+    if args.target and not args.file:
+        target_lower = args.target.lower()
+        matched = []
+        for label, fpath in all_tests:
+            if target_lower in label.lower():
+                matched.append((label, fpath))
+        if not matched:
+            print(f"⚠️  未找到匹配 '{args.target}' 的测试。可用测试:")
+            for label, _ in all_tests:
+                print(f"    - {label}")
+            sys.exit(1)
+        all_tests = matched
+
+    # 运行测试
+    print("=" * 56)
+    print("  Ninetoothed Correctness Test Runner")
+    print("=" * 56)
+    print(f"  Skill 目录: {skill_dir}")
+    print(f"  测试数量:   {len(all_tests)}")
+    if args.verbose:
+        for label, fpath in all_tests:
+            print(f"    - {label} ({fpath})")
+    print()
+
+    results = []
+    for label, fpath in all_tests:
+        print(f"  ▶ 正在运行: {label}")
+        result = run_test(label, fpath, verbose=args.verbose)
+        print_result(result, verbose=args.verbose)
+        results.append(result)
+        print()
+
+    # 汇总
+    passed = sum(1 for r in results if r["success"])
+    failed = sum(1 for r in results if not r["success"])
+    total = len(results)
+
+    print("=" * 56)
+    print(f"  结果: ✅ {passed} 通过 | ❌ {failed} 失败 | 共 {total}")
+    print("=" * 56)
+
+    if failed > 0:
+        print("\n失败项:")
+        for r in results:
+            if not r["success"]:
+                print(f"  ❌ {r['label']}")
+        print("\n💡 提示: 打开 references/failure_diagnosis.md 查找对应错误的修复方法。")
+        sys.exit(1)
+    else:
+        print("\n🎉 所有测试全部通过！")
+        sys.exit(0)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/skills/competition/nineops-skills/scripts/run_correctness.sh b/skills/competition/nineops-skills/scripts/run_correctness.sh
new file mode 100644
index 0000000..3b72d3d
--- /dev/null
+++ b/skills/competition/nineops-skills/scripts/run_correctness.sh
@@ -0,0 +1,92 @@
+#!/bin/bash
+# run_correctness.sh — 运行 correctness 测试套件
+#
+# 用法:
+#   bash scripts/run_correctness.sh                    # 运行所有 correctness 测试
+#   bash scripts/run_correctness.sh test_broadcast_add # 运行特定测试
+#   bash scripts/run_correctness.sh --verbose          # 详细输出
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+SKILL_DIR="$(dirname "$SCRIPT_DIR")"
+TESTS_DIR="$SKILL_DIR/tests"
+
+# 解析参数
+VERBOSE=""
+TARGET=""
+
+for arg in "$@"; do
+    case $arg in
+        --verbose)
+            VERBOSE="-v"
+            shift
+            ;;
+        *)
+            TARGET="$arg"
+            break
+            ;;
+    esac
+done
+
+echo "=========================================="
+echo "  Ninetoothed Correctness Test Runner"
+echo "=========================================="
+echo "SKILL_DIR:  $SKILL_DIR"
+echo "TARGET:     ${TARGET:-all}"
+echo ""
+
+# 如果指定了具体测试，尝试多种模式
+if [ -n "$TARGET" ]; then
+    if [ -f "$TESTS_DIR/${TARGET}.py" ]; then
+        echo "运行 ${TARGET}.py..."
+        python "$TESTS_DIR/${TARGET}.py"
+    elif [ -f "$TESTS_DIR/test_${TARGET}.py" ]; then
+        echo "运行 test_${TARGET}.py..."
+        python "$TESTS_DIR/test_${TARGET}.py"
+    elif [ -f "${TARGET}" ]; then
+        echo "运行 ${TARGET}..."
+        python "${TARGET}"
+    else
+        echo "错误: 找不到测试 '$TARGET'"
+        echo ""
+        echo "可用测试:"
+        for f in "$TESTS_DIR"/*.py; do
+            echo "  $(basename "$f" .py)"
+        done
+        exit 1
+    fi
+else
+    # 运行所有以 test_ 开头的文件
+    found=0
+    for test_file in "$TESTS_DIR"/test_*.py; do
+        if [ -f "$test_file" ]; then
+            echo "运行 $(basename "$test_file")..."
+            python $VERBOSE "$test_file"
+            echo ""
+            found=$((found + 1))
+        fi
+    done
+
+    if [ $found -eq 0 ]; then
+        echo "⚠️  未找到任何 test_*.py 文件。请先在 tests/ 下创建测试。"
+        echo ""
+        echo "运行 self-test tasks（描述性检查）:"
+        python -c "
+import os
+tasks_path = os.path.join('$TESTS_DIR', 'selftest_tasks.md')
+if os.path.exists(tasks_path):
+    with open(tasks_path, 'r') as f:
+        print(f.read())
+else:
+    print('selftest_tasks.md 不存在')
+"
+    else
+        echo "测试完成: $found 个文件执行。"
+    fi
+fi
+
+echo ""
+echo "=========================================="
+echo "  Done."
+echo "=========================================="
diff --git a/skills/competition/nineops-skills/templates/activation.py b/skills/competition/nineops-skills/templates/activation.py
new file mode 100644
index 0000000..6a91b1d
--- /dev/null
+++ b/skills/competition/nineops-skills/templates/activation.py
@@ -0,0 +1,65 @@
+"""
+激活函数算子模板
+================
+适用于：silu, swiglu, gelu 等激活函数
+
+模式特征：
+- element-wise，沿最后一维分块，保留 strides（支持非连续张量）
+- 使用 ntl.sigmoid, ntl.cast 等 ntl 语言 API
+- 通常涉及类型提升到 float32 再计算
+- 使用 _element_wise_arrangement 通用布局替代 1D tile
+"""
+
+import ninetoothed
+import ninetoothed.language as ntl
+from ninetoothed import Symbol, Tensor
+
+# ============================================================
+# Step 1: 定义符号
+# ============================================================
+BLOCK_SIZE = Symbol("BLOCK_SIZE", constexpr=True)
+
+
+# ============================================================
+# Step 2: 通用 arrangement（保留 strides，支持非连续张量）
+# ============================================================
+def _element_wise_arrangement(*tensors, block_size):
+    ndim = max(tensor.ndim for tensor in tensors)
+    assert all(tensor.ndim == ndim or tensor.ndim == 0 for tensor in tensors)
+    tile_shape = tuple(1 for _ in range(ndim - 1)) + (block_size,)
+    return tuple(
+        tensor.tile(tile_shape) if tensor.ndim != 0 else tensor
+        for tensor in tensors
+    )
+
+
+def arrangement(input, output, BLOCK_SIZE=BLOCK_SIZE):
+    return _element_wise_arrangement(input, output, block_size=BLOCK_SIZE)
+
+
+# ============================================================
+# Step 3: 定义 application
+# ============================================================
+def application(input, output):
+    input_loaded = input
+    # 类型提升到 float32 进行计算
+    #
+    # GELU approximate: 0.5 * x * (1 + tanh(0.79788 * (x + 0.044715 * x^3)))
+    #   x**3 → x*x*x, tanh → (exp(t)-exp(-t))/(exp(t)+exp(-t))
+    # GELU exact:      x * 0.5 * (1 + erf(x / sqrt(2)))
+    # Silu:            x * sigmoid(x)
+    # 注意：`**` 运算符不可用（Triton tensor 无 __pow__）
+    # 注意：模块级变量引用被 AST 原样嵌入 → NameError，用字面量
+    output = <FORMULA>  # noqa: F841
+
+
+# ============================================================
+# Step 4: 声明 Tensor 元信息（ndim 会展开为具体值）
+# ============================================================
+tensors = (Tensor(1), Tensor(1))
+
+
+# ============================================================
+# Step 5: 创建 kernel
+# ============================================================
+kernel = ninetoothed.make(arrangement, application, tensors)
diff --git a/skills/competition/nineops-skills/templates/benchmark_report_template.md b/skills/competition/nineops-skills/templates/benchmark_report_template.md
new file mode 100644
index 0000000..12d6b15
--- /dev/null
+++ b/skills/competition/nineops-skills/templates/benchmark_report_template.md
@@ -0,0 +1,56 @@
+# Benchmark 报告模板
+
+> 填写 benchmark 测试的全部信息和结果。
+
+## 环境信息
+
+| 项目 | 值 |
+|------|-----|
+| GPU 型号 | <!-- 如 NVIDIA A100 80GB --> |
+| CUDA 版本 | <!-- 如 12.1 --> |
+| PyTorch 版本 | <!-- 如 2.4.0 --> |
+| Triton 版本 | <!-- 如 3.0.0 --> |
+| Ninetoothed 版本 | <!-- ninetoothed.__version__ --> |
+
+## 算子 Benchmark
+
+### 场景 1: 默认输入
+
+| Config | 本实现(ms) | PyTorch(ms) | Speedup | 说明 |
+|--------|-----------|-------------|---------|------|
+| (1024,) fp32 | | | | 小规模 |
+| (4096,) fp32 | | | | 中规模 |
+| (65536,) fp32 | | | | 大规模 |
+| (1024,) fp16 | | | | 半精度 |
+| (4096,) fp16 | | | | 中规模半精度 |
+
+### 场景 2: 广播场景
+
+| Broadcast | Shape A | Shape B | 本实现(ms) | PyTorch(ms) | Speedup |
+|-----------|---------|---------|-----------|-------------|---------|
+| scalar | (4096,) | (1,) | | | |
+| vector | (4096,) | (256,) | | | |
+| 3D | (4, 128, 256) | (256,) | | | |
+
+### 场景 3: 非连续访问
+
+| Variant | Shape | 本实现(ms) | PyTorch(ms) | Speedup |
+|---------|-------|-----------|-------------|---------|
+| contiguous | (1024, 1024) | | | |
+| transposed | (1024, 1024).T | | | |
+| sliced | x[::2, :] | | | |
+| expanded | (1, 1024).expand(1024, 1024) | | | |
+
+## 分析
+
+### Load/Store 模式
+
+<!-- 分析 load/store 的合并访问情况 -->
+
+### 瓶颈分析
+
+<!-- 是 compute-bound 还是 memory-bound？ -->
+
+## 总结
+
+<!-- benchmark 的总体结论和优化建议 -->
diff --git a/skills/competition/nineops-skills/templates/elementwise_1d.py b/skills/competition/nineops-skills/templates/elementwise_1d.py
new file mode 100644
index 0000000..1ef3c05
--- /dev/null
+++ b/skills/competition/nineops-skills/templates/elementwise_1d.py
@@ -0,0 +1,88 @@
+"""
+Element-wise 算子模板
+======================
+适用于：add, mul, relu, gelu, silu 等逐元素操作
+
+模式特征：
+- 所有张量沿最后一维均匀分块，无跨块归约
+- 支持非连续张量（transpose、slice 等），保留原始 strides
+- BLOCK_SIZE 作为编译时常量传入
+- 推荐使用工厂函数模式（make_*）以支持动态 ndim
+
+用法示例：
+    add_kernel = make_add(ndim=2)
+    add_kernel(x, y, out, BLOCK_SIZE=1024)
+
+    relu_kernel = make_relu(ndim=1)
+    relu_kernel(x, out, BLOCK_SIZE=1024)
+
+# ⚠️ AST 跟踪约束（重要）
+# ============================================================
+# application() 内的 Python 代码会通过 AST 跟踪直接嵌入
+# 生成的 Triton 代码。Triton 的编译环境没有标准 Python 库，
+# 因此必须遵守以下规则：
+#
+# ❌ 禁止：math.*、torch.*、numpy.*
+# ❌ 禁止：模块级变量引用（原样嵌入 → NameError）
+# ❌ 禁止：** 运算符（Triton tensor 无 __pow__）
+# ✅ 允许：ntl.* 函数、字面量数值、四则运算
+# ✅ 推荐：x * x * x 代替 x ** 3
+# ✅ 推荐：0.7978845608028654 代替 math.sqrt(2.0 / math.pi)
+"""
+
+import ninetoothed
+import ninetoothed.language as ntl
+from ninetoothed import Symbol, Tensor
+
+
+# ============================================================
+# Step 1: 定义符号（编译时常量）
+# ============================================================
+BLOCK_SIZE = Symbol("BLOCK_SIZE", constexpr=True)
+
+
+# ============================================================
+# Step 2: 通用 arrangement（数据布局）
+# ============================================================
+def _element_wise_arrangement(*tensors, block_size):
+    """通用 element-wise arrangement：保留 strides，支持非连续张量。
+
+    工作原理：
+    - 自动确定所有张量的最大 ndim
+    - 0 维张量（标量）原样传递，不做 tile
+    - 高维张量构造 tile_shape = (1, ..., 1, block_size)，
+      前 ndim-1 维为 1 不做 tile，只在最后一维分块。
+      这样高维张量的行/列 strides 被保留，Triton 能够
+      通过 ptr + row * stride_row + col * stride_col 正确寻址。
+
+    注意：标量广播需要通过 expand_as 创建 stride=0 视图调用方完成。
+    """
+    ndim = max(tensor.ndim for tensor in tensors)
+    assert all(tensor.ndim == ndim or tensor.ndim == 0 for tensor in tensors)
+
+    tile_shape = tuple(1 for _ in range(ndim - 1)) + (block_size,)
+
+    return tuple(
+        tensor.tile(tile_shape) if tensor.ndim != 0 else tensor
+        for tensor in tensors
+    )
+
+
+# ============================================================
+# Step 3: 定义 application（计算逻辑）
+# ============================================================
+def application(input, output):
+    output = <FORMULA>  # noqa: F841
+
+
+# ============================================================
+# Step 4: 声明 Tensor 元信息
+# ============================================================
+# Tensor(1) 表示 1 维张量 这里的tensors相当于申请了两个一维张量
+tensors = (Tensor(1), Tensor(1))
+
+
+# ============================================================
+# Step 5: 创建 kernel
+# ============================================================
+kernel = ninetoothed.make(arrangement, application, tensors)
diff --git a/skills/competition/nineops-skills/templates/failure_diagnosis_template.md b/skills/competition/nineops-skills/templates/failure_diagnosis_template.md
new file mode 100644
index 0000000..4f89915
--- /dev/null
+++ b/skills/competition/nineops-skills/templates/failure_diagnosis_template.md
@@ -0,0 +1,96 @@
+# 故障诊断模板
+
+> 在调试过程中按此模板记录问题，便于复现和分析。
+
+## 基本信息
+
+| 字段 | 值 |
+|------|-----|
+| 算子 | |
+| DSL 模式 | |
+| 首次发现日期 | |
+| 严重程度 | 高 / 中 / 低 |
+
+## 问题描述
+
+```
+请用1-2句话描述问题。
+```
+
+## 复现步骤
+
+### 最小复现代码
+
+```python
+# 用尽可能少的代码复现问题
+```
+
+### 输入
+
+- shape: 
+- dtype:
+- 是否 contiguous: 
+- broadcast 配置: 
+
+### 输出对比
+
+| 位置 | 预期值 | 实际值 |
+|------|--------|--------|
+| pos[0] | | |
+| pos[-1] | | |
+| mean | | |
+
+## 错误信息
+
+```
+粘贴完整错误堆栈。
+```
+
+## 诊断过程
+
+### 1. dtype 检查
+
+```python
+# 检查 dtype 链
+```
+
+### 2. border/mask 检查
+
+```python
+# 检查边界处理
+```
+
+### 3. Broadcast/Expand 检查
+
+```python
+# 打印中间 shape
+```
+
+### 4. Generated Source 检查
+
+```python
+# generate source 输出
+```
+
+## 根因分析
+
+<!-- 最终定位的根因 -->
+
+## 解决方案
+
+```python
+# 修复后的代码
+```
+
+## 验证
+
+| 测试 | 修复前 | 修复后 |
+|------|--------|--------|
+| dtype 测试 | ❌ | ✅ |
+| shape 测试 | ❌ | ✅ |
+| broadcast 测试 | ❌ | ✅ |
+| non-contiguous 测试 | ❌ | ✅ |
+
+## 防止再次发生
+
+<!-- 如何在流程上防止同类问题再次出现 -->
diff --git a/skills/competition/nineops-skills/templates/matmul_2d.py b/skills/competition/nineops-skills/templates/matmul_2d.py
new file mode 100644
index 0000000..91c0236
--- /dev/null
+++ b/skills/competition/nineops-skills/templates/matmul_2d.py
@@ -0,0 +1,72 @@
+"""
+2D Matmul 算子模板
+==================
+适用于：mm, addmm 等矩阵乘法操作
+
+模式特征：
+- 使用 3 个 block_size 符号（M, N, K）由 autotune 自动搜索
+- output 按 (BLOCK_SIZE_M, BLOCK_SIZE_N) 分块
+- input 和 other 通过 tile + expand + squeeze 对齐到 output 分块
+- application 中使用循环 dot 累加
+"""
+
+import ninetoothed
+import ninetoothed.language as ntl
+from ninetoothed import Tensor, block_size
+
+# ============================================================
+# Step 1: 定义符号（meta 类型，由 autotune 自动搜索）
+# ============================================================
+BLOCK_SIZE_M = block_size()
+BLOCK_SIZE_N = block_size()
+BLOCK_SIZE_K = block_size()
+
+
+# ============================================================
+# Step 2: 定义 arrangement（数据布局）
+# ============================================================
+def arrangement(
+    input,
+    other,
+    output,
+    BLOCK_SIZE_M=BLOCK_SIZE_M,
+    BLOCK_SIZE_N=BLOCK_SIZE_N,
+    BLOCK_SIZE_K=BLOCK_SIZE_K,
+):
+    output_arranged = output.tile((BLOCK_SIZE_M, BLOCK_SIZE_N))
+
+    # input: (M, K) -> (BLOCK_SIZE_M, BLOCK_SIZE_K) -> tile(1, -1) -> expand(-1, N_blocks)
+    input_arranged = input.tile((BLOCK_SIZE_M, BLOCK_SIZE_K))
+    input_arranged = input_arranged.tile((1, -1))
+    input_arranged = input_arranged.expand((-1, output_arranged.shape[1]))
+    input_arranged.dtype = input_arranged.dtype.squeeze(0)
+
+    # other: (K, N) -> (BLOCK_SIZE_K, BLOCK_SIZE_N) -> tile(-1, 1) -> expand(M_blocks, -1)
+    other_arranged = other.tile((BLOCK_SIZE_K, BLOCK_SIZE_N))
+    other_arranged = other_arranged.tile((-1, 1))
+    other_arranged = other_arranged.expand((output_arranged.shape[0], -1))
+    other_arranged.dtype = other_arranged.dtype.squeeze(1)
+
+    return input_arranged, other_arranged, output_arranged
+
+
+# ============================================================
+# Step 3: 定义 application（计算逻辑）
+# ============================================================
+def application(input, other, output):
+    accumulator = ntl.zeros(output.shape, dtype=ntl.float32)
+    for k in range(input.shape[0]):
+        accumulator += ntl.dot(input[k], other[k])
+    output = accumulator
+
+
+# ============================================================
+# Step 4: 声明 Tensor 元信息
+# ============================================================
+tensors = (Tensor(2), Tensor(2), Tensor(2))
+
+
+# ============================================================
+# Step 5: 创建 kernel
+# ============================================================
+kernel = ninetoothed.make(arrangement, application, tensors)
diff --git a/skills/competition/nineops-skills/templates/operator_task_report_template.md b/skills/competition/nineops-skills/templates/operator_task_report_template.md
new file mode 100644
index 0000000..a4c04ff
--- /dev/null
+++ b/skills/competition/nineops-skills/templates/operator_task_report_template.md
@@ -0,0 +1,70 @@
+# 算子任务报告模板
+
+> 本文档由 Agent 完成实现后填写，用于记录任务过程、结果和反思。
+
+## 基本信息
+
+| 字段 | 值 |
+|------|-----|
+| 算子 | <!-- 算子名称 --> |
+| DSL 模式 | <!-- 如 elementwise 1D / reduction 2D / matmul 2D --> |
+| 输入 shape | <!-- 如 (1024,), (768, 768) --> |
+| 输入 dtype | <!-- 如 float32, float16 --> |
+| BLOCK_SIZE | <!-- 如 1024, 512 --> |
+| 实现日期 | <!-- YYYY-MM-DD --> |
+
+## 实现过程
+
+### Step 1: 分析算子
+
+<!-- 描述算子的计算逻辑、输入/输出关系、是否存在广播、归约操作 -->
+
+### Step 2: 选择 DSL 模式
+
+<!-- 参照 dsl_patterns.md，选择对应的 arrangement 模板 -->
+
+### Step 3: 编写 Implementation
+
+**Arrangement 设计:**
+
+```python
+# 贴出 arrangement 代码
+```
+
+**Application 设计:**
+
+```python
+# 贴出 application 代码
+```
+
+### Step 4: Torch 包装
+
+```python
+# 贴出 warp 代码
+```
+
+## Correctness 测试结果
+
+| 测试场景 | Shape | Dtype | Broadcast | Contiguous | 结果 |
+|----------|-------|-------|-----------|------------|------|
+| 基础 | | | N/A | ✅ | ✅/❌ |
+| 广播 | | | ✅ | ✅ | ✅/❌ |
+| 非连续 | | | N/A | ❌ (transpose) | ✅/❌ |
+
+## Benchmark 结果
+
+| Shape | Dtype | BlockSize | 本实现(ms) | PyTorch(ms) | Speedup |
+|-------|-------|-----------|------------|-------------|---------|
+| | | | | | |
+
+## Generated Source 检查
+
+<!-- 关键发现，如果有问题则记录 -->
+
+## 遇到的问题
+
+1. <!-- 问题描述 --> → <!-- 解决方案 -->
+
+## 反思
+
+<!-- agent 自我反思：本次实现中哪些做法好，哪些可以改进 -->
diff --git a/skills/competition/nineops-skills/templates/reduction_2d.py b/skills/competition/nineops-skills/templates/reduction_2d.py
new file mode 100644
index 0000000..5812b3e
--- /dev/null
+++ b/skills/competition/nineops-skills/templates/reduction_2d.py
@@ -0,0 +1,48 @@
+"""
+2D 行归约算子模板
+=================
+适用于：softmax, rms_norm, layer_norm 等
+  沿最后一维做归约的操作
+
+模式特征：
+- 保留第一维（batch），沿第二维做 tile
+- BLOCK_SIZE 通常取 input.shape[-1] 以覆盖整行
+- 涉及跨元素归约（sum, max 等）
+"""
+
+import ninetoothed
+import ninetoothed.language as ntl
+from ninetoothed import Symbol, Tensor
+
+# ============================================================
+# Step 1: 定义符号
+# ============================================================
+BLOCK_SIZE = Symbol("BLOCK_SIZE", constexpr=True)
+
+
+# ============================================================
+# Step 2: 定义 arrangement（数据布局）
+# ============================================================
+def arrangement(input, output, BLOCK_SIZE=BLOCK_SIZE):
+    return input.tile((1, BLOCK_SIZE)), output.tile((1, BLOCK_SIZE))
+
+
+# ============================================================
+# Step 3: 定义 application（计算逻辑）
+# ============================================================
+def application(input, output):
+    # 在第二维上做归约
+    output = <REDUCTION>  # noqa: F841
+
+
+# ============================================================
+# Step 4: 声明 Tensor 元信息
+# ============================================================
+# other=float("-inf") 用于给边界外的填充值（如 softmax mask）
+tensors = (Tensor(2, other=float("-inf")), Tensor(2))
+
+
+# ============================================================
+# Step 5: 创建 kernel
+# ============================================================
+kernel = ninetoothed.make(arrangement, application, tensors)
diff --git a/skills/competition/nineops-skills/templates/wrapper.py b/skills/competition/nineops-skills/templates/wrapper.py
new file mode 100644
index 0000000..815f916
--- /dev/null
+++ b/skills/competition/nineops-skills/templates/wrapper.py
@@ -0,0 +1,41 @@
+"""
+Torch 包装层模板
+================
+提供多种常见包装模式，用于将 kernel 接入 PyTorch 接口
+"""
+
+import torch
+
+
+def flatten_wrapper(kernel_module, input, BLOCK_SIZE=1024):
+    """
+    Flatten 模式：适用于 element-wise / 激活函数
+    将任意 shape 展平后调用 kernel，再恢复原形状
+    """
+    input_flat = input.flatten()
+    output_flat = torch.empty_like(input_flat)
+    kernel_module.kernel(input_flat, output_flat, BLOCK_SIZE=BLOCK_SIZE)
+    return output_flat.view_as(input)
+
+
+def direct_wrapper(kernel_module, *args, **kwargs):
+    """
+    直接模式：适用于 mm, bmm, attention 等
+    在包装层创建 output tensor，直接传参
+    """
+    output = torch.empty(output_shape, dtype=args[0].dtype, device=args[0].device)
+    kernel_module.kernel(*args, output, **kwargs)
+    return output
+
+
+def reshape_wrapper(kernel_module, input, *args, BLOCK_SIZE=None):
+    """
+    Reshape 模式：适用于 rms_norm 等需要 view(-1, last_dim) 的操作
+    """
+    original_shape = input.shape
+    input_2d = input.view(-1, original_shape[-1])
+    output_2d = torch.empty_like(input_2d)
+    if BLOCK_SIZE is None:
+        BLOCK_SIZE = original_shape[-1]
+    kernel_module.kernel(input_2d, *args, output_2d, BLOCK_SIZE=BLOCK_SIZE)
+    return output_2d.view(original_shape)
diff --git a/skills/competition/nineops-skills/tests/expected_outputs.md b/skills/competition/nineops-skills/tests/expected_outputs.md
new file mode 100644
index 0000000..28ef888
--- /dev/null
+++ b/skills/competition/nineops-skills/tests/expected_outputs.md
@@ -0,0 +1,94 @@
+# 期望输出参考
+
+以下为各示例在 NVIDIA GPU 上的典型正确性输出。Agent 应能验证输出与预期一致。
+
+## elementwise_broadcast_add
+
+**命令:** `python examples/elementwise_broadcast_add/run.py`
+
+```
+=== Elementwise Broadcast Add — 6 test cases ===
+  ✔ contiguous (256,) + (256,) → OK
+  ✔ scalar    (256,) + (1,)   → OK
+  ✔ broadcast (4,8) + (8,)   → OK
+  ✔ broadcast (4,8) + (4,1)   → OK
+  ✔ transposed (32,16) contig → OK
+  ✔ uneven    (1000,) + (1,)  → OK
+All 6 tests passed!
+```
+
+## reduction_softmax
+
+**命令:** `python examples/reduction_softmax/run.py`
+
+```
+=== Reduction Softmax — 8 test cases ===
+  ✔ basic (4,1024)           → OK
+  ✔ multi-row (8,4096)       → OK
+  ✔ uneven cols (4,768)      → OK
+  ✔ single row (1,2048)      → OK
+  ✔ fp16 (4,1024)            → OK
+  ✔ extreme (4,1024) large   → OK
+  ✔ non-contiguous (4,1024)  → OK
+  ✔ prime cols (4,1021)      → OK
+All 8 tests passed!
+```
+
+## non_contiguous_stride_case
+
+**命令:** `python examples/non_contiguous_stride_case/run.py`
+
+```
+=== Non-Contiguous Stride Tests (10 scenarios) ===
+  ✔ contiguous (128,)           → OK
+  ✔ transposed (32,64) contig   → OK
+  ✔ sliced rows (16,64) view    → OK
+  ✔ sliced cols (64,16) view    → OK
+  ✔ sliced both (32,32) view    → OK
+  ✔ view3d (8,8,8) -> (64,)     → OK
+  ✔ expanded (1,128) -> (4,128) → OK
+  ✔ permuted (4,32,8) dims      → OK
+  ✔ as_strided (32,64)          → OK
+  ✔ small 1D (15,)              → OK
+All 10 tests passed!
+```
+
+## performance_regression_case
+
+**命令:** `python examples/performance_regression_case/run.py`
+
+```
+=== Performance Regression — Matmul 2D 示例 ===
+  ✅ BLOCK=16    — M=1024, N=1024, K=1024
+  ✅ BLOCK=32    — M=1024, N=1024, K=1024
+  ✅ BLOCK=64x64x32 — M=1024, N=1024, K=1024
+  ✅ BLOCK=128x128x32 — M=1024, N=1024, K=1024
+  ... (256x512x128 同理)
+🎉 所有 matmul 变体正确性验证通过！
+```
+
+**Benchmark 预期趋势:**
+
+```
+shape      | BLOCK=16  | BLOCK=128  | 退化倍数
+(512,512,512)| ~2.0ms   | ~0.3ms     | ~6x
+(1024,...)   | ~12ms    | ~2.0ms     | ~6x
+(2048,...)   | ~90ms    | ~15ms      | ~6x
+```
+
+> 具体数值因 GPU 型号不同会有较大差异，但 **退化倍数 >2x** 是可靠的诊断指标。
+
+## 通用断言 (Agent Validator)
+
+在任何示例上验证时，使用以下逻辑：
+
+```python
+import torch
+
+def assert_correct(kernel_output: torch.Tensor, expected: torch.Tensor, atol=1e-3):
+    """确认 kernel 输出与 PyTorch 基准一致。"""
+    assert torch.allclose(kernel_output, expected, atol=atol, rtol=1e-3), \
+        f"max diff={torch.max(torch.abs(kernel_output - expected)).item():.6f}"
+```
+
+如果该断言通过，则 agent 可确认实现正确。
diff --git a/skills/competition/nineops-skills/tests/selftest_tasks.md b/skills/competition/nineops-skills/tests/selftest_tasks.md
new file mode 100644
index 0000000..a7bd12c
--- /dev/null
+++ b/skills/competition/nineops-skills/tests/selftest_tasks.md
@@ -0,0 +1,59 @@
+# Selftest Tasks — Agent 自我校验任务
+
+这些任务用来验证 `.skill` 工作区的完整性和 agent 能力。每个任务应能在 **不访问外部网络** 的情况下，仅基于 `.skill` 内部文档完成。
+
+## 任务 1: 实现一个 elementwise 加法 kernel
+
+- 打开 [dsl_patterns.md](../references/dsl_patterns.md) 找到 elementwise_1d 模式
+- 使用 `Tensor` + `Symbol` + 1D arrangement + `application(ntl.add)` + `make`
+- 验证: 输入 a(T), b(T) → c(T)，结果应与 a+b 一致
+- 参考已有示例: [elementwise_broadcast_add](../examples/elementwise_broadcast_add/run.py)
+- 预期耗时: 手动实现 ≤15 min
+
+## 任务 2: 测试覆盖度检查
+
+- 打开 [testing_patterns.md](../references/testing_patterns.md) 检查 4 个维度 (dtype, shape, broadcast, stride)
+- 对任务 1 实现的加法 kernel，编写 pytest 参数化测试覆盖至少 6 个组合
+- 验证: pytest --verbose 全部通过
+- 预期耗时: ≤20 min
+
+## 任务 3: AOT 编译 / 查看生成源码
+
+- 对任务1的加法 kernel 调用 scripts/inspect_generated_source.sh 查看生成 Triton IR
+- 确认 kernel name、参数签名、loop structure 与预期一致
+- 预期耗时: ≤10 min
+
+## 任务 4: 性能 Benchmark 分析
+
+- 对任务1的加法 kernel，编写 benchmark 对比输入规模: (1024,), (4096,), (16384,)
+- 使用 scripts/run_benchmark.sh 执行并保存结果
+- 观察: 是否存在某个规模下 kernel 慢于 PyTorch（当数据量小或带宽受限时可能出现）
+- 预期耗时: ≤20 min
+
+## 任务 5: 识别一个性能退化
+
+- 打开 [performance_regression_case](../examples/performance_regression_case/) 的 benchmark.py
+- 运行 `python examples/performance_regression_case/benchmark.py`
+- 观察: BLOCK_SIZE=16 比 BLOCK_SIZE=128 慢多少倍
+- 预期耗时: ≤30 min (含编译)
+
+## 任务 6: 非连续 stride 处理
+
+- 打开 [non_contiguous_stride_case](../examples/non_contiguous_stride_case/run.py)
+- 理解 10 种 stride 变体的测试方法
+- 对任务1的加法 kernel 增加非连续测试
+- 预期耗时: ≤15 min
+
+## 任务 7: 使用模板报告
+
+- 打开 [operator_task_report_template.md](../templates/operator_task_report_template.md)
+- 填写任务1～4的完整报告
+- 预期耗时: ≤10 min
+
+## 任务 8: 故障注入诊断
+
+- 人为修改加法 kernel 的 BLOCK_SIZE 为 0
+- 运行新 kernel，观察报错信息
+- 打开 [failure_diagnosis.md](../references/failure_diagnosis.md) 按诊断流程解决
+- 使用 [failure_diagnosis_template.md](../templates/failure_diagnosis_template.md) 记录故障
+- 预期耗时: ≤10 min
diff --git a/skills/competition/nineops-skills/tests/trigger_prompts.md b/skills/competition/nineops-skills/tests/trigger_prompts.md
new file mode 100644
index 0000000..8b2bdae
--- /dev/null
+++ b/skills/competition/nineops-skills/tests/trigger_prompts.md
@@ -0,0 +1,24 @@
+# Agent 触发 Prompt
+
+当用户提出以下类型的请求时，请引导其使用 `.skill` 工作区中的已有资源。
+
+## 场景 → 触发 → 动作
+
+| 场景 | 关键词 | 动作 |
+|------|--------|------|
+| 实现新算子 | "实现"、"写个 kernel"、"算子" | → 参考 [dsl_patterns.md](../references/dsl_patterns.md) + 选取模板 [operator_task_report_template.md](../templates/operator_task_report_template.md) |
+| 跑正确性测试 | "测试"、"验证"、"run" | → 参考 [testing_patterns.md](../references/testing_patterns.md) + 使用 scripts/run_correctness.sh |
+| 跑性能 benchmark | "benchmark"、"性能"、"速度"、"加速比" | → 参考 [benchmark_patterns.md](../references/benchmark_patterns.md) + 使用 scripts/run_benchmark.sh |
+| 检查生成源码 | "生成的代码"、"triton代码"、"source"、"codegen" | → 参考 [generated_source_and_aot.md](../references/generated_source_and_aot.md) + 使用 scripts/inspect_generated_source.sh |
+| 失败诊断 | "报错"、"失败"、"崩溃"、"error"、"错误" | → 参考 [failure_diagnosis.md](../references/failure_diagnosis.md) + 使用模板 [failure_diagnosis_template.md](../templates/failure_diagnosis_template.md) |
+| 项目总体理解 | "这个 skill 是干什么的"、"怎么用" | → 阅读 [../README.md](../README.md) (根 README) |
+| repo 结构 | "仓库结构"、"ninetoothed 源码" | → 参考 [repo_index.md](../references/repo_index.md) |
+| 数据分析 (已跑完) | "分析结果"、"汇总"、"报告" | → 使用 scripts/collect_task_log.py 收集日志 |
+| 任务完成 | "完成"、"结束了"、"可以了" | → 使用模板 [operator_task_report_template.md](../templates/operator_task_report_template.md) 生成最终报告 |
+
+## 提示词短句
+
+- "参考 dsl_patterns 实现一个 elementwise 算子" → 打开 references/dsl_patterns.md 找到 elementwise_1d 模式
+- "用 run_correctness.sh 跑测试" → 执行 scripts/run_correctness.sh
+- "跑 benchmark 看看速度" → 执行 scripts/run_benchmark.sh
+- "排查编译错误" → 打开 references/failure_diagnosis.md 按分类排查