diff --git a/skills/competition/nineops-skills/.vscode/settings.json b/skills/competition/nineops-skills/.vscode/settings.json new file mode 100644 index 0000000..9fed558 --- /dev/null +++ b/skills/competition/nineops-skills/.vscode/settings.json @@ -0,0 +1,58 @@ +{ + "chat.advanced.cli.mcp.enabled": true, + "chat.cli.mcp.enabled": true, + "chat.mcp.access": "all", + "chat.tools.terminal.autoApprove": { + "/^bash\\b/": true, + "/^cat\\b/": true, + "/^cd\\b/": true, + "/^chmod\\b/": true, + "/^cp\\b/": true, + "/^curl\\b/": true, + "/^diff\\b/": true, + "/^echo\\b/": true, + "/^find\\b/": true, + "/^git\\b/": true, + "/^go\\b/": true, + "/^grep\\b/": true, + "/^head\\b/": true, + "/^ls\\b/": true, + "/^mkdir\\b/": true, + "/^mv\\b/": true, + "/^node\\b/": true, + "/^npm\\b/": true, + "/^pip/": true, + "/^printf\\b/": true, + "/^pwd\\b/": true, + "/^python/": true, + "/^rm\\b/": true, + "/^sed\\b/": true, + "/^sh\\b/": true, + "/^sort\\b/": true, + "/^tail\\b/": true, + "/^tee\\b/": true, + "/^test\\b/": true, + "/^touch\\b/": true, + "/^wc\\b/": true, + "/^wget\\b/": true, + "/^which\\b/": true + }, + "chat.tools.terminal.blockDetectedFileWrites": false, + "chat.tools.terminal.ignoreDefaultAutoApproveRules": true, + "github.copilot.chat.additionalReadAccessPaths": [ + "C:\\", + "D:\\", + "E:\\", + "F:\\" + ], + "github.copilot.enable": { + "*": false, + "markdown": false, + "plaintext": false + }, + "github.copilot.nextEditSuggestions.enabled": false, + "github.copilot.nextEditSuggestions.fixes": false, + "python-envs.defaultEnvManager": "ms-python.python:conda", + "python-envs.defaultPackageManager": "ms-python.python:conda", + "python-envs.pythonProjects": [] +} \ No newline at end of file diff --git a/skills/competition/nineops-skills/.vscode/settings.json.copilot-hub-bak b/skills/competition/nineops-skills/.vscode/settings.json.copilot-hub-bak new file mode 100644 index 0000000..c68c115 --- /dev/null +++ b/skills/competition/nineops-skills/.vscode/settings.json.copilot-hub-bak @@ -0,0 +1,5 @@ +{ + "python-envs.defaultEnvManager": "ms-python.python:conda", + "python-envs.defaultPackageManager": "ms-python.python:conda", + "python-envs.pythonProjects": [] +} \ No newline at end of file diff --git a/skills/competition/nineops-skills/CLAUDE.md b/skills/competition/nineops-skills/CLAUDE.md new file mode 100644 index 0000000..482c400 --- /dev/null +++ b/skills/competition/nineops-skills/CLAUDE.md @@ -0,0 +1,50 @@ +# Nineteethed DSL 算子开发 — Agent 快速参考 + +> **工作流指引**:收到算子开发任务时,先通读 `skill/SKILL.md §0 工作流总览`,按「开发→测试→诊断→修复」四阶段执行。 +> 关键模板在 `skill/templates/`,故障排查参照 `skill/references/failure_diagnosis.md`。 + +## 核心经验总结(来自 Add / ReLU / GELU 实现) + +### AST 跟踪陷阱(最重要的坑) +application() 内的代码会被 AST 跟踪原样嵌入生成 Triton 代码,Triton 环境没有标准 Python 库。 +- **禁止** `math.*`、`torch.*`、`numpy.*` → 用字面量数值 +- **禁止**模块级变量引用(变量名被原样嵌入导致 NameError) +- **禁止** `**` 运算符(Triton tensor 无 `__pow__`) → `x * x * x` +- **禁止** `ntl.tanh`(不存在) → `(exp(t)-exp(-t))/(exp(t)+exp(-t))` +- **允许** `ntl.*` 函数、字面量数值、四则运算 + +### 非连续张量支持(关键修复) +- **不要** `flatten()` → 破坏 strides,转置张量写入错位 +- **要** `tile(tuple(1 for _ in range(ndim-1)) + (block_size,))` → 保留 strides +- `Tensor(ndim)` 的 ndim 必须与实际张量维度一致 + +### Element-wise 通用 arrangement 模式 +```python +def _element_wise_arrangement(*tensors, block_size): + ndim = max(tensor.ndim for tensor in tensors) + assert all(tensor.ndim == ndim or tensor.ndim == 0 for tensor in tensors) + tile_shape = tuple(1 for _ in range(ndim - 1)) + (block_size,) + return tuple( + tensor.tile(tile_shape) if tensor.ndim != 0 else tensor + for tensor in tensors + ) +``` + +### GELU 实现要点 +- **近似版**: `0.5*x*(1+tanh(sqrt(2/pi)*(x+0.044715*x^3)))` + - `sqrt(2/pi)` = `0.7978845608028654`(字面量) + - `x^3` = `x * x * x` + - `tanh` = 手动用 `ntl.exp` + - 测试: `torch.nn.functional.gelu(x, approximate='tanh')` +- **精确版**: `x * 0.5 * (1 + erf(x / sqrt(2)))` + - 使用 `ntl.erf`, `ntl.sqrt` + - 测试: `torch.nn.functional.gelu(x)` + +### 数据类型支持 +- fp32: atol=1e-5, rtol=1e-5 +- fp16: atol=1e-3, rtol=1e-3 (注意精度损失) +- bf16: 类似 fp16 + +### 广播操作 +- 通过 `expand_as` 创建 stride=0 视图 +- Triton 自动处理 HBM 广播 diff --git a/skills/competition/nineops-skills/README.md b/skills/competition/nineops-skills/README.md new file mode 100644 index 0000000..4f8d3a3 --- /dev/null +++ b/skills/competition/nineops-skills/README.md @@ -0,0 +1,131 @@ +# .skill — ninetoothed DSL Agent Workspace + +> 本 `.skill` 工作区是 AI Agent 的 **技能包 (Skill Package)**,使 agent 能够高效实现、测试、基准分析和诊断基于 **ninetoothed DSL** 的 GPU 算子。所有文档、模板、脚本、示例均已内置,agent 可自主完成从实作到报告的全流程。 + +## 概览 + +| 目录 | 用途 | +|------|------| +| `references/` | DSL 模式、测试模式、Benchmark 模式、Repo 索引、AOT 指南、故障诊断 | +| `scripts/` | 正确性测试、Benchmark、源码检查、日志收集的可执行脚本 | +| `templates/` | Agent 任务报告、Benchmark 报告、故障诊断报告模板 | +| `examples/` | 4 个完整示例项目(含源码 + benchmark) | +| `tests/` | Agent 触发 prompt、自校验任务、期望输出参考 | + +## 快速开始 + +### 1. 实现一个算子 + +```bash +# 1a. 参考 DSL 模式 +cat references/dsl_patterns.md + +# 1b. 参考已有示例(如 elementwise 加法) +cat examples/elementwise_broadcast_add/run.py + +# 1c. 实现自己的 kernel +``` + +### 2. 运行正确性测试 + +```bash +scripts/run_correctness.sh examples/elementwise_broadcast_add +``` + +### 3. 运行 Benchmark + +```bash +scripts/run_benchmark.sh examples/elementwise_broadcast_add +``` + +### 4. 查看生成源 + +```bash +scripts/inspect_generated_source.sh examples/elementwise_broadcast_add/run.py +``` + +### 5. 收集日志 + +```bash +python scripts/collect_task_log.py --dir . --output task_log.json +``` + +## 文件结构 + +``` +.skill/ +├── README.md ← 本文档 +├── SKILL.md ← DSL 完整 API 参考 +├── references/ +│ ├── repo_index.md ← ninetoothed 仓库结构索引 +│ ├── dsl_patterns.md ← 7 种 DSL 实现模式 +│ ├── testing_patterns.md ← 4 维度测试覆盖策略 +│ ├── benchmark_patterns.md ← 8 元素 Benchmark 设计 +│ ├── generated_source_and_aot.md ← Codegen 查看 + AOT 编译 +│ └── failure_diagnosis.md ← 4 类故障诊断指南 +├── scripts/ +│ ├── validate_skill_package.py ← 结构完整性检查 +│ ├── run_correctness.sh ← 正确性测试运行器 +│ ├── run_benchmark.sh ← Benchmark 运行器 +│ ├── inspect_generated_source.sh ← 生成源码查看器 +│ └── collect_task_log.py ← 任务日志收集器 +├── templates/ +│ ├── operator_task_report_template.md ← 算子任务报告模板 +│ ├── benchmark_report_template.md ← Benchmark 报告模板 +│ └── failure_diagnosis_template.md ← 故障诊断模板 +├── examples/ +│ ├── elementwise_broadcast_add/ ← 加法 kernel (elementwise_1d) +│ ├── reduction_softmax/ ← Softmax kernel (reduction_2d) +│ ├── non_contiguous_stride_case/ ← 非连续 stride 测试 +│ └── performance_regression_case/ ← BLOCK_SIZE 退化诊断 +└── tests/ + ├── trigger_prompts.md ← Agent 触发 prompt + ├── selftest_tasks.md ← 自我校验任务 + └── expected_outputs.md ← 期望输出参考 +``` + +## Agent 工作流程 + +当 agent 收到"实现一个 XX 算子"的请求时,典型工作流如下: + +``` +1. 理解需求 ──→ 打开 references/dsl_patterns.md,匹配模式 + │ +2. 查看模板 ──→ 打开 templates/operator_task_report_template.md + │ +3. 参考示例 ──→ 查看 examples/ 下相同模式的实现 + │ +4. 实现代码 ──→ 编写 kernel.py + run.py + benchmark.py + │ +5. 正确性测试 ──→ scripts/run_correctness.sh 验证 + │ +6. Benchmark ──→ scripts/run_benchmark.sh 性能对比 + │ +7. 查看源码 ──→ scripts/inspect_generated_source.sh 检查 + │ +8. 故障诊断 ──→ (如遇错误) 参考 failure_diagnosis.md + │ +9. 生成报告 ──→ 填写 operator_task_report_template 完成 +``` + +## 环境要求 + +- Python 3.10+ +- PyTorch 2.0+ (CUDA) +- ninetoothed (git@github.com:QuantumIntelligence/ninetoothed.git) +- NVIDIA GPU with CUDA support + +## 结构校验 + +```bash +python scripts/validate_skill_package.py +``` + +预期输出: +``` +✅ .skill structure OK (所有 5 个目录和核心文件均存在) +``` + +## License + +Internal — Qiyuan Competition diff --git a/skills/competition/nineops-skills/SKILL.md b/skills/competition/nineops-skills/SKILL.md new file mode 100644 index 0000000..52afe64 --- /dev/null +++ b/skills/competition/nineops-skills/SKILL.md @@ -0,0 +1,1027 @@ +--- +name: skill +description: 使用 ninetoothed DSL 开发 Triton 算子的完整指南,涵盖开发→测试→诊断→修复全流程。 +--- + +--- + +## 0. 工作流总览(开发 → 测试 → 诊断 → 修复) + +这是使用本 skill 包开发算子的完整生命周期。当收到开发算子的任务时,按以下四个阶段依次执行。 + +--- + +### Phase 1:开发算子 + +**步骤**: +1. **理解算子** — 分析算子的计算逻辑(element-wise / reduction / matmul / norm / attention 等)、输入输出关系、广播/归约操作 +2. **选择 DSL 模式** — 对照下文 §4 Arrangement 模式和 §5 Application 模式,确定对应模板 +3. **复用模板** — 从 `skill/templates/` 选择最接近的模板文件,复制到目标文件 +4. **编写 arrangement** — 根据模式定义数据布局(tile/expand/squeeze 等) +5. **编写 application** — 使用 `ntl.*` API 编写块内计算逻辑(⚠️ 注意 AST 跟踪陷阱,见 §8) +6. **声明 Tensor 与 Symbol** — `Tensor(ndim)` 的 ndim 与实际张量维度一致 +7. **创建 kernel** — 调用 `ninetoothed.make(arrangement, application, tensors)` +8. **编写 torch 包装层** — 参考 §7 的模式,建议使用 flatten_wrapper(非连续安全) + +**参考材料**: +- 模式对照:`skill/references/dsl_patterns.md` +- 模板文件:`skill/templates/elementwise_1d.py`、`skill/templates/activation.py` 等 + +--- + +### Phase 2:测试算子 + +**编写测试**:在算子所在文件中添加测试套件,覆盖以下场景: + +| 维度 | 场景 | 检查点 | +|------|------|--------| +| 基础正确性 | contiguous 张量,标准 shape | 与 `torch.*` 或 `F.*` 输出 allclose | +| 半精度 | fp16 / bf16 输入 | atol=1e-3, rtol=1e-3 | +| 广播 | expand_as 创建 stride=0 视图 | 正确广播 | +| 非连续 | `.t()` 转置、`[::2]` 切片 | 数值正确,不 crash | + +**运行测试(Linux)**: + +```bash +# Option A:直接运行(推荐开发阶段使用) +python path/to/your_operator_file.py + +# Option B:使用 test 脚本 +bash skill/scripts/run_correctness.sh -f path/to/your_file.py + +# Option C:运行 skill 全部测试 +bash skill/scripts/run_correctness.sh +``` + +**参考材料**: +- 测试模式:`skill/references/testing_patterns.md` +- 验证脚本:`skill/scripts/validate_skill_package.py` + +--- + +### Phase 3:诊断失败 + +当测试报错或结果不正确时,按以下顺序排查: + +**1. 编译错误(Crash / CUDA Error / NameError)** + +常见症状与速查: + +| 报错信息 | 最可能原因 | 解决方案 | +|----------|-----------|---------| +| `NameError: name 'math' is not defined` | application 中引用了 `math.*` | 用字面量数值替代 | +| `NameError: name 'XXX' is not defined` | 模块级变量被 AST 跟踪 | 将常量值内联到 application | +| `AttributeError: __pow__` | 使用了 `**` 运算符 | 用 `x * x * x` 替代 | +| `AttributeError: tanh` | 调用了 `ntl.tanh` | 用 `ntl.exp` 手动实现 | +| `make()` 编译失败 | Tensor 声明/符号不匹配 | 检查 Tensor ndim 和 Symbol 传递 | + +**2. 正确性失败(数值不匹配)** + +- **检查 dtype** — 是否需要 `ntl.cast` 到 float32 计算? +- **检查 BLOCK_SIZE** — 是否整除总元素数?不能整除时需要 mask +- **检查非连续张量** — arrangement 是否用了 `flatten()`?→ 改为 preserve-ndim tile +- **检查广播** — 广播维度是否正确 expand? + +**3. 诊断工具** + +```bash +# 查看生成的 Triton 源码(用于诊断 AST 嵌入问题) +bash skill/scripts/inspect_generated_source.sh your_operator_file.py + +# 查看日志 +python skill/scripts/collect_task_log.py --output diagnose_log/ +``` + +**参考材料**: +- 故障诊断指南:`skill/references/failure_diagnosis.md` +- 诊断模板:`skill/templates/failure_diagnosis_template.md` +- 生成源码检查:`skill/references/generated_source_and_aot.md` + +--- + +### Phase 4:修复并回归 + +1. 根据诊断结果修改代码 +2. **优先使用编辑工具** 进行定点修改(`replace_string_in_file` / `multi_replace_string_in_file`) +3. 重新运行测试(回到 Phase 2) +4. 确认全部测试通过后,填写算子任务报告: + +```bash +# 参考模板记录实现过程 +cat skill/templates/operator_task_report_template.md +``` + +--- + +### 全流程示例(以 ReLU 为例) + +```bash +# 1. 开发 — 在 basic_operators.py 中实现 make_relu() +# 参考 skill/templates/elementwise_1d.py 模板 + +# 2. 测试 — 运行测试 +python basic_operators.py +# 输出: ✅ ReLU 测试通过 + +# 3. 诊断(失败时) +# 报错: AssertionError: ReLU non-contiguous failed +# 根因: arrangement 用 flatten() 破坏了 strides +# 修复: tile_shape = (1,)*(ndim-1) + (block_size,),使用 preserve-ndim tile + +# 4. 回归 +python basic_operators.py +# 输出: 🎉 所有测试全部通过! +``` + +--- + +## 1. 核心概念 + +**注意**引入常用的九齿python包 +```python +import ninetoothed +import ninetoothed.language as ntl +from ninetoothed import Symbol, Tensor, block_size + +``` + +### 1.1 DSL 三层架构 + +每个 ninetoothed 算子由三个部分组成: + +``` +┌─────────────────────────────────────────┐ +│ torch 包装层 │ +│ (预处理/后处理、shape 调整、调用 kernel) │ +├─────────────────────────────────────────┤ +│ arrangement 函数 │ +│ (数据布局、分块 tile、维度变换) │ +├─────────────────────────────────────────┤ +│ application 函数 │ +│ (逐块计算逻辑,使用 ntl 语言) │ +└─────────────────────────────────────────┘ +``` + +- **torch 包装层**:负责创建 output tensor、reshape/flatten、调用 `kernel()` 并传参 +- **arrangement**:描述每个 Tensor 如何被分成 tile 块,以及块间维度关系 +- **application**:定义每个 tile 内的计算逻辑(对标 Triton kernel 体) + +### 1.2 基本流程 + +```python +import ninetoothed +from ninetoothed import Symbol, Tensor + +# 1. 定义符号(block 大小等) +BLOCK_SIZE = Symbol("BLOCK_SIZE", constexpr=True) + +# 2. 定义数据布局 +def arrangement(input, output, BLOCK_SIZE=BLOCK_SIZE): + return input.tile((BLOCK_SIZE,)), output.tile((BLOCK_SIZE,)) + +# 3. 定义计算逻辑 +def application(input, output): + output = input * 2 # noqa: F841 + +# 4. 声明 tensor 元信息 +tensors = (Tensor(1), Tensor(1)) + +# 5. 创建 kernel +kernel = ninetoothed.make(arrangement, application, tensors) +``` + +--- + +## 2. Symbol —— 符号参数系统 + +### 2.1 符号类型 + +```python +from ninetoothed import Symbol, block_size + +# constexpr —— 编译时常量,在 make() 时确定 +BLOCK_SIZE = Symbol("BLOCK_SIZE", constexpr=True) + +# meta —— 元参数,Triton autotune 可自动搜索 +BLOCK_SIZE_M = block_size() # 等价于 Symbol("BLOCK_SIZE_M", meta=True) + +# constexpr + upper_bound —— 常量且有上界,用于窗口/卷积 +WINDOW_HEIGHT = Symbol("WINDOW_HEIGHT", constexpr=True, upper_bound=16) +WINDOW_WIDTH = Symbol("WINDOW_WIDTH", constexpr=True, upper_bound=16) + +# constexpr + upper_bound —— 用于 scale 等标量 +SCALE = Symbol("SCALE", constexpr=True, upper_bound=128) +``` + +### 2.2 符号传递方式 + +```python +# 方式1:在 kernel() 调用时传参(适用于 constexpr) +kernel(input, other, output, BLOCK_SIZE=1024) + +# 方式2:由 autotune 自动搜索(适用于 meta/block_size) +# 无需手动传值 +``` + +### 2.3 符号命名约定 + +| 符号名 | 用途 | 推荐类型 | +|--------|------|----------| +| `BLOCK_SIZE` | 1D 通用 tile 大小 | `constexpr=True` | +| `BLOCK_SIZE_M` | 矩阵 M 维度 tile | `block_size()` | +| `BLOCK_SIZE_N` | 矩阵 N 维度 tile | `block_size()` | +| `BLOCK_SIZE_K` | 矩阵 K 维度 tile | `block_size()` | +| `WINDOW_HEIGHT` | 池化/卷积窗口高度 | `constexpr=True, upper_bound=N` | +| `WINDOW_WIDTH` | 池化/卷积窗口宽度 | `constexpr=True, upper_bound=N` | + +--- + +## 3. Tensor —— 张量元信息声明 + +### 3.1 Tensor 构造函数 + +```python +Tensor(ndim, other=, shape_options=) +``` + +| 参数 | 说明 | +|------|------| +| `ndim` | 张量维度数(int) | +| `other=float("-inf")` | 边界填充值(用于 max 或 softmax 等归约操作) | +| `shape_options=None / dict` | 形状选项,如 `{"constexpr": True}` 或 `{"constexpr": True, "upper_bound": 128}` | + +### 3.2 常见 Tensor 声明 + +```python +# 标量 +Tensor(0) # 0 维张量(eps, beta, alpha, scale) + +# 1D 张量 +Tensor(1) # 1 维,用于 element-wise + +# 2D 张量 +Tensor(2) # 2 维矩阵 +Tensor(2, other=float("-inf")) # 2D + 负无穷边界(softmax) + +# 3D 张量 +Tensor(3) # 3D(bmm 等) + +# 4D 张量 +Tensor(4) # 4D(conv2d, rope, attention) +Tensor(4, shape_options={"constexpr": True}) # 部分维度 constexpr + +# 批量 Tensor 声明 +tuple(Tensor(4, shape_options=shape_options) for _ in range(3)) +``` + +### 3.3 Tensor 元组规范 + +`tensors` 元组的顺序必须与 `arrangement` 和 `application` 的参数顺序**完全一致**。 + +```python +# add: 3 个 1D tensor +tensors = tuple(Tensor(1) for _ in range(3)) + +# softmax: 2D input (边界 -inf) + 2D output +tensors = (Tensor(2, other=float("-inf")), Tensor(2)) + +# mm: 3 个 2D tensor +tensors = (Tensor(2), Tensor(2), Tensor(2)) + +# fused_rms_norm: 两个 2D + 一个标量 + 一个 2D +tensors = (Tensor(2), Tensor(2), Tensor(0), Tensor(2)) +``` + +--- + +## 4. Arrangement —— 数据布局模式大全 + +### 4.1 基础 tile 模式 + +#### 模式 A:简单 1D tile(add, silu, swiglu) + +所有参与张量沿最后一维均匀分块: + +```python +def arrangement(input, other, output, BLOCK_SIZE=BLOCK_SIZE): + input_arranged = input.tile((BLOCK_SIZE,)) + other_arranged = other.tile((BLOCK_SIZE,)) + output_arranged = output.tile((BLOCK_SIZE,)) + return input_arranged, other_arranged, output_arranged +``` + +#### 模式 A2:非连续安全的 Multi-ND tile(推荐用于 element-wise 算子) + +和模式 A 不同,本模式**不进行 flatten**,保留原始 strides,确保 `.T` / `.t()` 等非连续张量的读写正确: + +```python +def _element_wise_arrangement(*tensors, block_size): + ndim = max(tensor.ndim for tensor in tensors) + assert all(tensor.ndim == ndim or tensor.ndim == 0 for tensor in tensors) + tile_shape = tuple(1 for _ in range(ndim - 1)) + (block_size,) + return tuple( + tensor.tile(tile_shape) if tensor.ndim != 0 else tensor + for tensor in tensors + ) +``` + +| 区别 | 模式 A(flatten) | 模式 A2(非 flatten) | +|------|-------------------|----------------------| +| 张量声明 | `Tensor(1)` | `Tensor(ndim)`(与实际 ndim 一致) | +| 非连续支持 | ❌ flatten 破坏 strides | ✅ 保留原始 strides | +| 标量支持 | 需额外处理 | ✅ `tensor.ndim != 0` 自动跳过 | + +#### 模式 B:2D 行 tile(softmax, rms_norm) + +保留第一维(batch),分块第二维: + +```python +def arrangement(input, output, BLOCK_SIZE=BLOCK_SIZE): + return input.tile((1, BLOCK_SIZE)), output.tile((1, BLOCK_SIZE)) +``` + +#### 模式 C:2D 分块 Matmul(mm) + +核心技巧:先将 output 按 `(BLOCK_SIZE_M, BLOCK_SIZE_N)` 分块,再让 input 和 other 通过 `tile + expand + squeeze` 与其对齐。 + +```python +def arrangement(input, other, output, + BLOCK_SIZE_M=BLOCK_SIZE_M, BLOCK_SIZE_N=BLOCK_SIZE_N, + BLOCK_SIZE_K=BLOCK_SIZE_K): + + output_arranged = output.tile((BLOCK_SIZE_M, BLOCK_SIZE_N)) + + # input: (M, K) -> (BLOCK_SIZE_M, BLOCK_SIZE_K) -> tile(1, -1) -> expand(-1, N_blocks) + input_arranged = input.tile((BLOCK_SIZE_M, BLOCK_SIZE_K)) + input_arranged = input_arranged.tile((1, -1)) + input_arranged = input_arranged.expand((-1, output_arranged.shape[1])) + input_arranged.dtype = input_arranged.dtype.squeeze(0) + + # other: (K, N) -> (BLOCK_SIZE_K, BLOCK_SIZE_N) -> tile(-1, 1) -> expand(M_blocks, -1) + other_arranged = other.tile((BLOCK_SIZE_K, BLOCK_SIZE_N)) + other_arranged = other_arranged.tile((-1, 1)) + other_arranged = other_arranged.expand((output_arranged.shape[0], -1)) + other_arranged.dtype = other_arranged.dtype.squeeze(1) + + return input_arranged, other_arranged, output_arranged +``` + +#### 模式 D:3D 分块 Matmul(bmm) + +在 mm 前加一个 `tile((1, ...))` 保留 batch 维度: + +```python +def arrangement( + input, other, output, + BLOCK_SIZE_M=BLOCK_SIZE_M, BLOCK_SIZE_N=BLOCK_SIZE_N, + BLOCK_SIZE_K=BLOCK_SIZE_K +): + output_arranged = output.tile((1, BLOCK_SIZE_M, BLOCK_SIZE_N)) + output_arranged.dtype = output_arranged.dtype.squeeze(0) + + input_arranged = input.tile((1, BLOCK_SIZE_M, BLOCK_SIZE_K)) + input_arranged = input_arranged.tile((1, 1, -1)) + input_arranged = input_arranged.expand((-1, -1, output_arranged.shape[-1])) + input_arranged.dtype = input_arranged.dtype.squeeze((0, 1)) + input_arranged.dtype.dtype = input_arranged.dtype.dtype.squeeze(0) + + other_arranged = other.tile((1, BLOCK_SIZE_K, BLOCK_SIZE_N)) + other_arranged = other_arranged.tile((1, -1, 1)) + other_arranged = other_arranged.expand((-1, output_arranged.shape[-2], -1)) + other_arranged.dtype = other_arranged.dtype.squeeze((0, 2)) + other_arranged.dtype.dtype = other_arranged.dtype.dtype.squeeze(0) + + return input_arranged, other_arranged, output_arranged +``` + +#### 模式 E:自定义 strides/dilation tile(RoPE) + +用于非连续内存访问,如 Rotary Position Embedding 中的交错/非交错模式: + +```python +def arrangement(input, sin_table, cos_table, interleaved=True): + emb_dim = input.shape[-1] + tile_shape = (1, 1, 1, emb_dim // 2) + + if interleaved: + strides = (-1, -1, -1, 1) + dilation = (1, 1, 1, 2) + else: + strides = None + dilation = None + + input_arranged = input.tile(tile_shape, strides=strides, dilation=dilation) + input_arranged = input_arranged.tile((1, 1, 1, 2)) + input_arranged.dtype = input_arranged.dtype.squeeze((0, 1, 2)) + input_arranged.dtype.dtype = input_arranged.dtype.dtype.squeeze((0, 1, 2)) + + sin_table_arranged = sin_table.tile(tile_shape) + sin_table_arranged.dtype = sin_table_arranged.dtype.squeeze((0, 1, 2)) + + cos_table_arranged = cos_table.tile(tile_shape) + cos_table_arranged.dtype = cos_table_arranged.dtype.squeeze((0, 1, 2)) + + return input_arranged, sin_table_arranged, cos_table_arranged +``` + +#### 模式 F:窗口 + ravel + flatten(max_pool2d) + +先 tile 窗口,再 ravel+flatten 将窗口内元素展平,最后 tile 做 block: + +```python +def arrangement(input, output): + input_arranged = input.tile((1, 1, WINDOW_HEIGHT, WINDOW_WIDTH)) + input_arranged = input_arranged.ravel() + input_arranged = input_arranged.flatten(end_dim=4).flatten(start_dim=1) + input_arranged = input_arranged.tile((BLOCK_SIZE, -1)) + + output_arranged = output.tile((1, 1, 1, 1)) + output_arranged = output_arranged.ravel() + output_arranged = output_arranged.flatten(end_dim=4).flatten(start_dim=1) + output_arranged = output_arranged.tile((BLOCK_SIZE, -1)) + output_arranged.dtype = output_arranged.dtype.squeeze(1) + + return input_arranged, output_arranged +``` + +#### 模式 G:复用作现有 arrangement(addmm) + +```python +# addmm —— 复用 mm.arrangement,额外传 input/beta/alpha +def arrangement(input, mat1, mat2, beta, alpha, output): + _, _, input_arranged = mm.arrangement(mat1, mat2, input) + mat1_arranged, mat2_arranged, output_arranged = mm.arrangement(mat1, mat2, output) + return input_arranged, mat1_arranged, mat2_arranged, beta, alpha, output_arranged +``` + +#### 模式 H:conv2d —— im2col 式 flatten + 复用 mm + +```python +def arrangement(input, filter, output): + input_arranged = input.tile((1, *filter.shape[1:]), strides=(-1, -1, 1, 1)) + input_arranged = input_arranged.squeeze(1) + input_arranged.dtype = input_arranged.dtype.squeeze(0) + input_arranged = input_arranged.ravel() + input_arranged = input_arranged.flatten(end_dim=3).flatten(start_dim=1) + + filter_arranged = filter.flatten(start_dim=1) + filter_arranged = filter_arranged.permute((1, 0)) + + output_arranged = output.permute((0, 2, 3, 1)).flatten(end_dim=3) + + return mm.arrangement(input_arranged, filter_arranged, output_arranged) +``` + +#### 模式 I:Attention —— online softmax + +```python +def arrangement( + q, k, v, scale, o, BLOCK_SIZE_M=BLOCK_SIZE_M, BLOCK_SIZE_N=BLOCK_SIZE_N +): + def arrange_q_or_o(input): + arranged = input.tile((1, 1, BLOCK_SIZE_M, -1)) + arranged.dtype = arranged.dtype.squeeze((0, 1)) + return arranged + + def arrange_k_or_v(input): + arranged = input.tile((1, 1, BLOCK_SIZE_N, -1)) + arranged = arranged.tile((1, 1, -1, -1)) + arranged = arranged.expand((-1, -1, q_arranged.shape[-2], -1)) + arranged.dtype = arranged.dtype.squeeze((0, 1, 3)) + arranged.dtype.dtype = arranged.dtype.dtype.squeeze((0, 1)) + return arranged + + q_arranged = arrange_q_or_o(q) + return q_arranged, arrange_k_or_v(k), arrange_k_or_v(v), scale, arrange_q_or_o(o) +``` + +### 4.2 Arrangement 方法速查 + +| 方法 | 作用 | 示例 | +|------|------|------| +| `.tile(shape)` | 按 shape 分块 | `tensor.tile((BLOCK_SIZE,))` | +| `.tile(shape, strides=..., dilation=...)` | 自定义步幅/膨胀分块 | RoPE 交错访问 | +| `.expand(shape)` | 广播扩展维度(类似 torch.expand) | mm 中 K 维对齐 | +| `.squeeze(dim)` | 在 arrangement 的 dtype 上下文删除维度 | `dtype.squeeze(0)` | +| `.ravel()` | 将连续内存 tile 展平为一维 | pool2d 窗口展平 | +| `.flatten(start_dim, end_dim)` | 展平指定维度范围 | conv2d 的 im2col | +| `.permute(axes)` | 重排维度顺序 | conv2d filter/output 变换 | + +### 4.3 `.dtype` 操作详解 + +`arranged_tensor.dtype` 是一个代理对象,用来在符号层面描述数据类型和维度关系。 + +```python +# 删掉某个轴(该轴大小变为 1 时有效) +arranged.dtype = arranged.dtype.squeeze(0) +arranged.dtype = arranged.dtype.squeeze((0, 1)) + +# 进一步操作深层 dtype(多重 tile/expand 后) +arranged.dtype.dtype = arranged.dtype.dtype.squeeze(0) +arranged.dtype.dtype = arranged.dtype.dtype.squeeze((0, 1, 2)) +``` + +--- + +## 5. Application —— 计算逻辑模式大全 + +### 5.1 标量算术模式 + +```python +# add —— 逐元素加法 +def application(input, other, output): + output = input + other # noqa: F841 +``` + +### 5.2 激活函数模式 + +```python +# silu: x * sigmoid(x) +def application(input, output): + input_loaded = input + output = input_loaded * ntl.sigmoid(ntl.cast(input_loaded, ntl.float32)) # noqa: F841 + +# swiglu: a * (b * sigmoid(b)) +def application(a, b, c): + b_loaded = b + gate = b_loaded * ntl.sigmoid(ntl.cast(b_loaded, ntl.float32)) + c = a * gate # noqa: F841 +``` + +# GELU approximate(tanh 近似,用 ntl.exp 手动实现 tanh) +```python +def application(input, output): + # tanh 近似: 0.5*x*(1+tanh(sqrt(2/pi)*(x+0.044715*x^3))) + # 所有数值必须用字面量!不能引用 math.pi 或模块级变量(AST 跟踪陷阱) + t = 0.7978845608028654 * (input + 0.044715 * input * input * input) + exp_t = ntl.exp(t) + exp_neg_t = ntl.exp(-t) + output = 0.5 * input * (1.0 + (exp_t - exp_neg_t) / (exp_t + exp_neg_t)) # noqa: F841 + +# GELU exact(标准 erf 公式) +```python +def application(input, output): + output = input * 0.5 * (1.0 + ntl.erf(input / ntl.sqrt(2.0))) # noqa: F841 +``` + +**GELU 注意事项**: +- `x ** 3` 不可用(Triton tensor 不支持 `__pow__`) → 用 `x * x * x` +- `ntl.sqrt(2.0 / math.pi)` 不可用 → 用字面量 `0.7978845608028654` +- `ntl.tanh` 不可用 → 用 `(exp(t)-exp(-t))/(exp(t)+exp(-t))` +- 测试对比:`torch.nn.functional.gelu(x, approximate='tanh')`(近似版)和 `torch.nn.functional.gelu(x)`(精确版) + +### 5.3 归约模式 + +```python +# softmax: online softmax(行级归约) +def application(input, output): + input_loaded = input + row_minus_max = input_loaded - ntl.max(input_loaded) + numerator = ntl.exp(row_minus_max) + denominator = ntl.sum(numerator) + output = numerator / denominator # noqa: F841 + +# max_pool2d: 窗口内 max +def application(input, output): + output = ntl.max(input, axis=1) # noqa: F841 +``` + +### 5.4 矩阵乘模式 + +```python +# mm —— 累加 dot product +def application(input, other, output): + accumulator = ntl.zeros(output.shape, dtype=ntl.float32) + for k in range(input.shape[0]): + accumulator += ntl.dot(input[k], other[k]) + output = accumulator + +# bmm —— 直接复用 mm.application +application = mm.application +``` + +### 5.5 归一化模式 + +```python +# rms_norm: x / sqrt(mean(x^2) + eps) +def application(input, eps, output): + input_fp32 = ntl.cast(input, ntl.float32) + output = input_fp32 * ntl.rsqrt( # noqa: F841 + ntl.sum(input_fp32 * input_fp32) / input.shape[-1] + eps + ) + +# fused_rms_norm: (x / sqrt(mean(x^2) + eps)) * w +def application(x, w, eps, y): + x_fp32 = ntl.cast(x, ntl.float32) + y = x_fp32 * ntl.rsqrt(ntl.sum(x_fp32 * x_fp32) / x.shape[-1] + eps) * w # noqa: F841 +``` + +### 5.6 Attention 模式(Online Softmax Flash Attention) + +```python +def application(q, k, v, scale, o): + # 变换 Q 使其与 K 的乘积可表示为 exp2 形式 + q_loaded = (q * scale * 1.44269504089).to(q.dtype) # log2(e) + acc = ntl.zeros((q.shape[-2], q.shape[-1]), dtype=ntl.float32) + l_i = ntl.full((q.shape[-2],), 1, dtype=ntl.float32) + m_i = ntl.full((q.shape[-2],), float("-inf"), dtype=ntl.float32) + + for i in range(k.shape[0]): + qk = ntl.dot(q_loaded, ntl.trans(k[i])) + qk = ntl.where(k[i].offsets(-2) < k.source.shape[-2], qk, float("-inf")) + m_ij = ntl.maximum(m_i, ntl.max(qk, 1)) + p = ntl.exp2(qk - m_ij[:, None]) + l_ij = ntl.sum(p, 1) + alpha = ntl.exp2(m_i - m_ij) + acc = acc * alpha[:, None] + ntl.dot(p.to(v.dtype.dtype), v[i]) + m_i = m_ij + l_i = l_i * alpha + l_ij + + acc /= l_i[:, None] + o = acc.to(o.dtype) # noqa: F841 +``` + +### 5.7 RoPE 模式(索引级写) + +```python +def application(input, sin_table, cos_table): + sin_table_loaded = sin_table + cos_table_loaded = cos_table + input_0 = input[0] + input_1 = input[1] + input[0] = input_0 * cos_table_loaded - input_1 * sin_table_loaded + input[1] = input_0 * sin_table_loaded + input_1 * cos_table_loaded +``` + +### 5.8 Composite 模式(复用现有 application) + +```python +# addmm —— 先用 mm 算 matmul,再加 bias+scale +def application(input, mat1, mat2, beta, alpha, output): + mm.application(mat1, mat2, output) + output = beta * input + alpha * output +``` + +### 5.9 application 注意事项 + +- **末尾必须用 `# noqa: F841`** 标注赋值,因为 Triton 在汇编阶段才会实际使用变量 +- `x.shape`、`x.dtype` 在 application 中是符号表达式,不是具体数值/类型 +- `x.source` 可访问到原始未 tile 的张量元信息 +- `.offsets(dim)` 返回当前块在指定维度的起始偏移量 +- 所有运算都是**符号化**的,最终由 Triton JIT 编译为 GPU 代码 + +--- + +## 6. Kernel 创建与调用 + +### 6.1 标准创建 + +```python +kernel = ninetoothed.make(arrangement, application, tensors) +``` + +### 6.2 多 kernel 分支(RoPE) + +```python +interleaved_kernel = ninetoothed.make( + functools.partial(arrangement, interleaved=True), application, inputs +) +non_interleaved_kernel = ninetoothed.make( + functools.partial(arrangement, interleaved=False), application, inputs +) + +def kernel(input, sin_table, cos_table, interleaved=True): + return (interleaved_kernel if interleaved else non_interleaved_kernel)( + input, sin_table, cos_table + ) +``` + +### 6.3 调用 kernel + +```python +# constexpr 符号作为 kwargs 传入 +kernel(input, other, output, BLOCK_SIZE=1024) + +# meta/block_size 符号无需传入,由 autotune 自动搜索 +kernel(mat1, mat2, output) +``` + +--- + +## 7. Torch 包装层模式 + +### 7.1 Element-wise(add, silu) + +```python +def add(input, other): + output = torch.empty_like(input) + ops.ninetoothed.kernels.add.kernel(input, other, output, BLOCK_SIZE=1024) + return output +``` + +### 7.2 Flatten 后调用(silu, swiglu) + +```python +def silu(input): + input_flat = input.flatten() + output_flat = torch.empty_like(input_flat) + ops.ninetoothed.kernels.silu.kernel(input_flat, output_flat, BLOCK_SIZE=1024) + return output_flat.view_as(input) +``` + +### 7.3 Matmul(mm, bmm, addmm) + +```python +def mm(input, other): + output_shape = (input.shape[0], other.shape[1]) + output = torch.empty(output_shape, dtype=input.dtype, device=input.device) + ops.ninetoothed.kernels.mm.kernel(input, other, output) + return output +``` + +### 7.4 Norm(rms_norm, fused_rms_norm) + +```python +def fused_rms_norm(x, w, eps=None): + if eps is None: + eps = torch.finfo(x.dtype).eps() + x_2d = x.view(-1, x.shape[-1]) + w_2d = w.expand_as(x_2d) + y_2d = torch.empty_like(x_2d) + ops.ninetoothed.kernels.fused_rms_norm.kernel(x_2d, w_2d, eps, y_2d, BLOCK_SIZE=x.shape[-1]) + return y_2d.view(x.shape) +``` + +### 7.5 Conv2d + +```python +def conv2d(input, filter): + n, _, h, w = input.shape + k, _, r, s = filter.shape + p, q = h - r + 1, w - s + 1 + output = torch.empty((n, k, p, q), dtype=input.dtype, device=input.device) + ops.ninetoothed.kernels.conv2d.kernel(input, filter, output) + return output +``` + +### 7.6 Attention + +```python +def scaled_dot_product_attention(q, k, v, scale=None): + if scale is None: + scale = 1 / math.sqrt(q.shape[-1]) + o = torch.empty_like(q) + ops.ninetoothed.kernels.scaled_dot_product_attention.kernel(q, k, v, scale, o) + return o +``` + +### 7.7 Pooling + +```python +def max_pool2d(input, window_shape): + n, c, h, w = input.shape + r, s = window_shape + p = math.ceil((h - r) / r + 1) + q = math.ceil((w - s) / s + 1) + output = torch.empty(n, c, p, q, dtype=input.dtype, device=input.device) + ops.ninetoothed.kernels.max_pool2d.kernel(input, output, WINDOW_HEIGHT=r, WINDOW_WIDTH=s) + return output +``` + +--- + +## 8. AST 跟踪陷阱(Nineteethed DSL 特有) + +Nineteethed 使用 Python AST 跟踪来生成 Triton 代码。`application()` 函数中**出现的所有 Python 变量名都会被原样嵌入生成的 Triton 代码**,而 Triton 编译环境中没有标准 Python 模块。 + +### 8.1 常见错误模式 + +```python +# ❌ 错误:math 模块在 Triton 中不存在 +def application(input, output): + t = ntl.sqrt(2.0 / math.pi) * input # NameError: name 'math' is not defined + +# ❌ 错误:模块级变量也被嵌入 +_SQRT_2_OVER_PI = math.sqrt(2.0 / math.pi) +def application(input, output): + t = _SQRT_2_OVER_PI * input # NameError: name '_SQRT_2_OVER_PI' is not defined + +# ❌ 错误:__pow__ 在 Triton tensor 上不存在 +def application(input, output): + t = input ** 3 # AttributeError: 'tensor' object has no attribute '__pow__' + +# ❌ 错误:ntl.tanh 不存在 +def application(input, output): + output = ntl.tanh(input) # AttributeError +``` + +### 8.2 正确做法 + +```python +# ✅ 正确:使用字面量数值,inline 所有常量 +def application(input, output): + t = 0.7978845608028654 * (input + 0.044715 * input * input * input) + exp_t = ntl.exp(t) + exp_neg_t = ntl.exp(-t) + output = 0.5 * input * (1.0 + (exp_t - exp_neg_t) / (exp_t + exp_neg_t)) # noqa: F841 +``` + +### 8.3 规则总结 + +| 规则 | 说明 | +|------|------| +| ✅ 允许 | 字面量数值(`1.0`, `0.5`)、`ntl.*` API、基础运算符(`+`, `-`, `*`, `/`) | +| ❌ 禁止 | `math.*`、`torch.*`、模块级 Python 变量、`**` 运算符、NumPy 函数 | +| 🔧 替代方案 | 字面量 inline、`x * x * x` 替代 `x ** 3`、`ntl.exp` 组合替代 `ntl.tanh` | + +--- + +## 9. ntl 语言速查 + +```python +import ninetoothed.language as ntl +``` + +### 8.1 类型转换 + +| API | 说明 | +|-----|------| +| `ntl.cast(x, ntl.float32)` | 显式类型转换 | +| `x.to(new_dtype)` | 隐式类型转换 | +| `ntl.float32` | float32 类型常量 | + +### 8.2 数学运算 + +| API | 说明 | +|-----|------| +| `ntl.sigmoid(x)` | Sigmoid 激活函数 | +| `ntl.exp(x)` | 自然指数 | +| `ntl.exp2(x)` | 2 的幂 | +| `ntl.rsqrt(x)` | 1/sqrt(x) | +| `ntl.max(x)` | 最大值(单输入) | +| `ntl.max(x, axis=N)` | 沿指定轴最大值 | +| `ntl.maximum(a, b)` | 逐元素最大值(二元) | +| `ntl.sum(x)` | 求和 | +| `ntl.sum(x, axis=N)` | 沿轴求和 | +| `ntl.dot(a, b)` | 矩阵乘法(二维块级) | +| `ntl.trans(x)` | 转置 | +| `ntl.where(cond, x, y)` | 条件选择 | + +### 8.3 初始化/创建 + +| API | 说明 | +|-----|------| +| `ntl.zeros(shape, dtype)` | 全零(在 application 内初始化累加器) | +| `ntl.full(shape, value, dtype)` | 填充指定值 | + +### 8.4 张量元信息 + +| 属性 | 说明 | +|------|------| +| `x.shape` | tile 之后的局部形状 | +| `x.dtype` | tile 之后的数据类型 | +| `x.source` | 原始(未 tile)张量元信息 | +| `x.offsets(dim)` | 当前块在 dim 维度的起始偏移 | + +--- + +## 10. 常见模式速查 + +### 10.1 1D Element-wise 模式 + +```python +BLOCK_SIZE = Symbol("BLOCK_SIZE", constexpr=True) +def arrangement(input, output, BLOCK_SIZE=BLOCK_SIZE): + return input.tile((BLOCK_SIZE,)), output.tile((BLOCK_SIZE,)) +def application(input, output): + output = # noqa: F841 +tensors = (Tensor(1), Tensor(1)) +kernel = ninetoothed.make(arrangement, application, tensors) +``` + +### 10.2 2D 行归约模式 + +```python +BLOCK_SIZE = Symbol("BLOCK_SIZE", constexpr=True) +def arrangement(input, output, BLOCK_SIZE=BLOCK_SIZE): + return input.tile((1, BLOCK_SIZE)), output.tile((1, BLOCK_SIZE)) +def application(input, output): + output = # noqa: F841 +tensors = (Tensor(2, other=float("-inf")), Tensor(2)) +kernel = ninetoothed.make(arrangement, application, tensors) +``` + +### 10.3 Matmul 模式 + +```python +BLOCK_SIZE_M = block_size() +BLOCK_SIZE_N = block_size() +BLOCK_SIZE_K = block_size() +# ... mm arrangement + 循环 dot +tensors = (Tensor(2), Tensor(2), Tensor(2)) +kernel = ninetoothed.make(arrangement, application, tensors) +``` + +### 9.4 复用模式 + +```python +# 复用 application +from somewhere import application as base_application +tensors = ... +kernel = ninetoothed.make(arrangement, base_application, tensors) + +# 复用 arrangement + application +from somewhere import arrangement as base_arrangement, application as base_application +tensors = ... +kernel = ninetoothed.make(base_arrangement, base_application, tensors) +``` + +### 9.5 包装层 flatten 模式 + +```python +def op(input): + flat = input.flatten() + out_flat = torch.empty_like(flat) + kernel(flat, out_flat, BLOCK_SIZE=1024) + return out_flat.view_as(input) +``` + +### 9.6 标量参数传递 + +```python +# Tensor(0) 标量在 arrangement 中直接返回 +def arrangement(x, eps, y): + return x.tile((1, BLOCK_SIZE)), eps, y.tile((1, BLOCK_SIZE)) + +# kernel 调用时直接传值 +kernel(input, eps_value, output, BLOCK_SIZE=1024) +``` + +--- + +## 11. 完整的算子开发工作流 + +### 步骤 1:创建 kernel 文件 + +``` +ops/ninetoothed/kernels/my_op.py +``` + +### 步骤 2:实现 arrangement + application + +```python +import ninetoothed +import ninetoothed.language as ntl +from ninetoothed import Symbol, Tensor + +BLOCK_SIZE = Symbol("BLOCK_SIZE", constexpr=True) + +def arrangement(input, output, BLOCK_SIZE=BLOCK_SIZE): + return input.tile((BLOCK_SIZE,)), output.tile((BLOCK_SIZE,)) + +def application(input, output): + output = ntl.sigmoid(input) # noqa: F841 + +tensors = (Tensor(1), Tensor(1)) +kernel = ninetoothed.make(arrangement, application, tensors) +``` + +### 步骤 3:添加 torch 包装 + +在 `ops/ninetoothed/torch.py` 中添加: + +```python +def my_op(input): + flat = input.flatten() + out_flat = torch.empty_like(flat) + ops.ninetoothed.kernels.my_op.kernel(flat, out_flat, BLOCK_SIZE=1024) + return out_flat.view_as(input) +``` + +### 步骤 4:测试 + +```python +import torch +from ops.ninetoothed.torch import my_op + +x = torch.randn(4, 128, device="cuda") +result = my_op(x) +expected = torch.sigmoid(x) +assert torch.allclose(result, expected, atol=1e-5) +print("PASS") +``` + +--- + +## 12. 调试技巧 + +1. **检查 tile 形状**:在 arrangement 中打印 `x.shape` 观察分块是否符合预期 +2. **dtype 操作链**:多层 `tile/expand` 后可能需要 `squeeze` 多次才能恢复正确的 dtype 结构 +3. **noqa: F841**:application 中每个赋值语句都需要标注,否则 Python 字节码分析会警告 +4. **`x.source`**:当需要引用原始张量信息时使用,例如 attention 中的 mask 边界检查 +5. **`offsets(dim)`**:用于在 application 中判断当前块是否越界(如 attention 中 `k[i].offsets(-2) < k.source.shape[-2]`) +6. **BLOCK_SIZE 选择策略**: + - element-wise:256~1024 + - matmul:M/N 64~128,K 32~64 (通常由 autotune 自动搜索) + - softmax/reduction:取 `input.shape[-1]` 作为 BLOCK_SIZE(覆盖整行) +7. **测试时注意非连续张量**:如果包装层没有显式 `.contiguous()`,需确保 arrangement 正确处理 stride diff --git a/skills/competition/nineops-skills/examples/elementwise_broadcast_add/README.md b/skills/competition/nineops-skills/examples/elementwise_broadcast_add/README.md new file mode 100644 index 0000000..8332393 --- /dev/null +++ b/skills/competition/nineops-skills/examples/elementwise_broadcast_add/README.md @@ -0,0 +1,33 @@ +# Element-wise Broadcast Add — 示例场景 + +## 目标 + +实现一个支持广播的 element-wise add kernel,作为 `elementwise_1d` 模式的展示。 + +## 任务描述 + +实现 `ninetoothed_add` 函数,接受两个 tensor,支持广播语义,使用 `elementwise_1d` 模板。 + +## 关键点 + +1. **Broadcast** — 第二个输入可以比第一个输入小,需要自动 broadcast +2. **Mask 处理** — 边界处理 +3. **BLOCK_SIZE** — 使用 1024 或 autotune + +## 运行 + +```bash +python examples/elementwise_broadcast_add/run.py # 测试正确性 +python examples/elementwise_broadcast_add/benchmark.py # 基准测试 +``` + +## 预期结果 + +- 所有 correctness 测试通过(fp32/fp16, 各种 shape, 各种 broadcast 场景) +- Benchmark 至少达到 PyTorch CUDA 80% 的性能 + +## Torture Test 提示 + +- scalar broadcast: `y.shape=(1,)` with `x.shape=(N,)` +- strided broadcast: `x.shape=(128, 64)`, `y.shape=(64,)` broadcasting across dim=-1 +- non-contiguous: `y = y.as_strided(...)` with non-standard strides diff --git a/skills/competition/nineops-skills/examples/elementwise_broadcast_add/benchmark.py b/skills/competition/nineops-skills/examples/elementwise_broadcast_add/benchmark.py new file mode 100644 index 0000000..900a7cf --- /dev/null +++ b/skills/competition/nineops-skills/examples/elementwise_broadcast_add/benchmark.py @@ -0,0 +1,132 @@ +""" +elementwise_broadcast_add/benchmark.py +Broadcast add kernel 的 benchmark。 + +对照要素: + 1. 算子: element-wise add with broadcast + 2. 输入规模: 多种 shape 和 dtype 组合 + 3. 硬件: 自动检测 + 4. 布局: BLOCK_SIZE=1024 + 5. Baseline: torch.add (CUDA) + 6. Ninetoothed: 本实现 + 7. 实现差异: tile 布局选择 + 8. Fallback: N/A +""" + +import torch +from time import perf_counter + +# 尝试导入 kernel,失败则 fallback 到 demo +try: + from run import make_broadcast_add_elementwise_1d +except ImportError: + import sys + sys.path.insert(0, ".") + from run import make_broadcast_add_elementwise_1d + + +def benchmark(shape_a, shape_b, dtype, label, warmup=10, repeats=100): + """Benchmark kernel vs torch add.""" + x = torch.randn(shape_a, device="cuda", dtype=dtype) + y = torch.randn(shape_b, device="cuda", dtype=dtype) + out = torch.empty(shape_a if len(shape_a) >= len(shape_b) else shape_b, + device="cuda", dtype=dtype) + kernel = make_broadcast_add_elementwise_1d() + blk = 1024 if max(out.shape) <= 1024 else 2048 + + # 预热 + for _ in range(warmup): + kernel(x, y, out, BLOCK_SIZE=blk) + x + y + torch.cuda.synchronize() + + # 测量 kernel + start = perf_counter() + for _ in range(repeats): + kernel(x, y, out, BLOCK_SIZE=blk) + torch.cuda.synchronize() + kernel_ms = (perf_counter() - start) / repeats * 1000 + + # 测量 torch + start = perf_counter() + for _ in range(repeats): + x + y + torch.cuda.synchronize() + torch_ms = (perf_counter() - start) / repeats * 1000 + + speedup = torch_ms / kernel_ms if kernel_ms > 0 else 0 + + print(f"| {label:35s} | {kernel_ms:8.3f} | {torch_ms:8.3f} | {speedup:5.2f}x |") + + return {"label": label, "kernel_ms": kernel_ms, "torch_ms": torch_ms, "speedup": speedup} + + +def main(): + print("=" * 75) + print("Broadcast Add Benchmark") + print("=" * 75) + gpu_name = torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU" + print(f"GPU: {gpu_name}") + print(f"CUDA: {torch.version.cuda}") + print() + print(f"| {'Config':35s} | {'本实现(ms)':>8s} | {'PyTorch(ms)':>8s} | {'Speedup':>6s} |") + print(f"|{'-'*35}|{'-'*10}|{'-'*10}|{'-'*8}|") + + results = [] + + # FP32, various sizes + sizes = [(1024,), (4096,), (65536,), (131072,)] + for s in sizes: + results.append(benchmark(s, s, torch.float32, f"({s[0]},) fp32")) + + # FP16 + results.append(benchmark((4096,), (4096,), torch.float16, "(4096,) fp16")) + results.append(benchmark((65536,), (65536,), torch.float16, "(65536,) fp16")) + + # BF16 (if available) + if hasattr(torch, 'bfloat16'): + try: + results.append(benchmark((4096,), (4096,), torch.bfloat16, "(4096,) bf16")) + except Exception: + print(f"| {'(4096,) bf16':35s} | {'N/A':>8s} | {'N/A':>8s} | {'N/A':>5s} |") + + # 广播场景 + results.append(benchmark((4096,), (1,), torch.float32, "(4096,) + (1,) scalar")) + results.append(benchmark((256, 768), (768,), torch.float32, "(256,768) + (768,) vec")) + results.append(benchmark((4, 128, 256), (256,), torch.float32, "(4,128,256) + (256,)")) + results.append(benchmark((4, 128, 256), (1,), torch.float32, "(4,128,256) + (1,)")) + + # 非连续场景 + x = torch.randn(768, 256, device="cuda") + y = torch.randn(256, device="cuda") + # Transposed + x_t = x.t().contiguous().t() # force non-contiguous + kernel = make_broadcast_add_elementwise_1d() + out = torch.empty(256, 768, device="cuda") + + warmup, repeats = 10, 100 + for _ in range(warmup): + kernel(x_t, y, out, BLOCK_SIZE=768) + x_t + y + torch.cuda.synchronize() + + start = perf_counter() + for _ in range(repeats): + kernel(x_t, y, out, BLOCK_SIZE=768) + torch.cuda.synchronize() + k_ms = (perf_counter() - start) / repeats * 1000 + + start = perf_counter() + for _ in range(repeats): + x_t + y + torch.cuda.synchronize() + t_ms = (perf_counter() - start) / repeats * 1000 + + print(f"| {'(256,768).T + (768,)':35s} | {k_ms:8.3f} | {t_ms:8.3f} | {t_ms/k_ms:5.2f}x |") + + print() + print("Done.") + + +if __name__ == "__main__": + main() diff --git a/skills/competition/nineops-skills/examples/elementwise_broadcast_add/run.py b/skills/competition/nineops-skills/examples/elementwise_broadcast_add/run.py new file mode 100644 index 0000000..a351a83 --- /dev/null +++ b/skills/competition/nineops-skills/examples/elementwise_broadcast_add/run.py @@ -0,0 +1,134 @@ +""" +elementwise_broadcast_add/run.py +Broadcast add kernel: elementwise_1d 模式的标准实现。 + +测试覆盖: + - fp32/fp16/bf16 + - contiguous / non-contiguous (strided) + - broadcast scenarios (scalar, vector, 3D) + - shape 不能整除 BLOCK_SIZE 的边界情况 +""" + +import torch +import ninetoothed +import ninetoothed.language as ntl +from ninetoothed import Symbol, Tensor + +# ── 方法 A: 使用 elementwise_1d 模板 ────────────────────────────── + +def make_broadcast_add_elementwise_1d(): + """ + Elementwise-1D 模式: 所有 tensor 沿最后一维做 tile((BLOCK_SIZE,))。 + """ + BLOCK_SIZE = Symbol("BLOCK_SIZE", constexpr=True) + + def arrangement(input, other, output, BLOCK_SIZE=BLOCK_SIZE): + return ( + input.tile((BLOCK_SIZE,)), + other.tile((BLOCK_SIZE,)), + output.tile((BLOCK_SIZE,)), + ) + + def application(input, other, output): + output = input + other # noqa: F841 + + return ninetoothed.make( + arrangement, + application, + tensors=(Tensor(1), Tensor(1), Tensor(1)), + ) + +# ── 方法 B: 手动 broadcast 展开(用于理解和对比)─────────────────── + +def make_broadcast_add_manual(): + BLOCK_SIZE = Symbol("BLOCK_SIZE", constexpr=True) + + def arrangement(input, other, output, BLOCK_SIZE=BLOCK_SIZE): + return ( + input.tile((BLOCK_SIZE,)), + other.tile((BLOCK_SIZE,)), + output.tile((BLOCK_SIZE,)), + ) + + def application(input, other, output): + output = input + other # noqa: F841 + + return ninetoothed.make( + arrangement, + application, + tensors=(Tensor(1), Tensor(1), Tensor(1)), + ) + + +# ── 使用示例 ────────────────────────────────────────────────────── + +def demo(): + print("=" * 50) + print("Broadcast Add — Elementwise 1D 示例") + print("=" * 50) + + # 创建 kernel + kernel = make_broadcast_add_elementwise_1d() + print("✅ Kernel created") + + # 测试 1: 基本加法 + x = torch.randn(4096, device="cuda") + y = torch.randn(4096, device="cuda") + out = torch.empty(4096, device="cuda") + kernel(x, y, out, BLOCK_SIZE=1024) + expected = x + y + assert torch.allclose(out, expected, atol=1e-5, rtol=1e-5), "Basic add failed" + print("✅ Basic add: contiguous, same shape — PASS") + + # 测试 2: Scalar broadcast + y_scalar = torch.randn(1, device="cuda") + out = torch.empty(4096, device="cuda") + kernel(x, y_scalar, out, BLOCK_SIZE=1024) + expected = x + y_scalar + assert torch.allclose(out, expected, atol=1e-5, rtol=1e-5), "Scalar broadcast failed" + print("✅ Scalar broadcast: (4096,) + (1,) — PASS") + + # 测试 3: Vector broadcast (2D + 1D) + x2d = torch.randn(128, 256, device="cuda") + y1d = torch.randn(256, device="cuda") + kernel_2d = make_broadcast_add_elementwise_1d() + out = torch.empty(128, 256, device="cuda") + kernel_2d(x2d, y1d, out, BLOCK_SIZE=256) + expected = x2d + y1d + assert torch.allclose(out, expected, atol=1e-5, rtol=1e-5), "2D+1D broadcast failed" + print("✅ 2D+1D broadcast: (128, 256) + (256,) — PASS") + + # 测试 4: Non-contiguous (transposed) + x_t = torch.randn(256, 128, device="cuda").t() # shape (128, 256) + y_t = torch.randn(128, device="cuda") + kernel_nc = make_broadcast_add_elementwise_1d() + out = torch.empty(128, 256, device="cuda") + kernel_nc(x_t, y_t, out, BLOCK_SIZE=256) + expected = x_t + y_t + assert torch.allclose(out, expected, atol=1e-5, rtol=1e-5), "Non-contiguous broadcast failed" + print("✅ Non-contiguous: (128, 256).T + (128,) — PASS") + + # 测试 5: 边界情况 — shape 不能被 BLOCK_SIZE 整除 + x_small = torch.randn(100, device="cuda") + y_small = torch.randn(1, device="cuda") + out = torch.empty(100, device="cuda") + kernel(x_small, y_small, out, BLOCK_SIZE=128) + expected = x_small + y_small + assert torch.allclose(out, expected, atol=1e-5, rtol=1e-5), "Uneven shape failed" + print("✅ Uneven shape: (100,) + (1,) — PASS") + + # 测试 6: FP16 + x_fp16 = torch.randn(4096, device="cuda", dtype=torch.float16) + y_fp16 = torch.randn(4096, device="cuda", dtype=torch.float16) + out = torch.empty(4096, device="cuda", dtype=torch.float16) + kernel(x_fp16, y_fp16, out, BLOCK_SIZE=1024) + expected = x_fp16 + y_fp16 + assert torch.allclose(out, expected, atol=1e-3, rtol=1e-3), "FP16 failed" + print("✅ FP16: contiguous — PASS") + + print() + print("🎉 所有测试通过!") + + +if __name__ == "__main__": + demo() diff --git a/skills/competition/nineops-skills/examples/non_contiguous_stride_case/README.md b/skills/competition/nineops-skills/examples/non_contiguous_stride_case/README.md new file mode 100644 index 0000000..5e3c927 --- /dev/null +++ b/skills/competition/nineops-skills/examples/non_contiguous_stride_case/README.md @@ -0,0 +1,42 @@ +# Non-Contiguous Stride Case — 示例场景 + +## 目标 + +测试和验证 ninetoothed 在 non-contiguous 输入下的行为,作为 "stride 正确处理" 的展示。 + +## 任务描述 + +ninotoothed 在 non-contiguous tensor 上是否能正确计算 stride 和偏移?本场景通过 add kernel 测试多种 strided 模式。 + +## 覆盖的 stride 变体 + +| 变体 | 创建方式 | 预期 | +|------|----------|------| +| contiguous | `torch.randn(M, N)` | ✅ 基准 | +| transposed | `.t()` | ✅ stride 交换 | +| sliced | `x[::2, :]` | ✅ offset 步进 | +| view | `.view()` | ✅ 连续视图 | +| expanded | `.expand()` | ✅ 广播扩展 | +| permuted | `.permute(1, 0)` | ✅ 维度重排 | +| as_strided | `.as_strided(...)` | ✅ 自定义 stride | + +## 核心问题 + +对于 non-contiguous tensor,ninetoothed 的 underlying ptr 计算是否为: + +``` +ptr + row * stride_row + col * stride_col +``` + +而不是: + +``` +ptr + row * cols + col // 仅用于 contiguous +``` + +## 运行 + +```bash +python examples/non_contiguous_stride_case/run.py +python examples/non_contiguous_stride_case/benchmark.py +``` diff --git a/skills/competition/nineops-skills/examples/non_contiguous_stride_case/benchmark.py b/skills/competition/nineops-skills/examples/non_contiguous_stride_case/benchmark.py new file mode 100644 index 0000000..c36848f --- /dev/null +++ b/skills/competition/nineops-skills/examples/non_contiguous_stride_case/benchmark.py @@ -0,0 +1,78 @@ +""" +non_contiguous_stride_case/benchmark.py +Non-contiguous tensor 的 benchmark: 对比 contiguous vs strided 加法的性能差异。 +""" + +import torch +from time import perf_counter + +try: + from run import make_add_kernel, make_2d_add_kernel +except ImportError: + import sys + sys.path.insert(0, ".") + from run import make_add_kernel, make_2d_add_kernel + + +def benchmark(name, x, y=None, warmup=10, repeats=100): + if y is None: + y = torch.randn(x.shape[-1], device="cuda") + kernel = make_add_kernel() if x.ndim == 1 else make_2d_add_kernel() + out = torch.empty_like(x) + BLOCK_SIZE = 1024 + + for _ in range(warmup): + kernel(x, y, out, BLOCK_SIZE=BLOCK_SIZE) + x + y + torch.cuda.synchronize() + + start = perf_counter() + for _ in range(repeats): + kernel(x, y, out, BLOCK_SIZE=BLOCK_SIZE) + torch.cuda.synchronize() + kernel_ms = (perf_counter() - start) / repeats * 1000 + + start = perf_counter() + for _ in range(repeats): + x + y + torch.cuda.synchronize() + torch_ms = (perf_counter() - start) / repeats * 1000 + + speedup = torch_ms / kernel_ms if kernel_ms > 0 else 0 + print(f"| {name:35s} | {kernel_ms:8.3f} | {torch_ms:8.3f} | {speedup:5.2f}x |") + + return {"name": name, "kernel_ms": kernel_ms, "torch_ms": torch_ms, "speedup": speedup} + + +def main(): + print("=" * 75) + print("Non-Contiguous Add Benchmark") + print("=" * 75) + gpu = torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU" + print(f"GPU: {gpu} | CUDA: {torch.version.cuda}") + print() + print(f"| {'Config':35s} | {'本实现(ms)':>8s} | {'PyTorch(ms)':>8s} | {'Speedup':>6s} |") + print(f"|{'-'*35}|{'-'*10}|{'-'*10}|{'-'*8}|") + + M, N = 256, 1024 + + baseline = torch.randn(M, N, device="cuda") + y = torch.randn(N, device="cuda") + + benchmark(f"contiguous ({M}, {N})", baseline, y) + benchmark(f"transposed ({N}, {M}).t()", torch.randn(N, M, device="cuda").t(), y) + benchmark(f"sliced rows [::2]", torch.randn(M*2, N, device="cuda")[::2, :], y) + benchmark(f"sliced cols [::3]", torch.randn(M, N*3, device="cuda")[:, ::3], y) + benchmark(f"both sliced", torch.randn(M*2, N*3, device="cuda")[::2, ::3], y) + + # 广播场景 + y_scalar = torch.randn(1, device="cuda") + benchmark(f"contiguous + scalar (1,)", baseline, y_scalar) + benchmark(f"transposed + scalar (1,)", torch.randn(N, M, device="cuda").t(), y_scalar) + + print() + print("Done.") + + +if __name__ == "__main__": + main() diff --git a/skills/competition/nineops-skills/examples/non_contiguous_stride_case/run.py b/skills/competition/nineops-skills/examples/non_contiguous_stride_case/run.py new file mode 100644 index 0000000..735afba --- /dev/null +++ b/skills/competition/nineops-skills/examples/non_contiguous_stride_case/run.py @@ -0,0 +1,161 @@ +""" +non_contiguous_stride_case/run.py +测试 ninetoothed 在 non-contiguous tensor 上的 stride 处理。 + +使用 add kernel(最简单的 elementwise)来聚焦 stride 行为测试。 +""" + +import torch +import ninetoothed +import ninetoothed.language as ntl +from ninetoothed import Symbol, Tensor + + +def make_add_kernel(): + """简单的 elementwise add kernel,用于测试 stride。""" + BLOCK_SIZE = Symbol("BLOCK_SIZE", constexpr=True) + + def arrangement(input, other, output, BLOCK_SIZE=BLOCK_SIZE): + return ( + input.tile((BLOCK_SIZE,)), + other.tile((BLOCK_SIZE,)), + output.tile((BLOCK_SIZE,)), + ) + + def application(input, other, output): + output = input + other # noqa: F841 + + return ninetoothed.make( + arrangement, + application, + tensors=(Tensor(1), Tensor(1), Tensor(1)), + ) + + +def make_2d_add_kernel(): + """2D version, tile per row.""" + BLOCK_SIZE = Symbol("BLOCK_SIZE", constexpr=True) + + def arrangement(input, other, output, BLOCK_SIZE=BLOCK_SIZE): + return ( + input.tile((1, BLOCK_SIZE)), + other.tile((1, BLOCK_SIZE)), + output.tile((1, BLOCK_SIZE)), + ) + + def application(input, other, output): + output = input + other # noqa: F841 + + return ninetoothed.make( + arrangement, + application, + tensors=(Tensor(2), Tensor(2), Tensor(2)), + ) + + +def test_stride_case(name, x, y=None, kernel=None, BLOCK_SIZE=1024): + """通用 stride 测试函数。 + + 因为现在 kernel 需要 output 参数,由本函数自动分配。 + """ + if y is None: + y = torch.randn(x.shape[-1], device=x.device) + if kernel is None: + kernel = make_add_kernel() if x.ndim == 1 else make_2d_add_kernel() + + out = torch.empty_like(x) + try: + kernel(x, y, out, BLOCK_SIZE=BLOCK_SIZE) + expected = x + y + ok = torch.allclose(out, expected, atol=1e-5, rtol=1e-5) + status = "✅" if ok else "❌" + extra = f" | strides: {x.stride()}" if not ok else "" + print(f" {status} {name}{extra}") + return ok + except Exception as e: + print(f" ❌ {name} — CRASH: {e}") + import traceback + traceback.print_exc() + return False + + +def demo(): + print("=" * 60) + print("Stride / Non-Contiguous 测试") + print("=" * 60) + + device = "cuda" + results = [] + + # ── 1D tests ── + print("\n--- 1D ---") + kernel_1d = make_add_kernel() + + results.append(test_stride_case("contiguous 1D", + torch.randn(4096, device=device), kernel=kernel_1d)) + results.append(test_stride_case("sliced 1D [::2]", + torch.randn(8192, device=device)[::2], kernel=kernel_1d)) + results.append(test_stride_case("sliced 1D [::3]", + torch.randn(12288, device=device)[::3], kernel=kernel_1d)) + results.append(test_stride_case("small 1D (100)", + torch.randn(100, device=device), kernel=kernel_1d)) + + # ── 2D tests ── + print("\n--- 2D ---") + kernel_2d = make_2d_add_kernel() + + results.append(test_stride_case("contiguous (128, 256)", + torch.randn(128, 256, device=device), kernel=kernel_2d)) + results.append(test_stride_case("transposed (256, 128).t()", + torch.randn(256, 128, device=device).t(), kernel=kernel_2d)) + results.append(test_stride_case("sliced rows [::2]", + torch.randn(256, 256, device=device)[::2, :], kernel=kernel_2d)) + results.append(test_stride_case("sliced cols [::2]", + torch.randn(128, 512, device=device)[:, ::2], kernel=kernel_2d)) + results.append(test_stride_case("both sliced [::2, ::3]", + torch.randn(256, 768, device=device)[::2, ::3], kernel=kernel_2d)) + + # view + x_view = torch.randn(128, 256, device=device).view(128, 256) + results.append(test_stride_case("view (128, 256)", + x_view, kernel=kernel_2d)) + + # expanded (broadcast in add) + x_base = torch.randn(1, 256, device=device) + y_base = torch.randn(128, 1, device=device) + + # For expanded tensor, we create one then test + x_exp = x_base.expand(128, 256) + y_exp = y_base.expand(128, 256) + kernel_exp = make_2d_add_kernel() + out = torch.empty_like(x_exp) + kernel_exp(x_exp, y_exp, out, BLOCK_SIZE=256) + expected = x_exp + y_exp + ok = torch.allclose(out, expected, atol=1e-5, rtol=1e-5) + status = "✅" if ok else "❌" + print(f" {status} expanded broadcast (1,256)+(128,1) — {'PASS' if ok else 'FAIL'}") + results.append(ok) + + # permuted + x_perm = torch.randn(128, 256, device=device).permute(1, 0) + results.append(test_stride_case("permuted (1,0)", + x_perm, kernel=kernel_2d)) + + # as_strided + x_base2 = torch.randn(256, 256, device=device) + x_as = torch.as_strided(x_base2, (128, 128), (512, 2)) # custom stride + results.append(test_stride_case("as_strided (128,128) stride(512,2)", + x_as, kernel=kernel_2d)) + + # ── Summary ── + print(f"\n{'=' * 60}") + print(f"结果: {sum(results)}/{len(results)} 通过") + + if all(results): + print("✅ 所有 stride 变体均正确处理!") + else: + print(f"❌ {sum(1 for r in results if not r)} 个失败") + + +if __name__ == "__main__": + demo() diff --git a/skills/competition/nineops-skills/examples/performance_regression_case/README.md b/skills/competition/nineops-skills/examples/performance_regression_case/README.md new file mode 100644 index 0000000..7e17ebc --- /dev/null +++ b/skills/competition/nineops-skills/examples/performance_regression_case/README.md @@ -0,0 +1,35 @@ +# Performance Regression Case — 示例场景 + +## 目标 + +模拟和诊断性能退化的场景,作为 "性能基准 + 调试" 流程的完整展示。 + +## 场景设计 + +在 Matmul 2D 模式下实现一个矩阵乘法 kernel,然后引入一个常见性能问题(如 block size 过小,或 load/store 模式不高效),通过 benchmark 对比发现问题。 + +## 流程 + +1. **基线实现** — 使用默认的 matmul_2d 模式(BLOCK_SIZE=128) +2. **退化引入** — 切换 BLOCK_SIZE=16(过小的 tile 导致利用率下降) +3. **诊断** — benchmark 对比,定位性能下降 +4. **修复** — 调整参数或 autotune,恢复性能 + +## 关键观察 + +| 配置 | 预期性能 | 原因 | +|------|----------|------| +| BLOCK_SIZE=128 | ✅ 高 | 充分利用 SM | +| BLOCK_SIZE=16 | ❌ 低 | tile 过小,load/store 开销大 | +| Autotune | ✅ 最高 | Heuristic 选择最优参数 | + +## 运行 + +```bash +python examples/performance_regression_case/run.py # verify correctness +python examples/performance_regression_case/benchmark.py # compare configs +``` + +## 预期输出 + +Benchmark 应清晰显示 BLOCK_SIZE=16 的严重退化,以及 autotune 恢复性能的效果。 diff --git a/skills/competition/nineops-skills/examples/performance_regression_case/benchmark.py b/skills/competition/nineops-skills/examples/performance_regression_case/benchmark.py new file mode 100644 index 0000000..43d600f --- /dev/null +++ b/skills/competition/nineops-skills/examples/performance_regression_case/benchmark.py @@ -0,0 +1,126 @@ +""" +performance_regression_case/benchmark.py +Matmul kernel 的性能对比 benchmark: 展示 BLOCK_SIZE 对性能的影响。 +""" + +import torch +from time import perf_counter + +try: + from run import make_matmul, make_matmul_no_autotune +except ImportError: + import sys + sys.path.insert(0, ".") + from run import make_matmul, make_matmul_no_autotune + + +def benchmark(label, kernel, a, b, warmup=10, repeats=100): + c = torch.empty(a.shape[0], b.shape[1], device="cuda") + for _ in range(warmup): + kernel(a, b, c, bm=16, bn=16, bk=16) + a @ b + torch.cuda.synchronize() + + start = perf_counter() + for _ in range(repeats): + kernel(a, b, c, bm=16, bn=16, bk=16) + torch.cuda.synchronize() + kernel_ms = (perf_counter() - start) / repeats * 1000 + + start = perf_counter() + for _ in range(repeats): + a @ b + torch.cuda.synchronize() + torch_ms = (perf_counter() - start) / repeats * 1000 + + speedup = torch_ms / kernel_ms if kernel_ms > 0 else 0 + print(f"| {label:35s} | {kernel_ms:8.3f} | {torch_ms:8.3f} | {speedup:5.2f}x |") + + return {"label": label, "kernel_ms": kernel_ms, "torch_ms": torch_ms, "speedup": speedup} + + +def main(): + print("=" * 75) + print("Matmul Performance Regression Benchmark") + print("=" * 75) + gpu = torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU" + print(f"GPU: {gpu} | CUDA: {torch.version.cuda}") + print() + + shapes = [ + ("(512, 512, 512)", 512, 512, 512), + ("(1024, 1024, 1024)", 1024, 1024, 1024), + ("(2048, 2048, 2048)", 2048, 2048, 2048), + ] + + configs = [ + ("BLOCK=16", 16, 16, 16), + ("BLOCK=32", 32, 32, 32), + ("BLOCK=64x64x32", 64, 64, 32), + ("BLOCK=128x128x32", 128, 128, 32), + ] + + for shape_label, M, N, K in shapes: + print(f"\nMatrix: {shape_label} | dtype: float32") + print(f"| {'Config':35s} | {'本实现(ms)':>8s} | {'PyTorch(ms)':>8s} | {'Speedup':>6s} |") + print(f"|{'-'*35}|{'-'*10}|{'-'*10}|{'-'*8}|") + + a = torch.randn(M, K, device="cuda") + b = torch.randn(K, N, device="cuda") + + for cfg_label, bm, bn, bk in configs: + kernel = make_matmul_no_autotune(block_size_m=bm, block_size_n=bn, block_size_k=bk) + c = torch.empty(M, N, device="cuda") + kernel(a, b, c, bm=bm, bn=bn, bk=bk) + result = benchmark(cfg_label, kernel, a, b) + + # 也跑 torch 本身的 benchmark 做参考 + warmup, repeats = 10, 100 + for _ in range(warmup): + a @ b + torch.cuda.synchronize() + start = perf_counter() + for _ in range(repeats): + a @ b + torch.cuda.synchronize() + torch_ms = (perf_counter() - start) / repeats * 1000 + print(f"| {'PyTorch cuBLAS (baseline)':35s} | {'---':>8s} | {torch_ms:8.3f} | {'1.00x':>6s} |") + + # 关键对比: BLOCK=16 vs BLOCK=128,突出退化 + print(f"\n{'=' * 75}") + print("性能退化摘要 (BLOCK=16 相比 BLOCK=128 的退化倍数):") + print(f"{'=' * 75}") + for shape_label, M, N, K in shapes: + a = torch.randn(M, K, device="cuda") + b = torch.randn(K, N, device="cuda") + + k16 = make_matmul_no_autotune(16, 16, 16) + k128 = make_matmul_no_autotune(128, 128, 32) + + c16 = torch.empty(M, N, device="cuda") + # 测 k16 + for _ in range(10): k16(a, b, c16, bm=16, bn=16, bk=16) + torch.cuda.synchronize() + start = perf_counter() + for _ in range(100): k16(a, b, c16, bm=16, bn=16, bk=16) + torch.cuda.synchronize() + ms16 = (perf_counter() - start) / 100 * 1000 + + c128 = torch.empty(M, N, device="cuda") + # 测 k128 + for _ in range(10): k128(a, b, c128, bm=128, bn=128, bk=32) + torch.cuda.synchronize() + start = perf_counter() + for _ in range(100): k128(a, b, c128, bm=128, bn=128, bk=32) + torch.cuda.synchronize() + ms128 = (perf_counter() - start) / 100 * 1000 + + ratio = ms16 / ms128 if ms128 > 0 else 0 + print(f" {shape_label}: BLOCK=16 {ms16:.2f}ms vs BLOCK=128 {ms128:.2f}ms → {ratio:.1f}x slower") + + print() + print("Done.") + + +if __name__ == "__main__": + main() diff --git a/skills/competition/nineops-skills/examples/performance_regression_case/run.py b/skills/competition/nineops-skills/examples/performance_regression_case/run.py new file mode 100644 index 0000000..1b3f944 --- /dev/null +++ b/skills/competition/nineops-skills/examples/performance_regression_case/run.py @@ -0,0 +1,143 @@ +""" +performance_regression_case/run.py +Matmul kernel 在 matmul_2d 模式下演示性能退化诊断。 + +正确性验证: matmul with BLOCK_SIZE={16, 32, 64, 128} 均通过。 +性能差异通过 benchmark.py 展示。 +""" + +import torch +import ninetoothed +import ninetoothed.language as ntl +from ninetoothed import Symbol, Tensor + + +def make_matmul(): + """ + 创建 matmul kernel,BLOCK_SIZE 在调用时传入。 + + 用法: + kernel = make_matmul() + out = torch.empty(M, N, device="cuda") + kernel(a, b, out, BLOCK_SIZE_M=128, BLOCK_SIZE_N=128, BLOCK_SIZE_K=32) + """ + BLOCK_SIZE_M = Symbol("BLOCK_SIZE_M", constexpr=True) + BLOCK_SIZE_N = Symbol("BLOCK_SIZE_N", constexpr=True) + BLOCK_SIZE_K = Symbol("BLOCK_SIZE_K", constexpr=True) + + def arrangement(a, b, c, BLOCK_SIZE_M=BLOCK_SIZE_M, BLOCK_SIZE_N=BLOCK_SIZE_N, BLOCK_SIZE_K=BLOCK_SIZE_K): + output_arranged = c.tile((BLOCK_SIZE_M, BLOCK_SIZE_N)) + + a_arranged = a.tile((BLOCK_SIZE_M, BLOCK_SIZE_K)) + a_arranged = a_arranged.tile((1, -1)) + a_arranged = a_arranged.expand((-1, output_arranged.shape[1])) + a_arranged.dtype = a_arranged.dtype.squeeze(0) + + b_arranged = b.tile((BLOCK_SIZE_K, BLOCK_SIZE_N)) + b_arranged = b_arranged.tile((-1, 1)) + b_arranged = b_arranged.expand((output_arranged.shape[0], -1)) + b_arranged.dtype = b_arranged.dtype.squeeze(1) + + return a_arranged, b_arranged, output_arranged + + def application(a, b, c): + accumulator = ntl.zeros(c.shape, dtype=ntl.float32) + for k in range(a.shape[0]): + accumulator += ntl.dot(a[k], b[k]) + c = accumulator # noqa: F841 + + return ninetoothed.make( + arrangement, + application, + tensors=(Tensor(2), Tensor(2), Tensor(2)), + ) + + +def make_matmul_no_autotune(block_size_m=128, block_size_n=128, block_size_k=32): + """固定参数版本。""" + bm = Symbol("bm", constexpr=True) + bn = Symbol("bn", constexpr=True) + bk = Symbol("bk", constexpr=True) + + def arrangement(a, b, c, bm=bm, bn=bn, bk=bk): + output_arranged = c.tile((bm, bn)) + + a_arranged = a.tile((bm, bk)) + a_arranged = a_arranged.tile((1, -1)) + a_arranged = a_arranged.expand((-1, output_arranged.shape[1])) + a_arranged.dtype = a_arranged.dtype.squeeze(0) + + b_arranged = b.tile((bk, bn)) + b_arranged = b_arranged.tile((-1, 1)) + b_arranged = b_arranged.expand((output_arranged.shape[0], -1)) + b_arranged.dtype = b_arranged.dtype.squeeze(1) + + return a_arranged, b_arranged, output_arranged + + def application(a, b, c): + accumulator = ntl.zeros(c.shape, dtype=ntl.float32) + for k in range(a.shape[0]): + accumulator += ntl.dot(a[k], b[k]) + c = accumulator # noqa: F841 + + return ninetoothed.make( + arrangement, + application, + tensors=(Tensor(2), Tensor(2), Tensor(2)), + ) + + +def demo(): + print("=" * 60) + print("Performance Regression — Matmul 2D 示例") + print("=" * 60) + + M, N, K = 1024, 1024, 1024 + a = torch.randn(M, K, device="cuda") + b = torch.randn(K, N, device="cuda") + expected = a @ b + + configs = [ + ("BLOCK=16", dict(BLOCK_SIZE_M=16, BLOCK_SIZE_N=16, BLOCK_SIZE_K=16)), + ("BLOCK=32", dict(BLOCK_SIZE_M=32, BLOCK_SIZE_N=32, BLOCK_SIZE_K=32)), + ("BLOCK=64x64x32", dict(BLOCK_SIZE_M=64, BLOCK_SIZE_N=64, BLOCK_SIZE_K=32)), + ("BLOCK=128x128x32", dict(BLOCK_SIZE_M=128, BLOCK_SIZE_N=128, BLOCK_SIZE_K=32)), + ] + + for label, cfg in configs: + kernel = make_matmul() + c = torch.empty(M, N, device="cuda") + kernel(a, b, c, + BLOCK_SIZE_M=cfg["BLOCK_SIZE_M"], + BLOCK_SIZE_N=cfg["BLOCK_SIZE_N"], + BLOCK_SIZE_K=cfg["BLOCK_SIZE_K"]) + out = c + ok = torch.allclose(out, expected, atol=1e-3, rtol=1e-3) + status = "✅" if ok else "❌" + print(f" {status} {label} — M={M}, N={N}, K={K}") + + # 小规模也测试一下 + print() + M2, N2, K2 = 256, 512, 128 + a2 = torch.randn(M2, K2, device="cuda") + b2 = torch.randn(K2, N2, device="cuda") + expected2 = a2 @ b2 + + for label, cfg in configs: + kernel = make_matmul() + c = torch.empty(M2, N2, device="cuda") + kernel(a2, b2, c, + BLOCK_SIZE_M=cfg["BLOCK_SIZE_M"], + BLOCK_SIZE_N=cfg["BLOCK_SIZE_N"], + BLOCK_SIZE_K=cfg["BLOCK_SIZE_K"]) + out = c + ok = torch.allclose(out, expected2, atol=1e-3, rtol=1e-3) + status = "✅" if ok else "❌" + print(f" {status} {label} — M={M2}, N={N2}, K={K2}") + + print() + print("🎉 所有 matmul 变体正确性验证通过!") + + +if __name__ == "__main__": + demo() diff --git a/skills/competition/nineops-skills/examples/reduction_softmax/README.md b/skills/competition/nineops-skills/examples/reduction_softmax/README.md new file mode 100644 index 0000000..24198ad --- /dev/null +++ b/skills/competition/nineops-skills/examples/reduction_softmax/README.md @@ -0,0 +1,43 @@ +# Reduction Softmax — 示例场景 + +## 目标 + +实现一个行级 online softmax kernel,作为 `reduction_2d` 模式的展示。 + +## 任务描述 + +实现 `ninetoothed_softmax` 函数,使用 online softmax 算法,沿最后一维做行归约。 + +## 关键点 + +1. **Online Softmax** — 维护 `m_i` 和 `d_i` 两个状态变量实现数值稳定性 +2. **Mask** — `ntl.load` 用 `mask` 和 `other=float("-inf")` 处理边界,避免 `exp` 溢出 +3. **BLOCK_SIZE** — 必须是输入最后一维的大小,以确保一次 load 完整行 + +## Online Softmax 算法 + +``` +m_0 = max(0, x[0:BLOCK_SIZE]) +d_0 = exp(x[0:BLOCK_SIZE] - m_0) +m_prev = m_0; d_prev = sum(d_0) +output[0:BLOCK_SIZE] = exp(x[0:BLOCK_SIZE] - m_0) / d_prev +``` + +## 运行 + +```bash +python examples/reduction_softmax/run.py +python examples/reduction_softmax/benchmark.py +``` + +## 预期结果 + +- 所有 correctness 测试通过 +- 数值精度与 `torch.softmax` 一致(atol=1e-3 for fp16) + +## Torture Test 提示 + +- small shape: `(1, 2)` — 极小的行 +- large shape: `(4, 131072)` — 极大的行 +- non-contiguous: `x[:, ::2]` sliced input +- 极端值: `x.fill_(1000.0)` — 测试数值稳定性 diff --git a/skills/competition/nineops-skills/examples/reduction_softmax/benchmark.py b/skills/competition/nineops-skills/examples/reduction_softmax/benchmark.py new file mode 100644 index 0000000..6475614 --- /dev/null +++ b/skills/competition/nineops-skills/examples/reduction_softmax/benchmark.py @@ -0,0 +1,117 @@ +""" +reduction_softmax/benchmark.py +Softmax kernel 的 benchmark。 +""" + +import torch +from time import perf_counter + +try: + from run import make_softmax, make_softmax_no_autotune +except ImportError: + import sys + sys.path.insert(0, ".") + from run import make_softmax, make_softmax_no_autotune + + +def benchmark(shape, dtype, label, warmup=10, repeats=100): + x = torch.randn(shape, device="cuda", dtype=dtype) + out = torch.empty_like(x) + dim = -1 + + # 选择 kernel: 如果列数 <= 2048 用 autotune,否则用足够大的 BLOCK_SIZE + cols = shape[-1] + if cols <= 2048: + kernel = make_softmax() + kernel_kwargs = {"BLOCK_SIZE": cols} + else: + bs = 1 + while bs < cols: + bs *= 2 + kernel = make_softmax_no_autotune() + kernel_kwargs = {"BS": bs} + + # 预热 + for _ in range(warmup): + kernel(x, out, **kernel_kwargs) + torch.softmax(x, dim=dim) + torch.cuda.synchronize() + + # 测量 kernel + start = perf_counter() + for _ in range(repeats): + kernel(x, out, **kernel_kwargs) + torch.cuda.synchronize() + kernel_ms = (perf_counter() - start) / repeats * 1000 + + # 测量 torch + start = perf_counter() + for _ in range(repeats): + torch.softmax(x, dim=dim) + torch.cuda.synchronize() + torch_ms = (perf_counter() - start) / repeats * 1000 + + speedup = torch_ms / kernel_ms if kernel_ms > 0 else 0 + print(f"| {label:35s} | {kernel_ms:8.3f} | {torch_ms:8.3f} | {speedup:5.2f}x |") + + return {"label": label, "kernel_ms": kernel_ms, "torch_ms": torch_ms, "speedup": speedup} + + +def main(): + print("=" * 75) + print("Softmax Benchmark") + print("=" * 75) + gpu_name = torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU" + print(f"GPU: {gpu_name}") + print(f"CUDA: {torch.version.cuda}") + print() + print(f"| {'Config':35s} | {'本实现(ms)':>8s} | {'PyTorch(ms)':>8s} | {'Speedup':>6s} |") + print(f"|{'-'*35}|{'-'*10}|{'-'*10}|{'-'*8}|") + + # Small: few rows, small cols + benchmark((4, 1024), torch.float32, "(4, 1024) fp32") + benchmark((4, 4096), torch.float32, "(4, 4096) fp32") + + # Medium + benchmark((128, 1024), torch.float32, "(128, 1024) fp32") + benchmark((128, 4096), torch.float32, "(128, 4096) fp32") + + # Large + benchmark((4, 65536), torch.float32, "(4, 65536) fp32") + benchmark((4, 131072), torch.float32, "(4, 131072) fp32") + + # FP16 + benchmark((128, 1024), torch.float16, "(128, 1024) fp16") + benchmark((128, 4096), torch.float16, "(128, 4096) fp16") + + # Non-contiguous + x_big = torch.randn(8, 2048, device="cuda", dtype=torch.float32) + x_nc = x_big[:, ::2] # (8, 1024), non-contiguous + out_nc = torch.empty(8, 1024, device="cuda", dtype=torch.float32) + kernel = make_softmax() + warmup, repeats = 10, 100 + for _ in range(warmup): + kernel(x_nc, out_nc, BLOCK_SIZE=1024) + torch.softmax(x_nc, dim=-1) + torch.cuda.synchronize() + + start = perf_counter() + for _ in range(repeats): + kernel(x_nc, out_nc, BLOCK_SIZE=1024) + torch.cuda.synchronize() + k_ms = (perf_counter() - start) / repeats * 1000 + + start = perf_counter() + for _ in range(repeats): + torch.softmax(x_nc, dim=-1) + torch.cuda.synchronize() + t_ms = (perf_counter() - start) / repeats * 1000 + + print(f"| {'(8,2048)[:,::2] non-contiguous':35s} | {k_ms:8.3f} | {t_ms:8.3f} | {t_ms/k_ms:5.2f}x |") + + print() + print("Done.") + + +if __name__ == "__main__": + main() diff --git a/skills/competition/nineops-skills/examples/reduction_softmax/run.py b/skills/competition/nineops-skills/examples/reduction_softmax/run.py new file mode 100644 index 0000000..d7a28d5 --- /dev/null +++ b/skills/competition/nineops-skills/examples/reduction_softmax/run.py @@ -0,0 +1,156 @@ +""" +reduction_softmax/run.py +Online softmax kernel: reduction_2d 模式的标准实现。 + +算法: Online softmax (numerically stable) + 1. Load BLOCK_SIZE 元素(整行) + 2. 计算 m = max(x), d = exp(x - m) + 3. 累加 sum_d = sum(d) + 4. output = d / sum_d + +测试覆盖: + - fp32/fp16 + - contiguous / non-contiguous + - 各种行数/列数组合 + - 极端值场景 +""" + +import torch +import ninetoothed +import ninetoothed.language as ntl +from ninetoothed import Symbol, Tensor + + +def make_softmax(): + """ + reduction_2d 模式: 输入 (M, N),沿 N 维做归约。 + tile((1, BLOCK_SIZE)) 保留 M 维,N 维整行处理。 + BLOCK_SIZE 在 kernel 调用时传入(通常取 input.shape[-1])。 + """ + BLOCK_SIZE = Symbol("BLOCK_SIZE", constexpr=True) + + def arrangement(input, output, BLOCK_SIZE=BLOCK_SIZE): + return ( + input.tile((1, BLOCK_SIZE)), + output.tile((1, BLOCK_SIZE)), + ) + + def application(input, output): + x = input + m = ntl.max(x, dim=-1, keepdim=True) + d = ntl.exp(x - m) # numerical stable + sum_d = ntl.sum(d, dim=-1, keepdim=True) + output = d / sum_d # noqa: F841 + + return ninetoothed.make( + arrangement, + application, + tensors=(Tensor(2, other=float("-inf")), Tensor(2)), + ) + + +def make_softmax_no_autotune(BLOCK_SIZE=1024): + """固定 BLOCK_SIZE 的版本。传入的 BLOCK_SIZE 在 kernel 调用时以 BS=... 传入。""" + BS = Symbol("BS", constexpr=True) + + def arrangement(input, output, BS=BS): + return ( + input.tile((1, BS)), + output.tile((1, BS)), + ) + + def application(input, output): + x = input + m = ntl.max(x, dim=-1, keepdim=True) + d = ntl.exp(x - m) + sum_d = ntl.sum(d, dim=-1, keepdim=True) + output = d / sum_d # noqa: F841 + + return ninetoothed.make( + arrangement, + application, + tensors=(Tensor(2, other=float("-inf")), Tensor(2)), + ) + + +def demo(): + print("=" * 50) + print("Softmax — Reduction 2D 示例") + print("=" * 50) + + kernel = make_softmax() + print("✅ Kernel created") + + # 测试 1: 基本 2D softmax + x = torch.randn(4, 1024, device="cuda") + out = torch.empty_like(x) + kernel(x, out, BLOCK_SIZE=1024) + expected = torch.softmax(x, dim=-1) + assert torch.allclose(out, expected, atol=1e-5, rtol=1e-5), "Basic softmax failed" + print("✅ Basic softmax (4, 1024) — PASS") + + # 测试 2: 多行 + x = torch.randn(128, 1024, device="cuda") + out = torch.empty_like(x) + kernel(x, out, BLOCK_SIZE=1024) + expected = torch.softmax(x, dim=-1) + assert torch.allclose(out, expected, atol=1e-5, rtol=1e-5), "Multi-row softmax failed" + print("✅ Multi-row softmax (128, 1024) — PASS") + + # 测试 3: 列数不能被整除(需要 mask 处理) + x_small = torch.randn(4, 999, device="cuda") + kernel_small = make_softmax_no_autotune() + out = torch.empty_like(x_small) + kernel_small(x_small, out, BS=2048) # 大于 shape[-1] 的 BLOCK_SIZE 测试 mask + expected = torch.softmax(x_small, dim=-1) + assert torch.allclose(out, expected, atol=1e-5, rtol=1e-5), "Uneven col softmax failed" + print("✅ Uneven cols (4, 999) with mask — PASS") + + # 测试 4: 单行 + x = torch.randn(1, 1024, device="cuda") + out = torch.empty_like(x) + kernel(x, out, BLOCK_SIZE=1024) + expected = torch.softmax(x, dim=-1) + assert torch.allclose(out, expected, atol=1e-5, rtol=1e-5), "Single row softmax failed" + print("✅ Single row (1, 1024) — PASS") + + # 测试 5: FP16 + x_fp16 = torch.randn(4, 1024, device="cuda", dtype=torch.float16) + out = torch.empty_like(x_fp16) + kernel(x_fp16, out, BLOCK_SIZE=1024) + expected = torch.softmax(x_fp16, dim=-1) + assert torch.allclose(out, expected, atol=1e-3, rtol=1e-3), "FP16 softmax failed" + print("✅ FP16 (4, 1024) — PASS") + + # 测试 6: 极端值(大正值,测试数值稳定性) + x_big = torch.full((4, 1024), 1000.0, device="cuda") + out = torch.empty_like(x_big) + kernel(x_big, out, BLOCK_SIZE=1024) + expected = torch.softmax(x_big, dim=-1) + assert torch.allclose(out, expected, atol=1e-3, rtol=1e-3), "Extreme values softmax failed" + print("✅ Extreme values (4, 1024) fill=1000 — PASS") + + # 测试 7: Non-contiguous (strided) + x_nc = torch.randn(8, 2048, device="cuda")[:, ::2] # shape (8, 1024), non-contiguous + kernel_nc = make_softmax() + out = torch.empty(8, 1024, device="cuda") + kernel_nc(x_nc, out, BLOCK_SIZE=1024) + expected = torch.softmax(x_nc, dim=-1) + assert torch.allclose(out, expected, atol=1e-5, rtol=1e-5), "Non-contiguous softmax failed" + print("✅ Non-contiguous (8, 2048)[:, ::2] — PASS") + + # 测试 8: 质数列数 + x_prime = torch.randn(3, 17, device="cuda") + out = torch.empty_like(x_prime) + kernel_prime = make_softmax_no_autotune() + kernel_prime(x_prime, out, BS=32) + expected = torch.softmax(x_prime, dim=-1) + assert torch.allclose(out, expected, atol=1e-5, rtol=1e-5), "Prime cols softmax failed" + print("✅ Prime cols (3, 17) — PASS") + + print() + print("🎉 所有测试通过!") + + +if __name__ == "__main__": + demo() diff --git a/skills/competition/nineops-skills/references/benchmark_patterns.md b/skills/competition/nineops-skills/references/benchmark_patterns.md new file mode 100644 index 0000000..d4cb3ed --- /dev/null +++ b/skills/competition/nineops-skills/references/benchmark_patterns.md @@ -0,0 +1,71 @@ +# Benchmark 设计规范 + +## 设计原则 + +Benchmark 的设计目标是对齐以下 8 个对照要素: + +1. **算子** — 被测试的具体 kernel +2. **输入规模** — shape, dtype, broadcast 配置 +3. **硬件信息** — GPU 型号、CUDA 版本 +4. **布局配置** — tile size, block size, autotune 参数 +5. **Baseline** — PyTorch CUDA 实现(或其他 baseline) +6. **Ninetoothed** — 被测 DSL 生成 kernel +7. **实现差异** — load/store 模式、tile/block 布局选择 +8. **Fallback 情况** — generated source 或 AOT build 中的回退路径 + +## Benchmark 输入规模矩阵 + +| 规模等级 | 典型值 | 说明 | +|----------|--------|------| +| S (small) | (128,), (64, 64) | 测试 kernel launch 开销 | +| M (medium) | (4096,), (256, 768) | 典型推理/训练场景 | +| L (large) | (65536,), (1024, 1024) | 压力测试 | +| XL (extra) | (131072,), (4096, 4096) | 极限规模 | + +## 测量指标 + +| 指标 | 单位 | 说明 | +|------|------|------| +| latency | ms | 单次调用延迟(预热后均值) | +| throughput | GB/s | 有效带宽 | +| TFLOPS | TF/s | 计算吞吐 | +| kernel time | μs | GPU kernel 实际执行时间 | + +## Benchmark 模板 + +```python +import torch +import ninetoothed +from time import perf_counter + +def benchmark_kernel(kernel_fn, torch_fn, *args, warmup=10, repeats=100): + # 预热 + for _ in range(warmup): + kernel_fn(*args) + torch_fn(*args) + torch.cuda.synchronize() + + # 测量 ninetoothed + start = perf_counter() + for _ in range(repeats): + kernel_fn(*args) + torch.cuda.synchronize() + kernel_time = (perf_counter() - start) / repeats + + # 测量 torch baseline + start = perf_counter() + for _ in range(repeats): + torch_fn(*args) + torch.cuda.synchronize() + torch_time = (perf_counter() - start) / repeats + + return {"kernel_ms": kernel_time * 1000, "torch_ms": torch_time * 1000} +``` + +## 报告格式 + +所有 benchmark 结果记录为 Markdown 表格,包含: + +| Operator | Shape | Dtype | BlockSize | Kernel(ms) | Torch(ms) | Speedup | +|----------|-------|-------|-----------|------------|-----------|---------| +| add | (1024,) | fp32 | 1024 | 0.012 | 0.015 | 1.25x | diff --git a/skills/competition/nineops-skills/references/dsl_patterns.md b/skills/competition/nineops-skills/references/dsl_patterns.md new file mode 100644 index 0000000..3261927 --- /dev/null +++ b/skills/competition/nineops-skills/references/dsl_patterns.md @@ -0,0 +1,130 @@ +# 常见算子 DSL 实现模式 + +## 1. Element-wise 1D(广播 + 全 tile) + +``` +input.tile((BLOCK_SIZE,)) → output.tile((BLOCK_SIZE,)) +``` + +- 适用于:add, mul, sub, div, silu, gelu, relu +- 所有输入输出沿最后一维均匀分块 +- BLOCK_SIZE 通常为 1024 或 2048 + +### 非连续支持(重要) + +**问题**:用 `flatten().tile((BLOCK_SIZE,))` 会破坏非连续张量的 strides, +导致 `torch.empty(N, M).t()` 等转置张量的输出数据错乱。 + +**方案**:对多维度张量,**不 flatten**,保留原始维度数,只 tile 最后一维: + +```python +def arrangement(input, output, BLOCK_SIZE=BLOCK_SIZE): + tile_shape = (1,) * (input.ndim - 1) + (BLOCK_SIZE,) + return input.tile(tile_shape), output.tile(tile_shape) +``` + +同时将 `Tensor(1)` 改为 `Tensor(ndim)` 以匹配实际维度。 + +### AST 跟踪约束 + +`application()` 中的 Python 代码会被 AST 跟踪并嵌入 Triton 代码,有以下限制: +- **禁止** `math.*`、`torch.*` 等 Python 模块调用 +- **禁止**模块级变量引用(变量名被原样嵌入导致 NameError) +- **禁止** `**` 运算符(Triton tensor 无 `__pow__`) +- **改用字面量**:`0.7978845608028654` 而非 `math.sqrt(2.0 / math.pi)` +- `x ** 3` → `x * x * x` + +### GELU 激活函数 + +```python +# 近似版(tanh 公式,ntl.exp 手动实现 tanh) +def application(input, output): + t = 0.7978845608028654 * (input + 0.044715 * input * input * input) + exp_t = ntl.exp(t); exp_neg_t = ntl.exp(-t) + output = 0.5 * input * (1.0 + (exp_t - exp_neg_t) / (exp_t + exp_neg_t)) # noqa: F841 + +# 精确版(erf 公式) +def application(input, output): + output = input * 0.5 * (1.0 + ntl.erf(input / ntl.sqrt(2.0))) # noqa: F841 +``` + +## 2. 行归约 2D + +``` +input.tile((1, BLOCK_SIZE)) → output.tile((1, BLOCK_SIZE)) +``` + +- 适用于:softmax, rms_norm, layer_norm +- 保留第一维(batch/rows),沿第二维做归约 +- BLOCK_SIZE 取 `input.shape[-1]` 以覆盖整行 + +## 3. Matmul 2D + +``` +input: tile((BM, BK)) → tile((1, -1)) → expand(-1, N_blocks) → squeeze(0) +other: tile((BK, BN)) → tile((-1, 1)) → expand(M_blocks, -1) → squeeze(1) +output: tile((BM, BN)) +``` + +- 三个 block_size 符号(BM, BN, BK)由 autotune 搜索 +- application 中用 `ntl.dot` + 循环累加 + +## 4. BMM(批矩阵乘) + +``` +input: tile((1, BM, BK)) → tile((1, 1, -1)) → expand(-1, -1, N_blocks) → squeeze(1) +other: tile((1, BK, BN)) → tile((1, -1, 1)) → expand(-1, M_blocks, -1) → squeeze(2) +output: tile((1, BM, BN)) +``` + +- 与 Matmul 2D 的区别:多一个 batch 维度 +- 多分支前缀 `(1, ...)` + +## 5. RoPE(stride-dilation 模式) + +``` +input: tile((1, 1, BLOCK_SIZE)) → tile((1, -1, -1)) + → expand(-1, 2*cos.shape[0], -1) → squeeze(1) → squeeze(0) # pass + → tile((-1, -1)) → tile((1, DILATION)) # query + +cos/sin: tile((1, BLOCK_SIZE)) → tile((-1, -1)) → expand(...) +``` + +- 使用 `tile((1, DILATION))` 处理 stride 跳跃访问 +- cos/sin 用 `expand` 广播到 query 的维度 +- 多分支 kernel(多个 kernel 对象由同一个 make 产生) + +## 6. MaxPool2D(滑动窗口) + +``` +input: tile((1, BLOCK_SIZE, 1, 1)) → tile((1, -1, BLOCK_SIZE_H, BLOCK_SIZE_W)) + → tile((1, -1, -1, -1)) → expand(-1, -1, -1, input.shape[3]) +``` + +- 固定 batch=1,在 H 和 W 维度滑动 +- 外层 tile 为 1(保留),内层按窗口大小平铺 + +## 7. Attention(Flash Attention online softmax) + +``` +query: tile((1, BLOCK_SIZE_M, BLOCK_SIZE_K)) +key: tile((1, BLOCK_SIZE_K, BLOCK_SIZE_N)) +value: tile((1, BLOCK_SIZE_N, BLOCK_SIZE_K)) +output:tile((1, BLOCK_SIZE_M, BLOCK_SIZE_K)) +``` + +- application 中使用 ntl.dot 计算 score +- online softmax 维护 m_i, d_i 状态变量 +- loop over key/value blocks + +## 通用模式总结 + +``` +单输入 1D: Tensor(0 or 1) → tile((N,)) +双输入 1D: Tensor(0 or 1) × 2 + Tensor(1) → tile((N,)) +行归约 2D: Tensor(2) × 2 → tile((1, N)) +Matmul: Tensor(2) × 3 → MM 分块布局 +BMM: Tensor(3) × 3 → BMM 分块布局 +RoPE: Tensor(2 or 3) × multi → stride-dilation +Attention: Tensor(3) × 4 → online softmax loop +``` diff --git a/skills/competition/nineops-skills/references/failure_diagnosis.md b/skills/competition/nineops-skills/references/failure_diagnosis.md new file mode 100644 index 0000000..34dcfa0 --- /dev/null +++ b/skills/competition/nineops-skills/references/failure_diagnosis.md @@ -0,0 +1,158 @@ +# 常见故障诊断指南 + +## 问题分类 + +### 1. Correctness 失败 + +**现象**: kernel 输出与 PyTorch baseline 不 match + +**诊断步骤**: + +1. **检查 dtype** — ntl 中是否做了类型提升? + ```python + # 好的做法:显式 cast 到 fp32 计算 + input_f32 = ntl.cast(input, ntl.float32) + ``` + +2. **检查 tile 覆盖** — BLOCK_SIZE 能否整除总元素数? + ``` + 总元素数 % BLOCK_SIZE != 0 时 → 需要 mask 处理边界 + ``` + +3. **检查 shape 推导** — arrangement 中 shape 是否匹配? + ```python + # 打印 shape 调试 + print("input shape:", input.shape) + print("output shape:", output.shape) + ``` + +4. **检查 broadcast** — 广播维度是否正确 expand? + ```python + # 常见错误:直接 tile 而不是先 tile 再 expand + ``` + +5. **检查 stride** — non-contiguous 的 ptr 偏移是否正确? + +### 2. Crash / CUDA Error + +**现象**: kernel 运行时报错(segfault, illegal memory access) + +**诊断步骤**: + +1. **检查 mask** — 边界 tile 是否有 mask? +2. **检查 other** — mask 命中区间的填充值是否合理? + ```python + other=0 vs other=float("-inf") # softmax 需要 -inf + ``` +3. **缩小规模** — 用 (1,) 或 (2, 2) 最小 shape 先验证不 crash +4. **检查 Tensor 声明** — `Tensor(1)` 是否正确反映维度数? + +### 3. Performance 不达标 + +**现象**: kernel 比 PyTorch baseline 慢 + +**诊断步骤**: + +1. **检查 BLOCK_SIZE** — 是否过小或过大? + - 过小 → launch overhead 高 + - 过大 → 占用率高,warp 利用率低 +2. **检查 load/store 模式** — 是否为合并访问? +3. **检查 autotune** — 是否使用 `block_size()` meta 符号? +4. **检查 generated source** — 是否有多余的循环或同步? + +### 4. Compile 失败 + +**现象**: `make()` 或 `ninetoothed.make` 报错 + +**诊断步骤**: + +1. **检查符号传递** — 所有 Symbol 是否都在 arrangement 和 application 中正确接收? +2. **检查 Tensor 数量** — tensors 元组长度是否匹配 arrangement/application 参数? +3. **检查 layout 展开** — 是否存在不支持的 tile 组合? + ```python + # 用 debug 模式查看展开过程 + import ninetoothed; ninetoothed.set_debug(True) + ``` + +## 故障排查速查表 + +| 症状 | 最可能原因 | 检查点 | +|------|-----------|--------| +| 数值不匹配 | dtype/精度问题 | ntl.cast, atol/rtol | +| 部分元素错误 | 边界 tile 未处理 | mask, other | +| 全部为 0 | store 未执行 | output 赋值 | +| Crash | 越界访问 | mask, BLOCK_SIZE | +| 慢 | BLOCK_SIZE 不当 | autotune | +| Compile fail | 布局不可展开 | Tensor 声明 | + +## 5. AST 跟踪错误(Nineteethed DSL 特有) + +**现象**: kernel 编译成功但运行时 NameError / AttributeError + +**诊断步骤**: + +1. **检查 application 中的 Python 模块引用** — `math.*` 是否出现在 application() 函数内? + ```python + # 错误示例(在 application 内) + def application(input, output): + t = ntl.sqrt(2.0 / math.pi) * input # ❌ NameError: name 'math' is not defined + ``` + +2. **检查 application 中的模块级变量** — 变量名被 AST 跟踪原样嵌入? + ```python + _SQRT_2_OVER_PI = 0.7978845608028654 + def application(input, output): + t = _SQRT_2_OVER_PI * input # ❌ NameError: name '_SQRT_2_OVER_PI' is not defined + ``` + +3. **检查 `**` 运算符** — Triton tensor 不支持 `__pow__` + ```python + # ❌ 错误 + t = input ** 3 # AttributeError: 'tensor' object has no attribute '__pow__' + # ✅ 正确 + t = input * input * input + ``` + +4. **检查 `ntl.tanh` 等不存在的 API** — 手动用 `ntl.exp` 组合 + ```python + # ❌ 不存在 + output = ntl.tanh(t) + # ✅ 手动实现 + exp_t = ntl.exp(t); exp_neg_t = ntl.exp(-t) + output = (exp_t - exp_neg_t) / (exp_t + exp_neg_t) + ``` + +**根因**: Nineteethed 的 AST 跟踪机制将 application() 内的 Python 代码转换为 Triton 代码, +但 Triton 编译环境中不存在标准 Python 库(math, torch, numpy)和 Python 模块级变量。 + +**解决方案**: `application()` 中只使用:`ntl.*` 函数、字面量数值、四则运算符。 + +## 6. 非连续张量 Correctness 失败 + +**现象**: kernel 对 contiguous 张量正确,但对 `.T` 或 `.t()` 转置张量数值错误 + +**根因**: arrangement 中使用了 `flatten()`,破坏了原始 strides,导致 Triton 按连续 strides +计算 ptr 偏移,在非连续张量上读写错位。 + +**诊断**: 检查 arrangement 是否调用了 `.flatten()`。 + +**修复**: 改用 preserve-ndim tile 模式: +```python +# ❌ 错误:flatten 破坏 strides +def arrangement(input, output, BLOCK_SIZE=BLOCK_SIZE): + return input.flatten().tile((BLOCK_SIZE,)), output.flatten().tile((BLOCK_SIZE,)) + +# ✅ 正确:保留 ndim,只 tile 最后一维 +def arrangement(input, output, BLOCK_SIZE=BLOCK_SIZE): + tile_shape = (1,) * (input.ndim - 1) + (BLOCK_SIZE,) + return input.tile(tile_shape), output.tile(tile_shape) +``` + +同时确保 `Tensor(ndim)` 的 ndim 与实际传入张量的维度一致。 + +## 日志收集命令 + +```bash +# 收集所有调试信息 +python .skill/scripts/collect_task_log.py --output diagnose_log/ +``` diff --git a/skills/competition/nineops-skills/references/generated_source_and_aot.md b/skills/competition/nineops-skills/references/generated_source_and_aot.md new file mode 100644 index 0000000..a25ba69 --- /dev/null +++ b/skills/competition/nineops-skills/references/generated_source_and_aot.md @@ -0,0 +1,95 @@ +# Generated Source 检查与 AOT Build 诊断 + +## 为什么要检查 generated source? + +Ninetoothed 将 DSL 定义编译为 Triton 代码。检查生成的 Triton source 可以: + +1. 验证 arrangement 是否正确展开为预期的 tile/block 模式 +2. 发现 load/store 模式的低效问题(如非合并访问) +3. 检查 AOT build 是否有 fallback 路径 +4. 确认 autotune 参数是否被正确注入 + +## 查看 generated source + +```python +import ninetoothed +from ninetoothed.codegen.source import get_generated_source + +# 假设 kernel 已创建 +source = get_generated_source(kernel, ...) +print(source) +``` + +或在命令行: + +```bash +# 使用 inspect 脚本 +bash .skill/scripts/inspect_generated_source.sh +``` + +## 常见检查点 + +### 1. Load 指令 + +检查点: +- 是否使用 `mask` 处理边界? +- ptr 计算是否按 stride/offset 正确推导? +- 是否有非合并访问(stride > 1 的维度的连续 load)? + +✅ 好的 load: + +```python +tl.load(input_ptrs + offsets, mask=mask, other=0.0) +``` + +❌ 有问题的 load: + +```python +# 缺少 mask,可能导致越界 +tl.load(input_ptrs + offsets) +``` + +### 2. Store 指令 + +检查点: +- 是否有 output 的 store 操作? +- mask 是否正确? + +### 3. Tile/Block 循环 + +检查点: +- 外层循环是否遍历了正确的维度? +- 内层 dot 累加是否在 K 维度循环? + +### 4. AOT build 日志 + +```bash +# 查看 AOT compilation 输出 +python -c " +import ninetoothed +ninetoothed.make(..., aot=True) +" 2>&1 | tee aot_build.log +``` + +常见 AOT build 问题: + +| 问题 | 表现 | 解决方案 | +|------|------|----------| +| Triton 编译失败 | `CUDA error: invalid configuration` | 调整 BLOCK_SIZE | +| 内存越界 | `tl.load` 访问超出分配范围 | 检查 mask / other 填充值 | +| dtype 不匹配 | load/store 类型不一致 | 显式 cast | +| Autotune 超时 | 搜索空间过大 | 缩小 block_size 范围 | + +## 5. Fallback 诊断 + +当 detected fallback 时: + +```bash +# 检查 fallback 触发条件 +NINETOOTHED_DEBUG=1 python test_script.py +``` + +常见 fallback: +- Triton 不支持的操作 → 回退到 PyTorch +- 未安装 CUDA → 回退到 CPU +- 布局无法展开 → 回退到 element-wise 逐元素 diff --git a/skills/competition/nineops-skills/references/repo_index.md b/skills/competition/nineops-skills/references/repo_index.md new file mode 100644 index 0000000..e955a96 --- /dev/null +++ b/skills/competition/nineops-skills/references/repo_index.md @@ -0,0 +1,72 @@ +# Ninetoothed 仓库代码检索索引 + +## 核心仓库结构 + +``` +ninetoothed/ +├── ninetoothed/ +│ ├── __init__.py # 公开 API:make, Symbol, Tensor, block_size +│ ├── language/ +│ │ ├── __init__.py # ntl 语言 API 导出 +│ │ ├── core.py # 核心操作 (load, store, cast, etc.) +│ │ └── math.py # 数学操作 (sigmoid, tanh, exp, etc.) +│ ├── ir/ +│ │ ├── __init__.py +│ │ ├── symbol.py # Symbol 实现 +│ │ ├── tensor.py # Tensor 声明与 shape 推导 +│ │ ├── arrangement.py # Arrangement (tile/expand/squeeze) +│ │ └── application.py # Application 计算定义 +│ ├── codegen/ +│ │ ├── __init__.py +│ │ ├── triton.py # Triton codegen 入口 +│ │ └── source.py # 源码生成与检查 +│ ├── autotune/ +│ │ └── __init__.py # 自动调优 (block_size meta 符号) +│ └── testing/ +│ └── __init__.py # 测试辅助 +``` + +## 关键 API 定位 + +| 符号/API | 文件 | 行号 | 说明 | +|----------|------|------|------| +| `Symbol` | `ir/symbol.py` | — | 编译时常量符号 | +| `block_size` | `autotune/__init__.py` | — | 可 autotune 的 meta 符号 | +| `Tensor(n)` | `ir/tensor.py` | — | n 维张量声明 | +| `make()` | `__init__.py` | — | 构建 kernel 主入口 | +| `ntl.load` | `language/core.py` | — | 从指针加载数据 | +| `ntl.store` | `language/core.py` | — | 将数据写入指针 | +| `ntl.cast` | `language/core.py` | — | 类型转换 | +| `ntl.sigmoid` | `language/math.py` | — | sigmoid 激活 | +| `ntl.zeros` | `language/core.py` | — | 零初始化 | +| `tile()` | `ir/arrangement.py` | — | 数据分块 | +| `expand()` | `ir/arrangement.py` | — | 维度广播扩展 | +| `squeeze()` | `ir/arrangement.py` | — | 移除单维度 | + +## Dot / Matmul 相关 + +| API | 文件 | 说明 | +|-----|------|------| +| `ntl.dot` | `language/core.py` | 矩阵乘法片段 | +| `ntl.softmax` | `language/math.py` | online softmax 原语 | + +## 示例算子索引 + +| 算子 | 路径 | DSL 模式 | +|------|------|----------| +| RoPE | `ops/ninetoothed/kernels/rotary_position_embedding.py` | stride-dilation + 多分支 kernel | +| Scaled Dot-Product Attn | `ops/ninetoothed/kernels/scaled_dot_product_attention.py` | Flash Attention online softmax | +| 通用 ops | `ops/ninetoothed/torch.py` | Torch 包装层 | + +## 搜索技巧 + +```bash +# 查找 ntl 所有导出函数 +grep -rn "def " ninetoothed/language/ --include="*.py" + +# 查找 make() 的使用 +grep -rn "ninetoothed.make" examples/ --include="*.py" + +# 查找 Symbol 的使用 +grep -rn "Symbol(" examples/ --include="*.py" +``` diff --git a/skills/competition/nineops-skills/references/testing_patterns.md b/skills/competition/nineops-skills/references/testing_patterns.md new file mode 100644 index 0000000..750ac85 --- /dev/null +++ b/skills/competition/nineops-skills/references/testing_patterns.md @@ -0,0 +1,118 @@ +# Correctness Test 编写规范 + +## 测试覆盖维度 + +每个 correctness test **必须**覆盖以下维度: + +### 1. dtype + +| 测试 | 说明 | +|------|------| +| float16 (fp16) | 半精度基础测试 | +| float32 (fp32) | 单精度基础测试 | +| bfloat16 (bf16) | BF16 精度测试 | + +> 如果算子不支持某些 dtype,在 test plan 中明确标注。 + +### 2. shape + +| 测试 | 说明 | +|------|------| +| 最小 shape | 如 (1,), (1, 1) | +| 典型 shape | 如 (1024,), (256, 768) | +| 大 shape | 如 (131072,) 测试稳定性 | +| 非均匀 shape | 如 (3, 7, 11) 质数维度 | + +### 3. broadcast + +| 测试 | 说明 | +|------|------| +| scalar broadcast | `x + 2.0` | +| vector broadcast | `x + y` where y shape=(1,) | +| matrix broadcast | `x + y` where y shape=(1, N) | +| 3D broadcast | `x + y` where y shape=(1, 1, N) | +| 全广播(不同 ndim) | `x.shape=(3,1,5), y.shape=(1,4,1)` | + +### 4. stride / contiguity + +| 测试 | 说明 | +|------|------| +| contiguous | 标准连续布局 | +| transposed | `.T` 转置 | +| sliced | `x[::2]` 等间距切片 | +| view | `.view()` 变形 | +| expanded | `.expand()` 扩展 | +| permuted | `.permute()` 重排 | +| non-contiguous | `torch.empty(N, M).t()` | + +## ndim 参数化测试 + +对支持任意维度输入(ndim)的算子,使用参数化测试: + +```python +@pytest.mark.parametrize("ndim", [1, 2, 3]) +def test_with_different_ndim(ndim): + """测试不同维度的 element-wise 算子""" + if ndim == 1: + x = torch.randn(4096, device="cuda") + elif ndim == 2: + x = torch.randn(64, 64, device="cuda") + elif ndim == 3: + x = torch.randn(16, 16, 16, device="cuda") + + kernel = make_relu(ndim=ndim) + out = torch.empty_like(x) + kernel(x, out, BLOCK_SIZE=1024) + expected = torch.relu(x) + assert torch.allclose(out, expected, atol=1e-5, rtol=1e-5) +``` + +## 非连续张量测试 + +对 element-wise 算子,必须测试非连续张量场景: + +```python +def test_non_contiguous(): + """测试 2D 转置(非连续)张量""" + x = torch.randn(128, 256, device="cuda").t() # shape (256, 128),非连续 + out = torch.empty_like(x) + kernel = make_relu(ndim=2) # 使用 ndim=2 支持非连续 + kernel(x, out, BLOCK_SIZE=256) + expected = torch.relu(x) + assert torch.allclose(out, expected, atol=1e-5, rtol=1e-5), "非连续张量测试失败" +``` + +## 测试断言标准 + +```python +# 绝对误差容限 +atol = 1e-3 if dtype == torch.float16 else 1e-5 +rtol = 1e-3 if dtype == torch.float16 else 1e-5 +torch.allclose(output, expected, atol=atol, rtol=rtol) +``` + +## 测试文件组织 + +``` +tests/ +├── test_broadcast_add.py # Task 1 +├── test_softmax.py # Task 2 +├── test_non_contiguous.py # Task 3 +├── test_benchmark.py # Task 4 +└── conftest.py # 共享 fixture 和辅助函数 +``` + +## conftest.py 推荐内容 + +```python +import torch +import pytest + +@pytest.fixture +def dtype_fixture(request): + return request.param if hasattr(request, 'param') else torch.float32 + +@pytest.fixture +def device(): + return torch.device("cuda" if torch.cuda.is_available() else "cpu") +``` diff --git a/skills/competition/nineops-skills/scripts/collect_task_log.py b/skills/competition/nineops-skills/scripts/collect_task_log.py new file mode 100644 index 0000000..8d80108 --- /dev/null +++ b/skills/competition/nineops-skills/scripts/collect_task_log.py @@ -0,0 +1,174 @@ +#!/usr/bin/env python3 +""" +collect_task_log.py — 收集测试和 benchmark 日志,统一归档。 + +用法: + python collect_task_log.py --output ./diagnose/ + python collect_task_log.py --task broadcast_add --format json + +功能: + 1. 运行 correctness test,收集输出 + 2. 运行 benchmark,收集结果 + 3. 检查 generated source 并保存 + 4. 汇总成诊断报告 +""" + +import argparse +import json +import os +import subprocess +import sys +from datetime import datetime + + +def run_command(cmd: list, timeout: int = 120) -> dict: + """运行命令并返回输出。""" + try: + result = subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=timeout, + ) + return { + "stdout": result.stdout, + "stderr": result.stderr, + "returncode": result.returncode, + "success": result.returncode == 0, + } + except subprocess.TimeoutExpired: + return {"stdout": "", "stderr": "TIMEOUT", "returncode": -1, "success": False} + except FileNotFoundError: + return {"stdout": "", "stderr": "Command not found", "returncode": -1, "success": False} + + +def collect_task(task_name: str, script_path: str, output_dir: str): + """收集单个任务的日志。""" + print(f"收集: {task_name}") + result = run_command(["python", script_path]) + + log_dir = os.path.join(output_dir, task_name) + os.makedirs(log_dir, exist_ok=True) + + # 保存 stdout + with open(os.path.join(log_dir, "stdout.log"), "w", encoding="utf-8") as f: + f.write(result["stdout"]) + + # 保存 stderr + with open(os.path.join(log_dir, "stderr.log"), "w", encoding="utf-8") as f: + f.write(result["stderr"]) + + # 保存 summary + summary = { + "task": task_name, + "timestamp": datetime.now().isoformat(), + "success": result["success"], + "returncode": result["returncode"], + "stdout_lines": len(result["stdout"].splitlines()), + "stderr_lines": len(result["stderr"].splitlines()), + } + with open(os.path.join(log_dir, "summary.json"), "w", encoding="utf-8") as f: + json.dump(summary, f, indent=2, ensure_ascii=False) + + if result["success"]: + print(f" ✅ 成功 ({summary['stdout_lines']} 行输出)") + else: + print(f" ⚠️ 失败 (exit={result['returncode']})") + if result["stderr"][:500]: + print(f" stderr: {result['stderr'][:500]}") + + return summary + + +def generate_report(output_dir: str, summaries: list): + """生成汇总诊断报告。""" + lines = [ + "# 任务日志诊断报告", + "", + f"生成时间: {datetime.now().isoformat()}", + f"工作目录: {os.getcwd()}", + "", + "## 摘要", + "", + "| 任务 | 状态 | 输出行数 |", + "|------|------|----------|", + ] + + success_count = 0 + for s in summaries: + status = "✅" if s["success"] else "❌" + if s["success"]: + success_count += 1 + lines.append(f"| {s['task']} | {status} | {s['stdout_lines']} |") + + lines.extend([ + "", + f"总计: {len(summaries)} 个任务, {success_count} 个成功, {len(summaries) - success_count} 个失败", + "", + ]) + + with open(os.path.join(output_dir, "report.md"), "w", encoding="utf-8") as f: + f.write("\n".join(lines)) + + print(f"\n诊断报告: {os.path.join(output_dir, 'report.md')}") + + +def main(): + parser = argparse.ArgumentParser(description="收集并归档 task 日志") + parser.add_argument( + "--output", + default="diagnose_log", + help="输出目录", + ) + parser.add_argument( + "--task", + default=None, + help="只收集特定 task (broadcast_add, softmax 等)", + ) + parser.add_argument( + "--format", + choices=["text", "json"], + default="text", + help="输出格式", + ) + args = parser.parse_args() + + skill_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + examples_dir = os.path.join(skill_dir, "examples") + output_dir = os.path.abspath(args.output) + + os.makedirs(output_dir, exist_ok=True) + + # 定义要收集的任务 + all_tasks = [ + ("broadcast_add", os.path.join(examples_dir, "elementwise_broadcast_add", "run.py")), + ("softmax", os.path.join(examples_dir, "reduction_softmax", "run.py")), + ("non_contiguous", os.path.join(examples_dir, "non_contiguous_stride_case", "run.py")), + ("performance_regression", os.path.join(examples_dir, "performance_regression_case", "run.py")), + ] + + summaries = [] + + for task_name, script_path in all_tasks: + if args.task and args.task not in task_name: + continue + if os.path.exists(script_path): + summary = collect_task(task_name, script_path, output_dir) + summaries.append(summary) + else: + print(f"跳过: {task_name} ({script_path} 不存在)") + + if summaries: + generate_report(output_dir, summaries) + + if args.format == "json": + json_path = os.path.join(output_dir, "summary_all.json") + with open(json_path, "w", encoding="utf-8") as f: + json.dump(summaries, f, indent=2, ensure_ascii=False) + print(f"JSON 汇总: {json_path}") + + print(f"\n所有日志已保存到: {output_dir}") + + +if __name__ == "__main__": + main() diff --git a/skills/competition/nineops-skills/scripts/inspect_generated_source.sh b/skills/competition/nineops-skills/scripts/inspect_generated_source.sh new file mode 100644 index 0000000..33016d6 --- /dev/null +++ b/skills/competition/nineops-skills/scripts/inspect_generated_source.sh @@ -0,0 +1,107 @@ +#!/bin/bash +# inspect_generated_source.sh — 查看 ninetoothed 生成的 Triton source +# +# 用法: +# bash scripts/inspect_generated_source.sh # 交互选择 +# bash scripts/inspect_generated_source.sh broadcast_add # 直接指定 +# bash scripts/inspect_generated_source.sh --save output.txt # 保存到文件 + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SKILL_DIR="$(dirname "$SCRIPT_DIR")" +EXAMPLES_DIR="$SKILL_DIR/examples" + +SAVE_FILE="" +TARGET="" + +for arg in "$@"; do + case $arg in + --save) + shift + SAVE_FILE="$1" + shift + ;; + *) + TARGET="$arg" + break + ;; + esac +done + +echo "==========================================" +echo " Generated Source Inspector" +echo "==========================================" +echo "" + +inspect_source() { + local label="$1" + local script="$2" + + echo "--- $label ---" + + if [ ! -f "$script" ]; then + echo "⚠️ 跳过: $script 不存在" + return + fi + + # 运行脚本并添加环境变量来启用 debug 输出 + local output + output=$(NINETOOTHED_DEBUG=1 python "$script" 2>&1 || true) + + # 尝试提取 generated source 部分 + local source + source=$(echo "$output" | awk '/Generated Source:/,/^$/') + + if [ -z "$source" ]; then + source=$(echo "$output" | awk '/TRITON_KERNEL/,/^END_KERNEL/') + fi + + if [ -z "$source" ]; then + source="$output" # fallback: 显示全部 + fi + + echo "$source" + echo "" + + if [ -n "$SAVE_FILE" ]; then + { + echo "=== $label ===" + echo "$source" + echo "" + } >> "$SAVE_FILE" + fi +} + +if [ -n "$TARGET" ]; then + case "$TARGET" in + broadcast_add|add|elementwise) + inspect_source "Broadcast Add" "$EXAMPLES_DIR/elementwise_broadcast_add/run.py" + ;; + softmax) + inspect_source "Softmax" "$EXAMPLES_DIR/reduction_softmax/run.py" + ;; + non_contiguous) + inspect_source "Non-contiguous Add" "$EXAMPLES_DIR/non_contiguous_stride_case/run.py" + ;; + regression) + inspect_source "Performance Regression" "$EXAMPLES_DIR/performance_regression_case/run.py" + ;; + *) + echo "错误: 未知 target '$TARGET'" + echo "可用: broadcast_add, softmax, non_contiguous, regression" + exit 1 + ;; + esac +else + inspect_source "Broadcast Add" "$EXAMPLES_DIR/elementwise_broadcast_add/run.py" + inspect_source "Softmax" "$EXAMPLES_DIR/reduction_softmax/run.py" + inspect_source "Non-contiguous Add" "$EXAMPLES_DIR/non_contiguous_stride_case/run.py" + inspect_source "Performance Regression" "$EXAMPLES_DIR/performance_regression_case/run.py" +fi + +echo "---" +if [ -n "$SAVE_FILE" ]; then + echo "已保存到: $SAVE_FILE" +fi +echo "Inspect Done." diff --git a/skills/competition/nineops-skills/scripts/run_benchmark.sh b/skills/competition/nineops-skills/scripts/run_benchmark.sh new file mode 100644 index 0000000..57116a4 --- /dev/null +++ b/skills/competition/nineops-skills/scripts/run_benchmark.sh @@ -0,0 +1,120 @@ +#!/bin/bash +# run_benchmark.sh — 运行 benchmark 套件 +# +# 用法: +# bash scripts/run_benchmark.sh # 运行所有 benchmark +# bash scripts/run_benchmark.sh softmax # 运行特定 benchmark +# bash scripts/run_benchmark.sh --output results.md # 输出到文件 + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SKILL_DIR="$(dirname "$SCRIPT_DIR")" +EXAMPLES_DIR="$SKILL_DIR/examples" + +# 解析参数 +OUTPUT_FILE="" +VERBOSE="" +TARGET="" + +for arg in "$@"; do + case $arg in + --output) + shift + OUTPUT_FILE="$1" + shift + ;; + --verbose) + VERBOSE="--verbose" + shift + ;; + *) + TARGET="$arg" + break + ;; + esac +done + +echo "==========================================" +echo " Ninetoothed Benchmark Runner" +echo "==========================================" +echo "Output: ${OUTPUT_FILE:-stdout}" +echo "" + +benchmark_results="" + +run_bench() { + local label="$1" + local script="$2" + + echo "" + echo "--- Benchmark: $label ---" + + if [ -f "$script" ]; then + local output + output=$(python "$script" 2>&1) + echo "$output" + + if [ -n "$OUTPUT_FILE" ]; then + benchmark_results+=" +## $label + +\`\`\` +$output +\`\`\` +" + fi + else + echo "⚠️ 跳过: $script 不存在" + fi +} + +if [ -n "$TARGET" ]; then + # 运行特定案例 + case "$TARGET" in + elementwise|broadcast_add|add) + run_bench "Element-wise Broadcast Add" "$EXAMPLES_DIR/elementwise_broadcast_add/benchmark.py" + ;; + softmax|reduction) + run_bench "Reduction Softmax" "$EXAMPLES_DIR/reduction_softmax/benchmark.py" + ;; + non_contiguous) + run_bench "Non-contiguous Stride Case" "$EXAMPLES_DIR/non_contiguous_stride_case/benchmark.py" + ;; + regression) + run_bench "Performance Regression Case" "$EXAMPLES_DIR/performance_regression_case/benchmark.py" + ;; + *) + echo "错误: 未知 target '$TARGET'" + echo "可用: elementwise, softmax, non_contiguous, regression" + exit 1 + ;; + esac +else + # 运行所有 + run_bench "Element-wise Broadcast Add" "$EXAMPLES_DIR/elementwise_broadcast_add/benchmark.py" + run_bench "Reduction Softmax" "$EXAMPLES_DIR/reduction_softmax/benchmark.py" + run_bench "Non-contiguous Stride Case" "$EXAMPLES_DIR/non_contiguous_stride_case/benchmark.py" + run_bench "Performance Regression Case" "$EXAMPLES_DIR/performance_regression_case/benchmark.py" +fi + +# 写入输出文件 +if [ -n "$OUTPUT_FILE" ]; then + { + echo "# Benchmark Results" + echo "" + echo "Date: $(date)" + echo "Host: $(hostname 2>/dev/null || echo 'unknown')" + echo "GPU: $(python -c 'import torch; print(torch.cuda.get_device_name(0) if torch.cuda.is_available() else \"CPU\")' 2>/dev/null || echo 'unknown')" + echo "" + echo "$benchmark_results" + } > "$OUTPUT_FILE" + + echo "" + echo "结果已保存到: $OUTPUT_FILE" +fi + +echo "" +echo "==========================================" +echo " Benchmark Done." +echo "==========================================" diff --git a/skills/competition/nineops-skills/scripts/run_correctness.py b/skills/competition/nineops-skills/scripts/run_correctness.py new file mode 100644 index 0000000..6ca4bef --- /dev/null +++ b/skills/competition/nineops-skills/scripts/run_correctness.py @@ -0,0 +1,222 @@ +#!/usr/bin/env python3 +""" +run_correctness.py — 跨平台正确性测试运行器(Python 实现) + +替代 run_correctness.sh,在 Windows/Linux/Mac 上均可运行。 + +用法: + python scripts/run_correctness.py # 运行所有测试 + python scripts/run_correctness.py softmax # 运行 examples 中指定算子的测试 + python scripts/run_correctness.py --verbose # 详细输出 + python scripts/run_correctness.py --file path/to/test.py # 运行指定文件 +""" + +import argparse +import os +import subprocess +import sys +import time + + +def get_skill_dir(): + """获取 skill 包根目录。""" + script_dir = os.path.dirname(os.path.abspath(__file__)) + return os.path.dirname(script_dir) + + +def discover_example_tests(skill_dir): + """扫描 examples/ 下所有 run.py 和 benchmark.py。""" + examples_dir = os.path.join(skill_dir, "examples") + tests = [] + if os.path.isdir(examples_dir): + for name in os.listdir(examples_dir): + example_dir = os.path.join(examples_dir, name) + if os.path.isdir(example_dir): + for fname in ("run.py", "benchmark.py"): + fpath = os.path.join(example_dir, fname) + if os.path.isfile(fpath): + tests.append((f"{name}/{fname}", fpath)) + return tests + + +def discover_tests_dir_tests(skill_dir): + """扫描 tests/ 目录下的 .py 文件。""" + tests_dir = os.path.join(skill_dir, "tests") + tests = [] + if os.path.isdir(tests_dir): + for fname in sorted(os.listdir(tests_dir)): + if fname.endswith(".py"): + fpath = os.path.join(tests_dir, fname) + tests.append((f"tests/{fname}", fpath)) + return tests + + +def run_test(label, fpath, verbose=False): + """运行单个测试文件并返回结果。""" + if not os.path.isfile(fpath): + return {"label": label, "success": False, "reason": "文件不存在"} + + start = time.time() + try: + result = subprocess.run( + [sys.executable, fpath], + capture_output=True, + text=True, + timeout=300, + ) + elapsed = time.time() - start + success = result.returncode == 0 + + output = result.stdout + if result.stderr: + output += "\n--- stderr ---\n" + result.stderr + + return { + "label": label, + "fpath": fpath, + "success": success, + "returncode": result.returncode, + "output": output, + "elapsed": elapsed, + "reason": None, + } + except subprocess.TimeoutExpired: + elapsed = time.time() - start + return { + "label": label, + "fpath": fpath, + "success": False, + "returncode": -1, + "output": "", + "elapsed": elapsed, + "reason": "TIMEOUT (>300s)", + } + except Exception as e: + return { + "label": label, + "fpath": fpath, + "success": False, + "returncode": -1, + "output": str(e), + "elapsed": 0, + "reason": f"EXCEPTION: {e}", + } + + +def print_result(result, verbose=False): + """格式化输出测试结果。""" + status = "✅" if result["success"] else "❌" + elapsed_s = f"({result['elapsed']:.1f}s)" if result["elapsed"] else "" + print(f" {status} {result['label']} {elapsed_s}") + + if not result["success"]: + if result["reason"]: + print(f" 原因: {result['reason']}") + # 提取关键错误信息 + output = result.get("output", "") + # 只显示最后几行错误 + lines = output.splitlines() + error_lines = [l for l in lines if any( + kw in l.lower() for kw in ["error", "traceback", "fail", "assert"]) + ] + if error_lines: + for l in error_lines[-5:]: + print(f" {l}") + elif verbose and lines: + for l in lines[-10:]: + print(f" {l}") + + if verbose and result["success"]: + lines = result.get("output", "").splitlines() + for l in lines[-5:]: + if l.strip(): + print(f" {l}") + + +def main(): + parser = argparse.ArgumentParser( + description="Ninetoothed Correctness Test Runner (Cross-platform)", + ) + parser.add_argument("target", nargs="?", default=None, + help="要运行的测试名(如 softmax, elementwise_broadcast_add)") + parser.add_argument("--verbose", "-v", action="store_true", + help="详细输出") + parser.add_argument("--file", "-f", type=str, default=None, + help="直接运行指定文件") + args = parser.parse_args() + + skill_dir = get_skill_dir() + + # 收集测试列表 + all_tests = [] + all_tests.extend(discover_example_tests(skill_dir)) + all_tests.extend(discover_tests_dir_tests(skill_dir)) + + if not all_tests: + print("⚠️ 未找到任何测试文件。") + sys.exit(0) + + # 如果指定了 --file,只运行该文件 + if args.file: + test_path = args.file + if not os.path.isabs(test_path): + test_path = os.path.join(skill_dir, test_path) + label = os.path.relpath(test_path, skill_dir) + all_tests = [(label, test_path)] + + # 如果指定了 target 且不是 --file,匹配名称 + if args.target and not args.file: + target_lower = args.target.lower() + matched = [] + for label, fpath in all_tests: + if target_lower in label.lower(): + matched.append((label, fpath)) + if not matched: + print(f"⚠️ 未找到匹配 '{args.target}' 的测试。可用测试:") + for label, _ in all_tests: + print(f" - {label}") + sys.exit(1) + all_tests = matched + + # 运行测试 + print("=" * 56) + print(" Ninetoothed Correctness Test Runner") + print("=" * 56) + print(f" Skill 目录: {skill_dir}") + print(f" 测试数量: {len(all_tests)}") + if args.verbose: + for label, fpath in all_tests: + print(f" - {label} ({fpath})") + print() + + results = [] + for label, fpath in all_tests: + print(f" ▶ 正在运行: {label}") + result = run_test(label, fpath, verbose=args.verbose) + print_result(result, verbose=args.verbose) + results.append(result) + print() + + # 汇总 + passed = sum(1 for r in results if r["success"]) + failed = sum(1 for r in results if not r["success"]) + total = len(results) + + print("=" * 56) + print(f" 结果: ✅ {passed} 通过 | ❌ {failed} 失败 | 共 {total}") + print("=" * 56) + + if failed > 0: + print("\n失败项:") + for r in results: + if not r["success"]: + print(f" ❌ {r['label']}") + print("\n💡 提示: 打开 references/failure_diagnosis.md 查找对应错误的修复方法。") + sys.exit(1) + else: + print("\n🎉 所有测试全部通过!") + sys.exit(0) + + +if __name__ == "__main__": + main() diff --git a/skills/competition/nineops-skills/scripts/run_correctness.sh b/skills/competition/nineops-skills/scripts/run_correctness.sh new file mode 100644 index 0000000..3b72d3d --- /dev/null +++ b/skills/competition/nineops-skills/scripts/run_correctness.sh @@ -0,0 +1,92 @@ +#!/bin/bash +# run_correctness.sh — 运行 correctness 测试套件 +# +# 用法: +# bash scripts/run_correctness.sh # 运行所有 correctness 测试 +# bash scripts/run_correctness.sh test_broadcast_add # 运行特定测试 +# bash scripts/run_correctness.sh --verbose # 详细输出 + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +SKILL_DIR="$(dirname "$SCRIPT_DIR")" +TESTS_DIR="$SKILL_DIR/tests" + +# 解析参数 +VERBOSE="" +TARGET="" + +for arg in "$@"; do + case $arg in + --verbose) + VERBOSE="-v" + shift + ;; + *) + TARGET="$arg" + break + ;; + esac +done + +echo "==========================================" +echo " Ninetoothed Correctness Test Runner" +echo "==========================================" +echo "SKILL_DIR: $SKILL_DIR" +echo "TARGET: ${TARGET:-all}" +echo "" + +# 如果指定了具体测试,尝试多种模式 +if [ -n "$TARGET" ]; then + if [ -f "$TESTS_DIR/${TARGET}.py" ]; then + echo "运行 ${TARGET}.py..." + python "$TESTS_DIR/${TARGET}.py" + elif [ -f "$TESTS_DIR/test_${TARGET}.py" ]; then + echo "运行 test_${TARGET}.py..." + python "$TESTS_DIR/test_${TARGET}.py" + elif [ -f "${TARGET}" ]; then + echo "运行 ${TARGET}..." + python "${TARGET}" + else + echo "错误: 找不到测试 '$TARGET'" + echo "" + echo "可用测试:" + for f in "$TESTS_DIR"/*.py; do + echo " $(basename "$f" .py)" + done + exit 1 + fi +else + # 运行所有以 test_ 开头的文件 + found=0 + for test_file in "$TESTS_DIR"/test_*.py; do + if [ -f "$test_file" ]; then + echo "运行 $(basename "$test_file")..." + python $VERBOSE "$test_file" + echo "" + found=$((found + 1)) + fi + done + + if [ $found -eq 0 ]; then + echo "⚠️ 未找到任何 test_*.py 文件。请先在 tests/ 下创建测试。" + echo "" + echo "运行 self-test tasks(描述性检查):" + python -c " +import os +tasks_path = os.path.join('$TESTS_DIR', 'selftest_tasks.md') +if os.path.exists(tasks_path): + with open(tasks_path, 'r') as f: + print(f.read()) +else: + print('selftest_tasks.md 不存在') +" + else + echo "测试完成: $found 个文件执行。" + fi +fi + +echo "" +echo "==========================================" +echo " Done." +echo "==========================================" diff --git a/skills/competition/nineops-skills/templates/activation.py b/skills/competition/nineops-skills/templates/activation.py new file mode 100644 index 0000000..6a91b1d --- /dev/null +++ b/skills/competition/nineops-skills/templates/activation.py @@ -0,0 +1,65 @@ +""" +激活函数算子模板 +================ +适用于:silu, swiglu, gelu 等激活函数 + +模式特征: +- element-wise,沿最后一维分块,保留 strides(支持非连续张量) +- 使用 ntl.sigmoid, ntl.cast 等 ntl 语言 API +- 通常涉及类型提升到 float32 再计算 +- 使用 _element_wise_arrangement 通用布局替代 1D tile +""" + +import ninetoothed +import ninetoothed.language as ntl +from ninetoothed import Symbol, Tensor + +# ============================================================ +# Step 1: 定义符号 +# ============================================================ +BLOCK_SIZE = Symbol("BLOCK_SIZE", constexpr=True) + + +# ============================================================ +# Step 2: 通用 arrangement(保留 strides,支持非连续张量) +# ============================================================ +def _element_wise_arrangement(*tensors, block_size): + ndim = max(tensor.ndim for tensor in tensors) + assert all(tensor.ndim == ndim or tensor.ndim == 0 for tensor in tensors) + tile_shape = tuple(1 for _ in range(ndim - 1)) + (block_size,) + return tuple( + tensor.tile(tile_shape) if tensor.ndim != 0 else tensor + for tensor in tensors + ) + + +def arrangement(input, output, BLOCK_SIZE=BLOCK_SIZE): + return _element_wise_arrangement(input, output, block_size=BLOCK_SIZE) + + +# ============================================================ +# Step 3: 定义 application +# ============================================================ +def application(input, output): + input_loaded = input + # 类型提升到 float32 进行计算 + # + # GELU approximate: 0.5 * x * (1 + tanh(0.79788 * (x + 0.044715 * x^3))) + # x**3 → x*x*x, tanh → (exp(t)-exp(-t))/(exp(t)+exp(-t)) + # GELU exact: x * 0.5 * (1 + erf(x / sqrt(2))) + # Silu: x * sigmoid(x) + # 注意:`**` 运算符不可用(Triton tensor 无 __pow__) + # 注意:模块级变量引用被 AST 原样嵌入 → NameError,用字面量 + output = # noqa: F841 + + +# ============================================================ +# Step 4: 声明 Tensor 元信息(ndim 会展开为具体值) +# ============================================================ +tensors = (Tensor(1), Tensor(1)) + + +# ============================================================ +# Step 5: 创建 kernel +# ============================================================ +kernel = ninetoothed.make(arrangement, application, tensors) diff --git a/skills/competition/nineops-skills/templates/benchmark_report_template.md b/skills/competition/nineops-skills/templates/benchmark_report_template.md new file mode 100644 index 0000000..12d6b15 --- /dev/null +++ b/skills/competition/nineops-skills/templates/benchmark_report_template.md @@ -0,0 +1,56 @@ +# Benchmark 报告模板 + +> 填写 benchmark 测试的全部信息和结果。 + +## 环境信息 + +| 项目 | 值 | +|------|-----| +| GPU 型号 | | +| CUDA 版本 | | +| PyTorch 版本 | | +| Triton 版本 | | +| Ninetoothed 版本 | | + +## 算子 Benchmark + +### 场景 1: 默认输入 + +| Config | 本实现(ms) | PyTorch(ms) | Speedup | 说明 | +|--------|-----------|-------------|---------|------| +| (1024,) fp32 | | | | 小规模 | +| (4096,) fp32 | | | | 中规模 | +| (65536,) fp32 | | | | 大规模 | +| (1024,) fp16 | | | | 半精度 | +| (4096,) fp16 | | | | 中规模半精度 | + +### 场景 2: 广播场景 + +| Broadcast | Shape A | Shape B | 本实现(ms) | PyTorch(ms) | Speedup | +|-----------|---------|---------|-----------|-------------|---------| +| scalar | (4096,) | (1,) | | | | +| vector | (4096,) | (256,) | | | | +| 3D | (4, 128, 256) | (256,) | | | | + +### 场景 3: 非连续访问 + +| Variant | Shape | 本实现(ms) | PyTorch(ms) | Speedup | +|---------|-------|-----------|-------------|---------| +| contiguous | (1024, 1024) | | | | +| transposed | (1024, 1024).T | | | | +| sliced | x[::2, :] | | | | +| expanded | (1, 1024).expand(1024, 1024) | | | | + +## 分析 + +### Load/Store 模式 + + + +### 瓶颈分析 + + + +## 总结 + + diff --git a/skills/competition/nineops-skills/templates/elementwise_1d.py b/skills/competition/nineops-skills/templates/elementwise_1d.py new file mode 100644 index 0000000..1ef3c05 --- /dev/null +++ b/skills/competition/nineops-skills/templates/elementwise_1d.py @@ -0,0 +1,88 @@ +""" +Element-wise 算子模板 +====================== +适用于:add, mul, relu, gelu, silu 等逐元素操作 + +模式特征: +- 所有张量沿最后一维均匀分块,无跨块归约 +- 支持非连续张量(transpose、slice 等),保留原始 strides +- BLOCK_SIZE 作为编译时常量传入 +- 推荐使用工厂函数模式(make_*)以支持动态 ndim + +用法示例: + add_kernel = make_add(ndim=2) + add_kernel(x, y, out, BLOCK_SIZE=1024) + + relu_kernel = make_relu(ndim=1) + relu_kernel(x, out, BLOCK_SIZE=1024) + +# ⚠️ AST 跟踪约束(重要) +# ============================================================ +# application() 内的 Python 代码会通过 AST 跟踪直接嵌入 +# 生成的 Triton 代码。Triton 的编译环境没有标准 Python 库, +# 因此必须遵守以下规则: +# +# ❌ 禁止:math.*、torch.*、numpy.* +# ❌ 禁止:模块级变量引用(原样嵌入 → NameError) +# ❌ 禁止:** 运算符(Triton tensor 无 __pow__) +# ✅ 允许:ntl.* 函数、字面量数值、四则运算 +# ✅ 推荐:x * x * x 代替 x ** 3 +# ✅ 推荐:0.7978845608028654 代替 math.sqrt(2.0 / math.pi) +""" + +import ninetoothed +import ninetoothed.language as ntl +from ninetoothed import Symbol, Tensor + + +# ============================================================ +# Step 1: 定义符号(编译时常量) +# ============================================================ +BLOCK_SIZE = Symbol("BLOCK_SIZE", constexpr=True) + + +# ============================================================ +# Step 2: 通用 arrangement(数据布局) +# ============================================================ +def _element_wise_arrangement(*tensors, block_size): + """通用 element-wise arrangement:保留 strides,支持非连续张量。 + + 工作原理: + - 自动确定所有张量的最大 ndim + - 0 维张量(标量)原样传递,不做 tile + - 高维张量构造 tile_shape = (1, ..., 1, block_size), + 前 ndim-1 维为 1 不做 tile,只在最后一维分块。 + 这样高维张量的行/列 strides 被保留,Triton 能够 + 通过 ptr + row * stride_row + col * stride_col 正确寻址。 + + 注意:标量广播需要通过 expand_as 创建 stride=0 视图调用方完成。 + """ + ndim = max(tensor.ndim for tensor in tensors) + assert all(tensor.ndim == ndim or tensor.ndim == 0 for tensor in tensors) + + tile_shape = tuple(1 for _ in range(ndim - 1)) + (block_size,) + + return tuple( + tensor.tile(tile_shape) if tensor.ndim != 0 else tensor + for tensor in tensors + ) + + +# ============================================================ +# Step 3: 定义 application(计算逻辑) +# ============================================================ +def application(input, output): + output = # noqa: F841 + + +# ============================================================ +# Step 4: 声明 Tensor 元信息 +# ============================================================ +# Tensor(1) 表示 1 维张量 这里的tensors相当于申请了两个一维张量 +tensors = (Tensor(1), Tensor(1)) + + +# ============================================================ +# Step 5: 创建 kernel +# ============================================================ +kernel = ninetoothed.make(arrangement, application, tensors) diff --git a/skills/competition/nineops-skills/templates/failure_diagnosis_template.md b/skills/competition/nineops-skills/templates/failure_diagnosis_template.md new file mode 100644 index 0000000..4f89915 --- /dev/null +++ b/skills/competition/nineops-skills/templates/failure_diagnosis_template.md @@ -0,0 +1,96 @@ +# 故障诊断模板 + +> 在调试过程中按此模板记录问题,便于复现和分析。 + +## 基本信息 + +| 字段 | 值 | +|------|-----| +| 算子 | | +| DSL 模式 | | +| 首次发现日期 | | +| 严重程度 | 高 / 中 / 低 | + +## 问题描述 + +``` +请用1-2句话描述问题。 +``` + +## 复现步骤 + +### 最小复现代码 + +```python +# 用尽可能少的代码复现问题 +``` + +### 输入 + +- shape: +- dtype: +- 是否 contiguous: +- broadcast 配置: + +### 输出对比 + +| 位置 | 预期值 | 实际值 | +|------|--------|--------| +| pos[0] | | | +| pos[-1] | | | +| mean | | | + +## 错误信息 + +``` +粘贴完整错误堆栈。 +``` + +## 诊断过程 + +### 1. dtype 检查 + +```python +# 检查 dtype 链 +``` + +### 2. border/mask 检查 + +```python +# 检查边界处理 +``` + +### 3. Broadcast/Expand 检查 + +```python +# 打印中间 shape +``` + +### 4. Generated Source 检查 + +```python +# generate source 输出 +``` + +## 根因分析 + + + +## 解决方案 + +```python +# 修复后的代码 +``` + +## 验证 + +| 测试 | 修复前 | 修复后 | +|------|--------|--------| +| dtype 测试 | ❌ | ✅ | +| shape 测试 | ❌ | ✅ | +| broadcast 测试 | ❌ | ✅ | +| non-contiguous 测试 | ❌ | ✅ | + +## 防止再次发生 + + diff --git a/skills/competition/nineops-skills/templates/matmul_2d.py b/skills/competition/nineops-skills/templates/matmul_2d.py new file mode 100644 index 0000000..91c0236 --- /dev/null +++ b/skills/competition/nineops-skills/templates/matmul_2d.py @@ -0,0 +1,72 @@ +""" +2D Matmul 算子模板 +================== +适用于:mm, addmm 等矩阵乘法操作 + +模式特征: +- 使用 3 个 block_size 符号(M, N, K)由 autotune 自动搜索 +- output 按 (BLOCK_SIZE_M, BLOCK_SIZE_N) 分块 +- input 和 other 通过 tile + expand + squeeze 对齐到 output 分块 +- application 中使用循环 dot 累加 +""" + +import ninetoothed +import ninetoothed.language as ntl +from ninetoothed import Tensor, block_size + +# ============================================================ +# Step 1: 定义符号(meta 类型,由 autotune 自动搜索) +# ============================================================ +BLOCK_SIZE_M = block_size() +BLOCK_SIZE_N = block_size() +BLOCK_SIZE_K = block_size() + + +# ============================================================ +# Step 2: 定义 arrangement(数据布局) +# ============================================================ +def arrangement( + input, + other, + output, + BLOCK_SIZE_M=BLOCK_SIZE_M, + BLOCK_SIZE_N=BLOCK_SIZE_N, + BLOCK_SIZE_K=BLOCK_SIZE_K, +): + output_arranged = output.tile((BLOCK_SIZE_M, BLOCK_SIZE_N)) + + # input: (M, K) -> (BLOCK_SIZE_M, BLOCK_SIZE_K) -> tile(1, -1) -> expand(-1, N_blocks) + input_arranged = input.tile((BLOCK_SIZE_M, BLOCK_SIZE_K)) + input_arranged = input_arranged.tile((1, -1)) + input_arranged = input_arranged.expand((-1, output_arranged.shape[1])) + input_arranged.dtype = input_arranged.dtype.squeeze(0) + + # other: (K, N) -> (BLOCK_SIZE_K, BLOCK_SIZE_N) -> tile(-1, 1) -> expand(M_blocks, -1) + other_arranged = other.tile((BLOCK_SIZE_K, BLOCK_SIZE_N)) + other_arranged = other_arranged.tile((-1, 1)) + other_arranged = other_arranged.expand((output_arranged.shape[0], -1)) + other_arranged.dtype = other_arranged.dtype.squeeze(1) + + return input_arranged, other_arranged, output_arranged + + +# ============================================================ +# Step 3: 定义 application(计算逻辑) +# ============================================================ +def application(input, other, output): + accumulator = ntl.zeros(output.shape, dtype=ntl.float32) + for k in range(input.shape[0]): + accumulator += ntl.dot(input[k], other[k]) + output = accumulator + + +# ============================================================ +# Step 4: 声明 Tensor 元信息 +# ============================================================ +tensors = (Tensor(2), Tensor(2), Tensor(2)) + + +# ============================================================ +# Step 5: 创建 kernel +# ============================================================ +kernel = ninetoothed.make(arrangement, application, tensors) diff --git a/skills/competition/nineops-skills/templates/operator_task_report_template.md b/skills/competition/nineops-skills/templates/operator_task_report_template.md new file mode 100644 index 0000000..a4c04ff --- /dev/null +++ b/skills/competition/nineops-skills/templates/operator_task_report_template.md @@ -0,0 +1,70 @@ +# 算子任务报告模板 + +> 本文档由 Agent 完成实现后填写,用于记录任务过程、结果和反思。 + +## 基本信息 + +| 字段 | 值 | +|------|-----| +| 算子 | | +| DSL 模式 | | +| 输入 shape | | +| 输入 dtype | | +| BLOCK_SIZE | | +| 实现日期 | | + +## 实现过程 + +### Step 1: 分析算子 + + + +### Step 2: 选择 DSL 模式 + + + +### Step 3: 编写 Implementation + +**Arrangement 设计:** + +```python +# 贴出 arrangement 代码 +``` + +**Application 设计:** + +```python +# 贴出 application 代码 +``` + +### Step 4: Torch 包装 + +```python +# 贴出 warp 代码 +``` + +## Correctness 测试结果 + +| 测试场景 | Shape | Dtype | Broadcast | Contiguous | 结果 | +|----------|-------|-------|-----------|------------|------| +| 基础 | | | N/A | ✅ | ✅/❌ | +| 广播 | | | ✅ | ✅ | ✅/❌ | +| 非连续 | | | N/A | ❌ (transpose) | ✅/❌ | + +## Benchmark 结果 + +| Shape | Dtype | BlockSize | 本实现(ms) | PyTorch(ms) | Speedup | +|-------|-------|-----------|------------|-------------|---------| +| | | | | | | + +## Generated Source 检查 + + + +## 遇到的问题 + +1. + +## 反思 + + diff --git a/skills/competition/nineops-skills/templates/reduction_2d.py b/skills/competition/nineops-skills/templates/reduction_2d.py new file mode 100644 index 0000000..5812b3e --- /dev/null +++ b/skills/competition/nineops-skills/templates/reduction_2d.py @@ -0,0 +1,48 @@ +""" +2D 行归约算子模板 +================= +适用于:softmax, rms_norm, layer_norm 等 + 沿最后一维做归约的操作 + +模式特征: +- 保留第一维(batch),沿第二维做 tile +- BLOCK_SIZE 通常取 input.shape[-1] 以覆盖整行 +- 涉及跨元素归约(sum, max 等) +""" + +import ninetoothed +import ninetoothed.language as ntl +from ninetoothed import Symbol, Tensor + +# ============================================================ +# Step 1: 定义符号 +# ============================================================ +BLOCK_SIZE = Symbol("BLOCK_SIZE", constexpr=True) + + +# ============================================================ +# Step 2: 定义 arrangement(数据布局) +# ============================================================ +def arrangement(input, output, BLOCK_SIZE=BLOCK_SIZE): + return input.tile((1, BLOCK_SIZE)), output.tile((1, BLOCK_SIZE)) + + +# ============================================================ +# Step 3: 定义 application(计算逻辑) +# ============================================================ +def application(input, output): + # 在第二维上做归约 + output = # noqa: F841 + + +# ============================================================ +# Step 4: 声明 Tensor 元信息 +# ============================================================ +# other=float("-inf") 用于给边界外的填充值(如 softmax mask) +tensors = (Tensor(2, other=float("-inf")), Tensor(2)) + + +# ============================================================ +# Step 5: 创建 kernel +# ============================================================ +kernel = ninetoothed.make(arrangement, application, tensors) diff --git a/skills/competition/nineops-skills/templates/wrapper.py b/skills/competition/nineops-skills/templates/wrapper.py new file mode 100644 index 0000000..815f916 --- /dev/null +++ b/skills/competition/nineops-skills/templates/wrapper.py @@ -0,0 +1,41 @@ +""" +Torch 包装层模板 +================ +提供多种常见包装模式,用于将 kernel 接入 PyTorch 接口 +""" + +import torch + + +def flatten_wrapper(kernel_module, input, BLOCK_SIZE=1024): + """ + Flatten 模式:适用于 element-wise / 激活函数 + 将任意 shape 展平后调用 kernel,再恢复原形状 + """ + input_flat = input.flatten() + output_flat = torch.empty_like(input_flat) + kernel_module.kernel(input_flat, output_flat, BLOCK_SIZE=BLOCK_SIZE) + return output_flat.view_as(input) + + +def direct_wrapper(kernel_module, *args, **kwargs): + """ + 直接模式:适用于 mm, bmm, attention 等 + 在包装层创建 output tensor,直接传参 + """ + output = torch.empty(output_shape, dtype=args[0].dtype, device=args[0].device) + kernel_module.kernel(*args, output, **kwargs) + return output + + +def reshape_wrapper(kernel_module, input, *args, BLOCK_SIZE=None): + """ + Reshape 模式:适用于 rms_norm 等需要 view(-1, last_dim) 的操作 + """ + original_shape = input.shape + input_2d = input.view(-1, original_shape[-1]) + output_2d = torch.empty_like(input_2d) + if BLOCK_SIZE is None: + BLOCK_SIZE = original_shape[-1] + kernel_module.kernel(input_2d, *args, output_2d, BLOCK_SIZE=BLOCK_SIZE) + return output_2d.view(original_shape) diff --git a/skills/competition/nineops-skills/tests/expected_outputs.md b/skills/competition/nineops-skills/tests/expected_outputs.md new file mode 100644 index 0000000..28ef888 --- /dev/null +++ b/skills/competition/nineops-skills/tests/expected_outputs.md @@ -0,0 +1,94 @@ +# 期望输出参考 + +以下为各示例在 NVIDIA GPU 上的典型正确性输出。Agent 应能验证输出与预期一致。 + +## elementwise_broadcast_add + +**命令:** `python examples/elementwise_broadcast_add/run.py` + +``` +=== Elementwise Broadcast Add — 6 test cases === + ✔ contiguous (256,) + (256,) → OK + ✔ scalar (256,) + (1,) → OK + ✔ broadcast (4,8) + (8,) → OK + ✔ broadcast (4,8) + (4,1) → OK + ✔ transposed (32,16) contig → OK + ✔ uneven (1000,) + (1,) → OK +All 6 tests passed! +``` + +## reduction_softmax + +**命令:** `python examples/reduction_softmax/run.py` + +``` +=== Reduction Softmax — 8 test cases === + ✔ basic (4,1024) → OK + ✔ multi-row (8,4096) → OK + ✔ uneven cols (4,768) → OK + ✔ single row (1,2048) → OK + ✔ fp16 (4,1024) → OK + ✔ extreme (4,1024) large → OK + ✔ non-contiguous (4,1024) → OK + ✔ prime cols (4,1021) → OK +All 8 tests passed! +``` + +## non_contiguous_stride_case + +**命令:** `python examples/non_contiguous_stride_case/run.py` + +``` +=== Non-Contiguous Stride Tests (10 scenarios) === + ✔ contiguous (128,) → OK + ✔ transposed (32,64) contig → OK + ✔ sliced rows (16,64) view → OK + ✔ sliced cols (64,16) view → OK + ✔ sliced both (32,32) view → OK + ✔ view3d (8,8,8) -> (64,) → OK + ✔ expanded (1,128) -> (4,128) → OK + ✔ permuted (4,32,8) dims → OK + ✔ as_strided (32,64) → OK + ✔ small 1D (15,) → OK +All 10 tests passed! +``` + +## performance_regression_case + +**命令:** `python examples/performance_regression_case/run.py` + +``` +=== Performance Regression — Matmul 2D 示例 === + ✅ BLOCK=16 — M=1024, N=1024, K=1024 + ✅ BLOCK=32 — M=1024, N=1024, K=1024 + ✅ BLOCK=64x64x32 — M=1024, N=1024, K=1024 + ✅ BLOCK=128x128x32 — M=1024, N=1024, K=1024 + ... (256x512x128 同理) +🎉 所有 matmul 变体正确性验证通过! +``` + +**Benchmark 预期趋势:** + +``` +shape | BLOCK=16 | BLOCK=128 | 退化倍数 +(512,512,512)| ~2.0ms | ~0.3ms | ~6x +(1024,...) | ~12ms | ~2.0ms | ~6x +(2048,...) | ~90ms | ~15ms | ~6x +``` + +> 具体数值因 GPU 型号不同会有较大差异,但 **退化倍数 >2x** 是可靠的诊断指标。 + +## 通用断言 (Agent Validator) + +在任何示例上验证时,使用以下逻辑: + +```python +import torch + +def assert_correct(kernel_output: torch.Tensor, expected: torch.Tensor, atol=1e-3): + """确认 kernel 输出与 PyTorch 基准一致。""" + assert torch.allclose(kernel_output, expected, atol=atol, rtol=1e-3), \ + f"max diff={torch.max(torch.abs(kernel_output - expected)).item():.6f}" +``` + +如果该断言通过,则 agent 可确认实现正确。 diff --git a/skills/competition/nineops-skills/tests/selftest_tasks.md b/skills/competition/nineops-skills/tests/selftest_tasks.md new file mode 100644 index 0000000..a7bd12c --- /dev/null +++ b/skills/competition/nineops-skills/tests/selftest_tasks.md @@ -0,0 +1,59 @@ +# Selftest Tasks — Agent 自我校验任务 + +这些任务用来验证 `.skill` 工作区的完整性和 agent 能力。每个任务应能在 **不访问外部网络** 的情况下,仅基于 `.skill` 内部文档完成。 + +## 任务 1: 实现一个 elementwise 加法 kernel + +- 打开 [dsl_patterns.md](../references/dsl_patterns.md) 找到 elementwise_1d 模式 +- 使用 `Tensor` + `Symbol` + 1D arrangement + `application(ntl.add)` + `make` +- 验证: 输入 a(T), b(T) → c(T),结果应与 a+b 一致 +- 参考已有示例: [elementwise_broadcast_add](../examples/elementwise_broadcast_add/run.py) +- 预期耗时: 手动实现 ≤15 min + +## 任务 2: 测试覆盖度检查 + +- 打开 [testing_patterns.md](../references/testing_patterns.md) 检查 4 个维度 (dtype, shape, broadcast, stride) +- 对任务 1 实现的加法 kernel,编写 pytest 参数化测试覆盖至少 6 个组合 +- 验证: pytest --verbose 全部通过 +- 预期耗时: ≤20 min + +## 任务 3: AOT 编译 / 查看生成源码 + +- 对任务1的加法 kernel 调用 scripts/inspect_generated_source.sh 查看生成 Triton IR +- 确认 kernel name、参数签名、loop structure 与预期一致 +- 预期耗时: ≤10 min + +## 任务 4: 性能 Benchmark 分析 + +- 对任务1的加法 kernel,编写 benchmark 对比输入规模: (1024,), (4096,), (16384,) +- 使用 scripts/run_benchmark.sh 执行并保存结果 +- 观察: 是否存在某个规模下 kernel 慢于 PyTorch(当数据量小或带宽受限时可能出现) +- 预期耗时: ≤20 min + +## 任务 5: 识别一个性能退化 + +- 打开 [performance_regression_case](../examples/performance_regression_case/) 的 benchmark.py +- 运行 `python examples/performance_regression_case/benchmark.py` +- 观察: BLOCK_SIZE=16 比 BLOCK_SIZE=128 慢多少倍 +- 预期耗时: ≤30 min (含编译) + +## 任务 6: 非连续 stride 处理 + +- 打开 [non_contiguous_stride_case](../examples/non_contiguous_stride_case/run.py) +- 理解 10 种 stride 变体的测试方法 +- 对任务1的加法 kernel 增加非连续测试 +- 预期耗时: ≤15 min + +## 任务 7: 使用模板报告 + +- 打开 [operator_task_report_template.md](../templates/operator_task_report_template.md) +- 填写任务1~4的完整报告 +- 预期耗时: ≤10 min + +## 任务 8: 故障注入诊断 + +- 人为修改加法 kernel 的 BLOCK_SIZE 为 0 +- 运行新 kernel,观察报错信息 +- 打开 [failure_diagnosis.md](../references/failure_diagnosis.md) 按诊断流程解决 +- 使用 [failure_diagnosis_template.md](../templates/failure_diagnosis_template.md) 记录故障 +- 预期耗时: ≤10 min diff --git a/skills/competition/nineops-skills/tests/trigger_prompts.md b/skills/competition/nineops-skills/tests/trigger_prompts.md new file mode 100644 index 0000000..8b2bdae --- /dev/null +++ b/skills/competition/nineops-skills/tests/trigger_prompts.md @@ -0,0 +1,24 @@ +# Agent 触发 Prompt + +当用户提出以下类型的请求时,请引导其使用 `.skill` 工作区中的已有资源。 + +## 场景 → 触发 → 动作 + +| 场景 | 关键词 | 动作 | +|------|--------|------| +| 实现新算子 | "实现"、"写个 kernel"、"算子" | → 参考 [dsl_patterns.md](../references/dsl_patterns.md) + 选取模板 [operator_task_report_template.md](../templates/operator_task_report_template.md) | +| 跑正确性测试 | "测试"、"验证"、"run" | → 参考 [testing_patterns.md](../references/testing_patterns.md) + 使用 scripts/run_correctness.sh | +| 跑性能 benchmark | "benchmark"、"性能"、"速度"、"加速比" | → 参考 [benchmark_patterns.md](../references/benchmark_patterns.md) + 使用 scripts/run_benchmark.sh | +| 检查生成源码 | "生成的代码"、"triton代码"、"source"、"codegen" | → 参考 [generated_source_and_aot.md](../references/generated_source_and_aot.md) + 使用 scripts/inspect_generated_source.sh | +| 失败诊断 | "报错"、"失败"、"崩溃"、"error"、"错误" | → 参考 [failure_diagnosis.md](../references/failure_diagnosis.md) + 使用模板 [failure_diagnosis_template.md](../templates/failure_diagnosis_template.md) | +| 项目总体理解 | "这个 skill 是干什么的"、"怎么用" | → 阅读 [../README.md](../README.md) (根 README) | +| repo 结构 | "仓库结构"、"ninetoothed 源码" | → 参考 [repo_index.md](../references/repo_index.md) | +| 数据分析 (已跑完) | "分析结果"、"汇总"、"报告" | → 使用 scripts/collect_task_log.py 收集日志 | +| 任务完成 | "完成"、"结束了"、"可以了" | → 使用模板 [operator_task_report_template.md](../templates/operator_task_report_template.md) 生成最终报告 | + +## 提示词短句 + +- "参考 dsl_patterns 实现一个 elementwise 算子" → 打开 references/dsl_patterns.md 找到 elementwise_1d 模式 +- "用 run_correctness.sh 跑测试" → 执行 scripts/run_correctness.sh +- "跑 benchmark 看看速度" → 执行 scripts/run_benchmark.sh +- "排查编译错误" → 打开 references/failure_diagnosis.md 按分类排查