From 1550b9fc11e8608dacba4ea5b10ff7dd7d6ea78a Mon Sep 17 00:00:00 2001 From: Douglas Hensel Date: Sat, 6 Jun 2026 16:01:55 -0400 Subject: [PATCH 1/7] OCPEDGE-2727: Add eval harness configs for cluster-diagnostic and threat-model skills Add evaluation configs, test cases, and README for two skills: - cluster-diagnostic: 5 cases covering validate and recovery-guide modes - threat-model-tnf: 5 cases covering PR security analysis Co-Authored-By: Claude Opus 4.6 --- plugins/two-node/evals/README.md | 50 ++++ plugins/two-node/evals/cluster-diagnostic.md | 83 ++++++ .../two-node/evals/cluster-diagnostic.yaml | 218 ++++++++++++++++ .../annotations.yaml | 9 + .../input.yaml | 4 + .../annotations.yaml | 5 + .../case-002-validate-safe-redfish/input.yaml | 5 + .../annotations.yaml | 5 + .../input.yaml | 2 + .../annotations.yaml | 5 + .../case-004-recovery-standby/input.yaml | 2 + .../annotations.yaml | 6 + .../case-005-validate-pcs-standby/input.yaml | 4 + plugins/two-node/evals/threat-model-tnf.md | 101 +++++++ plugins/two-node/evals/threat-model-tnf.yaml | 247 ++++++++++++++++++ .../annotations.yaml | 18 ++ .../case-001-shell-script-k8s-api/input.yaml | 3 + .../annotations.yaml | 21 ++ .../input.yaml | 3 + .../annotations.yaml | 16 ++ .../case-003-mac-fencing-lookup/input.yaml | 3 + .../annotations.yaml | 9 + .../input.yaml | 3 + .../annotations.yaml | 15 ++ .../case-005-tnf-retry-bugfix/input.yaml | 3 + 25 files changed, 840 insertions(+) create mode 100644 plugins/two-node/evals/README.md create mode 100644 plugins/two-node/evals/cluster-diagnostic.md create mode 100644 plugins/two-node/evals/cluster-diagnostic.yaml create mode 100644 plugins/two-node/evals/cluster-diagnostic/cases/case-001-validate-sequential-shutdown/annotations.yaml create mode 100644 plugins/two-node/evals/cluster-diagnostic/cases/case-001-validate-sequential-shutdown/input.yaml create mode 100644 plugins/two-node/evals/cluster-diagnostic/cases/case-002-validate-safe-redfish/annotations.yaml create 
mode 100644 plugins/two-node/evals/cluster-diagnostic/cases/case-002-validate-safe-redfish/input.yaml create mode 100644 plugins/two-node/evals/cluster-diagnostic/cases/case-003-recovery-full-shutdown/annotations.yaml create mode 100644 plugins/two-node/evals/cluster-diagnostic/cases/case-003-recovery-full-shutdown/input.yaml create mode 100644 plugins/two-node/evals/cluster-diagnostic/cases/case-004-recovery-standby/annotations.yaml create mode 100644 plugins/two-node/evals/cluster-diagnostic/cases/case-004-recovery-standby/input.yaml create mode 100644 plugins/two-node/evals/cluster-diagnostic/cases/case-005-validate-pcs-standby/annotations.yaml create mode 100644 plugins/two-node/evals/cluster-diagnostic/cases/case-005-validate-pcs-standby/input.yaml create mode 100644 plugins/two-node/evals/threat-model-tnf.md create mode 100644 plugins/two-node/evals/threat-model-tnf.yaml create mode 100644 plugins/two-node/evals/threat-model-tnf/cases/case-001-shell-script-k8s-api/annotations.yaml create mode 100644 plugins/two-node/evals/threat-model-tnf/cases/case-001-shell-script-k8s-api/input.yaml create mode 100644 plugins/two-node/evals/threat-model-tnf/cases/case-002-credential-rotation-script/annotations.yaml create mode 100644 plugins/two-node/evals/threat-model-tnf/cases/case-002-credential-rotation-script/input.yaml create mode 100644 plugins/two-node/evals/threat-model-tnf/cases/case-003-mac-fencing-lookup/annotations.yaml create mode 100644 plugins/two-node/evals/threat-model-tnf/cases/case-003-mac-fencing-lookup/input.yaml create mode 100644 plugins/two-node/evals/threat-model-tnf/cases/case-004-trivial-indentation-fix/annotations.yaml create mode 100644 plugins/two-node/evals/threat-model-tnf/cases/case-004-trivial-indentation-fix/input.yaml create mode 100644 plugins/two-node/evals/threat-model-tnf/cases/case-005-tnf-retry-bugfix/annotations.yaml create mode 100644 plugins/two-node/evals/threat-model-tnf/cases/case-005-tnf-retry-bugfix/input.yaml diff --git 
a/plugins/two-node/evals/README.md b/plugins/two-node/evals/README.md new file mode 100644 index 00000000..57094e77 --- /dev/null +++ b/plugins/two-node/evals/README.md @@ -0,0 +1,50 @@ +# Evaluation Configs + +Automated quality testing for two-node plugin skills using the +[agent-eval-harness](https://github.com/opendatahub-io/agent-eval-harness) +Claude Code plugin. + +## Available Evals + +| Config | Skill | Modes Tested | Cases | +|--------|-------|--------------|-------| +| `cluster-diagnostic.yaml` | `two-node:cluster-diagnostic` | validate, recovery-guide | 5 | +| `threat-model-tnf.yaml` | `threat-model:tnf` | PR analysis | 5 | + +## Running Locally + +```bash +# Install the eval harness plugin first +/plugin marketplace add opendatahub-skills/agent-eval-harness + +# Run an eval +/eval-run --model claude-opus-4-6 --config evals/cluster-diagnostic.yaml +``` + +## Running in CI + +Comment `/test eval-cluster-diagnostic` on a PR to trigger the eval job. +The CI workflow is defined in +[openshift/release](https://github.com/openshift/release) under +`ci-operator/config/openshift-eng/edge-tooling/`. + +## Directory Structure + +``` +evals/ +├── <skill>.yaml # Eval config (judges, thresholds, schema) +├── <skill>.md # Cached skill analysis +└── <skill>/ + └── cases/ + └── case-NNN-<slug>/ + ├── input.yaml # Test input + └── annotations.yaml # Expected outcomes +``` + +## Adding a New Eval + +1. `/eval-analyze --skill <skill> --config evals/<skill>.yaml` +2. `/eval-dataset --config evals/<skill>.yaml` +3. `/eval-run --model claude-opus-4-6 --config evals/<skill>.yaml` +4. `/eval-review --run-id <run-id> --config evals/<skill>.yaml` +5. Commit the config, analysis, and cases. Run artifacts are ephemeral. 
diff --git a/plugins/two-node/evals/cluster-diagnostic.md b/plugins/two-node/evals/cluster-diagnostic.md new file mode 100644 index 00000000..345a131a --- /dev/null +++ b/plugins/two-node/evals/cluster-diagnostic.md @@ -0,0 +1,83 @@ +--- +# Auto-generated by /eval-analyze — edit to override +skill: two-node:cluster-diagnostic +analyzed_at: 2026-06-05T23:00:00Z +skill_hash: bb04c2fed029 + +# Discovered skill capabilities +execution_mode: case +headless: true +dry_run: false + +# Suggested judges (summary from analysis) +suggested_judges: + - name: budget_check + type: builtin + description: "Cost stays within $3.00 per case" + - name: severity_classification + type: check + description: "Validate mode assigns correct BLOCKER/WARNING/INFO severity" + - name: procedure_completeness + type: check + description: "Recovery-guide mode returns bash commands, verification steps, parameter templates" + - name: forbidden_recommendations + type: check + description: "Never recommends pcs standby, sequential shutdown, or shutdown -h" + - name: knowledge_base_accuracy + type: llm + description: "Response accurately reflects TNF knowledge base content" +--- + +# Skill Analysis + +The `two-node:cluster-diagnostic` skill diagnoses TNF (Two-Node Fencing) +cluster issues across 4 modes: diagnose (live SSH), validate (check proposed +procedures), recovery-guide (return correct procedures), and game (interactive +training). The skill encodes 7 validated bare metal test scenarios (HPE ProLiant +e920t, OCP 4.22.0-rc.3) into a knowledge base. + +**Eval scope**: Only `validate` and `recovery-guide` modes are testable in eval +because `diagnose` requires live SSH access and `game` requires interactive +AskUserQuestion handling. Game mode can be tested with tool interception but +adds complexity. 
+ +## Inputs + +Each test case has `input.yaml` with: +- `command_input`: Full argument string (e.g., `validate "cordon, drain, shutdown"`, + `recovery-guide full-shutdown`) +- `mode`: Which mode is being tested (`validate`, `recovery-guide`, `game`) + +And `annotations.yaml` with expected outcomes: +- `expected_blockers`: List of BLOCKER findings expected (validate mode) +- `expected_warnings`: List of WARNING findings expected +- `expected_scenario`: Scenario name (recovery-guide mode) +- `should_reject`: Whether the procedure should be rejected (validate mode) + +## Outputs + +All output is conversational — the skill writes nothing to disk. Judges use +`{{ conversation }}` to evaluate the assistant's response text. + +## Pipeline Flow + +1. Parse argument to determine mode +2. Read `cluster-knowledge-base.md` (800+ lines with 7 failure modes, severity + table, correct procedures, edge cases) +3. For validate: parse procedure text → check each step against 7 failure modes + → report BLOCKER/WARNING/INFO with explanations +4. For recovery-guide: look up scenario → return step-by-step bash commands with + parameter templates and verification steps +5. 
For game: read `game-mode.md` → present questions via AskUserQuestion → score + +## Quality Criteria + +**Deterministic** (code-checkable): +- Severity classification matches knowledge base table +- Never recommends pcs standby, sequential shutdown, or shutdown -h +- Recovery procedures include bash commands and verification steps + +**LLM judgment** (requires reasoning): +- Response accurately reflects TNF architecture facts +- Failure mode explanations reference correct root causes +- Recovery procedures match validated bare metal test results diff --git a/plugins/two-node/evals/cluster-diagnostic.yaml b/plugins/two-node/evals/cluster-diagnostic.yaml new file mode 100644 index 00000000..4385e9b1 --- /dev/null +++ b/plugins/two-node/evals/cluster-diagnostic.yaml @@ -0,0 +1,218 @@ +name: cluster-diagnostic-eval +description: Evaluate the cluster-diagnostic skill across validate, recovery-guide, and game modes +skill: two-node:cluster-diagnostic + +execution: + mode: case + arguments: "{command_input}" + timeout: 300 + max_budget_usd: 3.0 + +runner: + type: claude-code + plugin_dirs: + - plugins/two-node + +models: + skill: claude-opus-4-6 + judge: claude-opus-4-6 + hook: claude-sonnet-4-6 + +permissions: + allow: [] + deny: [] + +mlflow: + experiment: cluster-diagnostic-eval + +dataset: + path: cluster-diagnostic/cases + schema: | + Each case directory contains: + - input.yaml: YAML file with: + - 'command_input' (string): The full argument string passed to the skill. + For validate mode: 'validate ' + For recovery-guide mode: 'recovery-guide ' + For game mode: 'game' (requires AskUserQuestion interception) + - 'mode' (string): One of 'validate', 'recovery-guide', 'game' + Used by annotation-aware judges to apply mode-specific checks. 
+ - annotations.yaml: Expected outcomes for the test case: + - 'mode' (string): validate | recovery-guide | game + - 'expected_blockers' (list): BLOCKER findings expected (validate mode) + - 'expected_warnings' (list): WARNING findings expected (validate mode) + - 'expected_scenario' (string): scenario name (recovery-guide mode) + - 'should_reject' (bool): whether the procedure should be rejected (validate mode) + + Note: diagnose mode is excluded from eval because it requires live SSH + access to cluster nodes. Test validate and recovery-guide modes which + operate on text input against the knowledge base. + +inputs: + tools: + - match: Questions asked to the user via AskUserQuestion (game mode) + prompt: | + Answer based on the test case context in input.yaml and answers.yaml. + For game mode selection, pick 'quiz' unless answers.yaml specifies otherwise. + For quiz/scenario/rapid-fire answers, use answers.yaml guidance. + Default: pick the first option. + +outputs: + - path: output + schema: | + This skill produces conversation output only — no files are written to disk. + Judges should use {{ conversation }} to evaluate the assistant's response text. + + For validate mode: expect a findings list with BLOCKER/WARNING/INFO severity + classifications, each referencing a failure mode from the knowledge base. + + For recovery-guide mode: expect step-by-step markdown with bash commands + using parameter templates ($BMC_USER, $BMC_PASS, etc.) and verification steps. + + For game mode: expect interactive questions, scoring, and a final rating + (Novice/Operator/Expert/TNF Master). + +traces: + stdout: true + stderr: true + events: true + metrics: true + +judges: + - name: budget_check + builtin: cost_budget + arguments: + max_cost_usd: 3.0 + + - name: severity_classification + description: | + For validate mode: checks that BLOCKER/WARNING/INFO severity is correctly + assigned. Sequential shutdown and pcs standby must be BLOCKER. 
ForceOff + must be INFO or WARNING, not BLOCKER. Negated mentions such as 'No BLOCKER findings' do not count as a BLOCKER. + if: "annotations.get('mode') == 'validate'" + check: | + conversation = outputs.get("conversation", "") + ann = outputs.get("annotations", {}) + expected_blockers = ann.get("expected_blockers", []) + should_reject = ann.get("should_reject", False) + + if not conversation: + return (False, "No conversation output found") + + conv_upper = conversation.upper().replace("*", "").replace("`", "") # strip markdown emphasis so "No **BLOCKER**" normalizes to "NO BLOCKER" + has_blocker = "BLOCKER" in conv_upper.replace("NO BLOCKER", "").replace("0 BLOCKER", "") # a negated mention is not a finding + + if should_reject and not has_blocker: + return (False, "Procedure should have been rejected with BLOCKER but no BLOCKER found") + + if not should_reject and has_blocker: + return (False, "Procedure should NOT have BLOCKER findings but BLOCKER was found") + + found_blockers = [] + for b in expected_blockers: + if b.lower() in conversation.lower(): + found_blockers.append(b) + + if expected_blockers and not found_blockers: + return (False, f"Expected blockers {expected_blockers} not found in output") + + return (True, f"Severity classification correct. Blockers found: {found_blockers}") + + - name: procedure_completeness + description: | + For recovery-guide mode: checks that the returned procedure includes + bash commands, verification steps, and parameter templates. 
+ if: "annotations.get('mode') == 'recovery-guide'" + check: | + conversation = outputs.get("conversation", "") + + if not conversation: + return (False, "No conversation output found") + + checks = { + "bash_commands": any(marker in conversation for marker in ["```bash", "```sh", "curl ", "pcs ", "oc "]), + "has_verification": any(w in conversation.lower() for w in ["verify", "confirm", "check", "poll", "wait"]), + "has_parameters": any(p in conversation for p in ["$BMC", "$NODE", "BMC_USER", "BMC_PASS", "BMC_HOST"]), + } + + passed = sum(checks.values()) + total = len(checks) + failed = [k for k, v in checks.items() if not v] + + if passed == total: + return (True, f"All {total} completeness checks passed") + else: + return (False, f"{passed}/{total} checks passed. Missing: {failed}") + + - name: forbidden_recommendations + description: | + Checks that the skill never recommends procedures that violate known + failure modes: pcs node standby, sequential shutdown, shutdown -h. + check: | + conversation = outputs.get("conversation", "") + ann = outputs.get("annotations", {}) + mode = ann.get("mode", "") + + if not conversation: + return (False, "No conversation output found") + + # Only check recommendations sections, not quoted failure descriptions + forbidden = [] + + # Check if skill RECOMMENDS (not just mentions) dangerous procedures + conv_lower = conversation.lower() + + recommend_sections = [] + for marker in ["recommend", "suggested", "recovery", "procedure", "steps to"]: + idx = conv_lower.find(marker) + if idx >= 0: + recommend_sections.append(conversation[idx:idx+500]) + + for section in recommend_sections: + sec_lower = section.lower() + if "pcs node standby" in sec_lower and "never" not in sec_lower and "do not" not in sec_lower: + forbidden.append("pcs node standby recommended") + if "shutdown -h 1" in sec_lower and "never" not in sec_lower and "do not" not in sec_lower: + forbidden.append("shutdown -h 1 recommended") + + if forbidden: + return 
(False, f"Forbidden recommendations found: {forbidden}") + return (True, "No forbidden procedures recommended") + + - name: knowledge_base_accuracy + description: | + LLM judge that evaluates whether the skill's response accurately + reflects the TNF knowledge base content — correct failure modes, + proper severity classification reasoning, and accurate recovery procedures. + prompt: | + Evaluate whether this cluster-diagnostic skill response is accurate + and complete for the given mode. + + ## Skill Response + {{ conversation }} + + ## Test Case Annotations + {{ annotations }} + + ## Scoring Criteria + + Score 1-5: + - 5: Response is fully accurate, references correct failure modes, + severity is properly justified, procedures match tested bare metal results + - 4: Minor omissions but no inaccuracies, severity is correct + - 3: Mostly accurate but missing important details or has minor inaccuracy + - 2: Contains inaccurate claims about TNF behavior or recommends untested procedures + - 1: Fundamentally incorrect — wrong failure modes, wrong severity, dangerous recommendations + + Return a JSON object: {"score": <1-5>, "rationale": ""} + +thresholds: + budget_check: + min_pass_rate: 1.0 + severity_classification: + min_pass_rate: 0.8 + procedure_completeness: + min_pass_rate: 0.8 + forbidden_recommendations: + min_pass_rate: 1.0 + knowledge_base_accuracy: + min_mean: 3.5 diff --git a/plugins/two-node/evals/cluster-diagnostic/cases/case-001-validate-sequential-shutdown/annotations.yaml b/plugins/two-node/evals/cluster-diagnostic/cases/case-001-validate-sequential-shutdown/annotations.yaml new file mode 100644 index 00000000..217fedab --- /dev/null +++ b/plugins/two-node/evals/cluster-diagnostic/cases/case-001-validate-sequential-shutdown/annotations.yaml @@ -0,0 +1,9 @@ +mode: validate +expected_blockers: + - sequential shutdown + - shutdown -h +expected_warnings: + - cordon + - drain +expected_scenario: null +should_reject: true diff --git 
a/plugins/two-node/evals/cluster-diagnostic/cases/case-001-validate-sequential-shutdown/input.yaml b/plugins/two-node/evals/cluster-diagnostic/cases/case-001-validate-sequential-shutdown/input.yaml new file mode 100644 index 00000000..4186c5e7 --- /dev/null +++ b/plugins/two-node/evals/cluster-diagnostic/cases/case-001-validate-sequential-shutdown/input.yaml @@ -0,0 +1,4 @@ +command_input: >- + validate "cordon all nodes, drain workloads, then shut down each node + one at a time using shutdown -h 1 via oc debug" +mode: validate diff --git a/plugins/two-node/evals/cluster-diagnostic/cases/case-002-validate-safe-redfish/annotations.yaml b/plugins/two-node/evals/cluster-diagnostic/cases/case-002-validate-safe-redfish/annotations.yaml new file mode 100644 index 00000000..c762f6b6 --- /dev/null +++ b/plugins/two-node/evals/cluster-diagnostic/cases/case-002-validate-safe-redfish/annotations.yaml @@ -0,0 +1,5 @@ +mode: validate +expected_blockers: [] +expected_warnings: [] +expected_scenario: null +should_reject: false diff --git a/plugins/two-node/evals/cluster-diagnostic/cases/case-002-validate-safe-redfish/input.yaml b/plugins/two-node/evals/cluster-diagnostic/cases/case-002-validate-safe-redfish/input.yaml new file mode 100644 index 00000000..13b06b84 --- /dev/null +++ b/plugins/two-node/evals/cluster-diagnostic/cases/case-002-validate-safe-redfish/input.yaml @@ -0,0 +1,5 @@ +command_input: >- + validate "Send Redfish GracefulShutdown to both nodes simultaneously + using curl, poll PowerState until Off, then send On to both nodes + to restart" +mode: validate diff --git a/plugins/two-node/evals/cluster-diagnostic/cases/case-003-recovery-full-shutdown/annotations.yaml b/plugins/two-node/evals/cluster-diagnostic/cases/case-003-recovery-full-shutdown/annotations.yaml new file mode 100644 index 00000000..4b741409 --- /dev/null +++ b/plugins/two-node/evals/cluster-diagnostic/cases/case-003-recovery-full-shutdown/annotations.yaml @@ -0,0 +1,5 @@ +mode: recovery-guide 
+expected_blockers: [] +expected_warnings: [] +expected_scenario: full-shutdown +should_reject: false diff --git a/plugins/two-node/evals/cluster-diagnostic/cases/case-003-recovery-full-shutdown/input.yaml b/plugins/two-node/evals/cluster-diagnostic/cases/case-003-recovery-full-shutdown/input.yaml new file mode 100644 index 00000000..58326c61 --- /dev/null +++ b/plugins/two-node/evals/cluster-diagnostic/cases/case-003-recovery-full-shutdown/input.yaml @@ -0,0 +1,2 @@ +command_input: recovery-guide full-shutdown +mode: recovery-guide diff --git a/plugins/two-node/evals/cluster-diagnostic/cases/case-004-recovery-standby/annotations.yaml b/plugins/two-node/evals/cluster-diagnostic/cases/case-004-recovery-standby/annotations.yaml new file mode 100644 index 00000000..66d323bb --- /dev/null +++ b/plugins/two-node/evals/cluster-diagnostic/cases/case-004-recovery-standby/annotations.yaml @@ -0,0 +1,5 @@ +mode: recovery-guide +expected_blockers: [] +expected_warnings: [] +expected_scenario: standby +should_reject: false diff --git a/plugins/two-node/evals/cluster-diagnostic/cases/case-004-recovery-standby/input.yaml b/plugins/two-node/evals/cluster-diagnostic/cases/case-004-recovery-standby/input.yaml new file mode 100644 index 00000000..dda90015 --- /dev/null +++ b/plugins/two-node/evals/cluster-diagnostic/cases/case-004-recovery-standby/input.yaml @@ -0,0 +1,2 @@ +command_input: recovery-guide standby +mode: recovery-guide diff --git a/plugins/two-node/evals/cluster-diagnostic/cases/case-005-validate-pcs-standby/annotations.yaml b/plugins/two-node/evals/cluster-diagnostic/cases/case-005-validate-pcs-standby/annotations.yaml new file mode 100644 index 00000000..f0ee61ee --- /dev/null +++ b/plugins/two-node/evals/cluster-diagnostic/cases/case-005-validate-pcs-standby/annotations.yaml @@ -0,0 +1,6 @@ +mode: validate +expected_blockers: + - pcs node standby +expected_warnings: [] +expected_scenario: null +should_reject: true diff --git 
a/plugins/two-node/evals/cluster-diagnostic/cases/case-005-validate-pcs-standby/input.yaml b/plugins/two-node/evals/cluster-diagnostic/cases/case-005-validate-pcs-standby/input.yaml new file mode 100644 index 00000000..9ef90652 --- /dev/null +++ b/plugins/two-node/evals/cluster-diagnostic/cases/case-005-validate-pcs-standby/input.yaml @@ -0,0 +1,4 @@ +command_input: >- + validate "Put both nodes in standby using pcs node standby --all, + wait for resources to stop, then power off the servers" +mode: validate diff --git a/plugins/two-node/evals/threat-model-tnf.md b/plugins/two-node/evals/threat-model-tnf.md new file mode 100644 index 00000000..4dc11d23 --- /dev/null +++ b/plugins/two-node/evals/threat-model-tnf.md @@ -0,0 +1,101 @@ +--- +# Auto-generated by /eval-analyze — edit to override +skill: threat-model:tnf +analyzed_at: 2026-06-05T00:00:00Z +skill_hash: ca8e410b0d9b + +# Discovered skill capabilities +execution_mode: case +headless: true +dry_run: false + +# Suggested judges (summary from analysis) +suggested_judges: + - name: budget_check + type: builtin + description: "Cost stays under $8 per invocation" + - name: report_exists + type: check + description: "PR-THREAT-MODEL-.md file was generated" + - name: report_sections_complete + type: check + description: "All 9 required sections present in report" + - name: dfd_elements_mapped + type: check + description: "DFD element IDs (P/DS/DF/EE/TB) referenced in report" + - name: stride_matrix_present + type: check + description: "Per-element STRIDE matrix has X/~/-/N/A markers" + - name: mitre_techniques_assigned + type: check + description: "MITRE ATT&CK technique IDs (T####) are present" + - name: threat_analysis_quality + type: llm + description: "Overall quality: severity accuracy, DFD mapping, STRIDE completeness, recommendations" + - name: findings_tracker_updated + type: check + description: "Cumulative findings tracker was appended" +--- + +## Skill Analysis + +The `threat-model:tnf` skill performs 
security threat analysis on GitHub PRs affecting the TNF (Two-Node Fencing) OpenShift topology. It combines three approaches: + +1. **Automated scanning** — runs ShellCheck on shell scripts in the PR diff +2. **Pattern detection** — searches for command injection, credential exposure, privilege escalation, and 7 other security pattern categories +3. **Formal threat modeling** — maps code changes to TNF DFD elements (8 processes, 5 data stores, 12 data flows, 3 external entities, 6 trust boundaries), applies per-element STRIDE analysis, and cross-references against the formal TNF threat model + +Output is a formal threat model report with MITRE ATT&CK technique mappings, OWASP Top 10:2025 categorization, risk assessment, and actionable recommendations for developers and customers. + +## Inputs + +Each test case provides a single PR identifier via `input.yaml`: + +- **`pr_input`** — the PR to analyze, in one of three formats: + - PR number only: `2156` (repo detected from working directory) + - GitHub URL: `https://github.com/ClusterLabs/resource-agents/pull/2156` + - Repo + number: `resource-agents 2156` + +The PR must be a real, accessible GitHub PR. The skill uses `gh pr view` and `gh pr diff` to fetch PR data. + +Optional fields: `repo` (repository name), `org` (GitHub organization). + +## Outputs + +The skill writes to a `reports/` directory (resolved via workspace discovery): + +- **`PR-THREAT-MODEL-.md`** — main threat model report (~200-500 lines) +- **`VULN-PR-.md`** — individual vulnerability tickets (Critical/High only, optional) + +It also appends to a cumulative findings tracker at `$WORKSPACE/.claude/skills/threat-model/mitre-findings-tnf.md`. + +## Pipeline Flow + +1. **Workspace discovery** — walk up from CWD looking for `repos/` directory; set WORKSPACE, REPOS, THREAT_MODEL_DIR, REPORT_DIR, FINDINGS_FILE +2. **Parse input** — extract org, repo, PR number from the three input formats +3. 
**Fetch PR** — `gh pr view` for metadata, `gh pr diff` for the full diff +4. **ShellCheck** — run on any .sh files; map security codes (SC2086→T1059) to MITRE +5. **Pattern analysis** — search diff for 10 security pattern categories +6. **DFD mapping** — match code paths to TNF elements using the mapping table in `dfd-elements-tnf.md` +7. **STRIDE analysis** — per-element threat assessment; cross-reference against TNF-THREAT-MODEL.md if available +8. **Combine findings** — deduplicate, assign VULN-N IDs, determine severity +9. **MITRE/OWASP mapping** — assign technique IDs and OWASP categories using reference files +10. **Generate report** — write markdown report using report-templates.md format +11. **Append tracker** — add findings block to cumulative mitre-findings-tnf.md + +## Quality Criteria + +A **good** report: +- Correctly identifies all affected DFD elements from the code paths in the PR +- Applies STRIDE systematically to each element (all 6 categories for processes, T/I/D for stores and flows) +- Assigns accurate severity levels matching MITRE/OWASP standards +- Identifies trust boundary crossings (especially TB3→TB4, TB4→TB5) +- Provides specific, actionable recommendations with code-level guidance +- Maps findings to correct MITRE techniques (T1059 for injection, T1552 for credentials, T1611 for container escape) + +A **bad** report: +- Misses affected DFD elements or assigns wrong elements to code paths +- Has incomplete STRIDE matrix (missing categories or missing rationale) +- Over/under-rates severity (e.g., calling a minor code quality issue "Critical") +- Provides vague recommendations ("improve security") without specific guidance +- Missing sections or incorrect report structure diff --git a/plugins/two-node/evals/threat-model-tnf.yaml b/plugins/two-node/evals/threat-model-tnf.yaml new file mode 100644 index 00000000..672480b4 --- /dev/null +++ b/plugins/two-node/evals/threat-model-tnf.yaml @@ -0,0 +1,247 @@ +name: threat-model-tnf-eval 
+description: Evaluate the threat-model:tnf skill — PR security analysis with STRIDE/DFD, MITRE ATT&CK, and OWASP mapping for TNF topology +skill: threat-model:tnf + +execution: + mode: case + arguments: "{pr_input}" + timeout: 600 + max_budget_usd: 8.0 + +runner: + type: claude-code + plugin_dirs: + - plugins/threat-model + +models: + skill: claude-opus-4-6 + judge: claude-opus-4-6 + +permissions: + allow: [] + deny: [] + +mlflow: + experiment: threat-model-tnf-eval + +dataset: + path: threat-model-tnf/cases + schema: | + Each case directory contains: + - input.yaml: YAML file with fields: + - 'pr_input': the PR identifier to analyze — one of: + - A PR number (e.g., '2156') + - A GitHub URL (e.g., 'https://github.com/ClusterLabs/resource-agents/pull/2156') + - A 'repo number' pair (e.g., 'resource-agents 2156') + [EXTERNAL: GitHub] — must be a real, accessible PR on GitHub + - 'repo' (optional): repository name for context (e.g., 'resource-agents') + - 'org' (optional): GitHub org (e.g., 'ClusterLabs', 'openshift') + - reference.md (optional): gold-standard threat model report for comparison. + Uses the report template format: Executive Summary, DFD Impact Analysis, + Per-Element STRIDE matrix, Threat Analysis (VULN-N sections), MITRE/OWASP + mapping, Risk Assessment, and Recommendations. 
+ - annotations.yaml (optional): expected metadata for outcome-aware scoring: + - 'expected_vuln_count': expected number of findings + - 'expected_severities': list of expected severity levels + - 'affected_dfd_elements': list of expected DFD element IDs (e.g., ['P5', 'P7', 'DS3']) + - 'expected_mitre_techniques': list of expected MITRE technique IDs + - 'has_shell_scripts': whether the PR contains shell scripts for ShellCheck + - 'has_trust_boundary_crossing': whether the PR crosses trust boundaries + +outputs: + - path: reports + schema: | + The skill writes threat model reports as markdown files: + - PR-THREAT-MODEL-.md: main report with sections: + - Document header (version, date, classification, repo, topology, author, URL) + - Executive Summary with findings count table (Critical/High/Medium/Low) + - Change Overview describing the PR and security-relevant changes + - Affected Files table (file path, line changes, security relevance) + - DFD Impact Analysis: + - Affected DFD Elements table (Element ID, Name, Impact, Trust Boundary) + - Trust Boundary Crossings narrative + - Per-Element STRIDE matrix (S/T/R/I/D/E per element, X/~/-/N/A) + - Threat Model Cross-Reference table (PE-* IDs if formal model exists) + - Automated Scanner Results (ShellCheck table or "skipped" note) + - Threat Analysis: per-VULN section with Severity, OWASP, MITRE, CWE, + Affected Code, Description, Attack Vector, Impact (CIA), Recommended Fix + - OWASP & MITRE ATT&CK Mapping table + - Risk Assessment table (Likelihood, Impact, Risk) + - Recommendations (Developers: Before/After Merge; Customers: Config/Ops) + - References + - VULN-PR-.md (optional): individual vulnerability tickets + for Critical/High findings only + +traces: + stdout: true + stderr: true + events: true + metrics: true + +judges: + - name: budget_check + builtin: cost_budget + arguments: + max_cost_usd: 8.0 + + - name: report_exists + description: Verify that the main threat model report markdown file was generated + 
check: | + files = outputs.get("files", {}) + reports = [k for k in files if "THREAT-MODEL" in k and k.endswith(".md")] + if not reports: + return (False, "No threat model report file found") + return (True, f"Report generated: {reports[0]}") + + - name: report_sections_complete + description: Verify all required report sections are present in the generated report + check: | + files = outputs.get("files", {}) + reports = {k: v for k, v in files.items() if "THREAT-MODEL" in k and k.endswith(".md")} + if not reports: + return (False, "No report file found") + content = list(reports.values())[0] + required = [ + "Executive Summary", + "Change Overview", + "Affected Files", + "DFD Impact Analysis", + "STRIDE", + "Threat Analysis", + "MITRE", + "Risk Assessment", + "Recommendations", + ] + missing = [s for s in required if s not in content] + if missing: + return (False, f"Missing sections: {', '.join(missing)}") + return (True, f"All {len(required)} required sections present") + + - name: dfd_elements_mapped + description: Verify that DFD elements (P1-P8, DS1-DS5, DF1-DF12) are referenced in the report + check: | + import re + files = outputs.get("files", {}) + reports = {k: v for k, v in files.items() if "THREAT-MODEL" in k and k.endswith(".md")} + if not reports: + return (False, "No report file found") + content = list(reports.values())[0] + elements = re.findall(r'\b(P[1-8]|DS[1-5]|DF(?:1[0-2]|[1-9])|EE[1-3]|TB[1-6])\b', content) + unique = set(elements) + if not unique: + return (False, "No DFD elements found in report") + return (True, f"DFD elements referenced: {sorted(unique)}") + + - name: stride_matrix_present + description: Verify the per-element STRIDE matrix is populated with X, ~, or - markers + check: | + import re + files = outputs.get("files", {}) + reports = {k: v for k, v in files.items() if "THREAT-MODEL" in k and k.endswith(".md")} + if not reports: + return (False, "No report file found") + content = list(reports.values())[0] + stride_section = 
content.split("Per-Element STRIDE") + if len(stride_section) < 2: + return (False, "No Per-Element STRIDE section found") + markers = re.findall(r'\|\s*\*{0,2}(?:[Xx~-]|[Nn]/[Aa])\*{0,2}\s*(?=\|)', stride_section[1][:2000]) + if len(markers) < 3: + return (False, f"STRIDE matrix appears empty or minimal ({len(markers)} markers)") + return (True, f"STRIDE matrix populated ({len(markers)} cell markers found)") + + - name: mitre_techniques_assigned + description: Verify MITRE ATT&CK technique IDs (T####) are present in the report + check: | + import re + files = outputs.get("files", {}) + reports = {k: v for k, v in files.items() if "THREAT-MODEL" in k and k.endswith(".md")} + if not reports: + return (False, "No report file found") + content = list(reports.values())[0] + techniques = set(re.findall(r'(?<![A-Za-z0-9])T\d{4}(?!\d)', content)) + if not techniques: + return (False, "No MITRE ATT&CK technique IDs found") + return (True, f"MITRE techniques: {sorted(techniques)}") + + - name: threat_analysis_quality + description: | + LLM judge assessing overall threat analysis quality: severity accuracy, + DFD mapping correctness, STRIDE completeness, and recommendation actionability + prompt: | + You are evaluating a TNF (Two-Node Fencing) PR threat analysis report. + + ## Report output: + + {{ outputs }} + + ## Skill conversation: + + {{ conversation }} + + ## Evaluation criteria + + Score on a 1-5 scale across these dimensions, then give an overall score: + + **1. Severity accuracy** — do assigned severities (Critical/High/Medium/Low) match the actual risk? + - Critical: RCE, credential exposure at high-trust boundary (P5/P6/P8), STONITH bypass + - High: command injection with exploitation path, new credential dependency, missing validation on network boundary + - Medium: fail-open behavior, non-critical info disclosure, potential race condition + - Low: minor code quality, non-exploitable pattern + + **2. DFD mapping correctness** — are code changes correctly mapped to TNF DFD elements (P1-P8, DS1-DS5, DF1-DF12)? 
+ - Code paths should match the element mapping table (e.g., cluster-etcd-operator/pkg/tnf/fencing/ → P5) + - Trust boundary crossings should be identified (TB2→TB3, TB3→TB4, TB4→TB5) + + **3. STRIDE completeness** — is each affected element analyzed across all applicable STRIDE categories? + - Processes: all 6 (S,T,R,I,D,E) + - Data Stores: T,I,D + - Data Flows: T,I,D + - External Entities: S,R + + **4. MITRE/OWASP accuracy** — are technique assignments correct? + - T1059 for command injection, T1552 for credential exposure, T1611 for container escape + - OWASP categories should match (A05 for injection, A07 for auth failures) + + **5. Recommendation quality** — are recommendations specific and actionable? + - Developer recommendations should include code-level guidance + - Customer recommendations should include hardening or monitoring steps + - Vague recommendations ("improve security") score low + + ## Scoring + Score 1: Report is missing major sections, contains incorrect mappings, or has no useful findings + Score 2: Report exists but has significant gaps — missing STRIDE analysis, wrong DFD elements, or vague recommendations + Score 3: Adequate report covering basics — correct elements identified, some STRIDE analysis, generic recommendations + Score 4: Good report — accurate DFD mapping, thorough STRIDE, relevant MITRE techniques, specific recommendations + Score 5: Excellent — comprehensive coverage, all trust boundaries analyzed, accurate severity, actionable recommendations with code examples + + Respond with a single integer score (1-5) on the first line, then explain your reasoning. 
+ + - name: findings_tracker_updated + description: Verify the findings tracker was appended with new entries (checks conversation for append confirmation) + check: | + conv = outputs.get("conversation", "") + files = outputs.get("files", {}) + tracker_files = [k for k in files if "mitre-findings" in k.lower()] + if tracker_files: + return (True, f"Findings tracker file found: {tracker_files[0]}") + if "findings" in conv.lower() and ("append" in conv.lower() or "tracker" in conv.lower()): + return (True, "Findings tracker update mentioned in conversation") + return (False, "No evidence of findings tracker update") + +thresholds: + budget_check: + min_pass_rate: 1.0 + report_exists: + min_pass_rate: 1.0 + report_sections_complete: + min_pass_rate: 1.0 + dfd_elements_mapped: + min_pass_rate: 1.0 + stride_matrix_present: + min_pass_rate: 0.8 + mitre_techniques_assigned: + min_pass_rate: 1.0 + threat_analysis_quality: + min_mean: 3.5 + findings_tracker_updated: + min_pass_rate: 0.8 diff --git a/plugins/two-node/evals/threat-model-tnf/cases/case-001-shell-script-k8s-api/annotations.yaml b/plugins/two-node/evals/threat-model-tnf/cases/case-001-shell-script-k8s-api/annotations.yaml new file mode 100644 index 00000000..751425b7 --- /dev/null +++ b/plugins/two-node/evals/threat-model-tnf/cases/case-001-shell-script-k8s-api/annotations.yaml @@ -0,0 +1,18 @@ +description: > + Shell script PR that adds kubeconfig-based K8s API access to the podman-etcd + OCF agent. Introduces new trust boundary crossing (TB4→TB2) and credential + dependency. Should trigger ShellCheck analysis and identify credential exposure. 
+has_shell_scripts: true +has_trust_boundary_crossing: true +expected_severities: + - High + - Medium + - Low +affected_dfd_elements: + - P7 + - DS5 + - DF11 +expected_mitre_techniques: + - T1552 + - T1078 + - T1005 diff --git a/plugins/two-node/evals/threat-model-tnf/cases/case-001-shell-script-k8s-api/input.yaml b/plugins/two-node/evals/threat-model-tnf/cases/case-001-shell-script-k8s-api/input.yaml new file mode 100644 index 00000000..37361080 --- /dev/null +++ b/plugins/two-node/evals/threat-model-tnf/cases/case-001-shell-script-k8s-api/input.yaml @@ -0,0 +1,3 @@ +pr_input: "https://github.com/ClusterLabs/resource-agents/pull/2156" +repo: resource-agents +org: ClusterLabs diff --git a/plugins/two-node/evals/threat-model-tnf/cases/case-002-credential-rotation-script/annotations.yaml b/plugins/two-node/evals/threat-model-tnf/cases/case-002-credential-rotation-script/annotations.yaml new file mode 100644 index 00000000..7d6e2436 --- /dev/null +++ b/plugins/two-node/evals/threat-model-tnf/cases/case-002-credential-rotation-script/annotations.yaml @@ -0,0 +1,21 @@ +description: > + Adds a TNF fencing credentials rotation script. This touches the full credential + flow path (DS2→P5→DS3) and should identify high-severity findings around + credential handling, STONITH configuration, and BMC access. Complex case with + multiple DFD elements affected. 
+has_shell_scripts: true +has_trust_boundary_crossing: true +expected_severities: + - Critical + - High + - Medium +affected_dfd_elements: + - P5 + - DS2 + - DS3 + - DF4 + - DF7 +expected_mitre_techniques: + - T1552 + - T1059 + - T1529 diff --git a/plugins/two-node/evals/threat-model-tnf/cases/case-002-credential-rotation-script/input.yaml b/plugins/two-node/evals/threat-model-tnf/cases/case-002-credential-rotation-script/input.yaml new file mode 100644 index 00000000..cd609686 --- /dev/null +++ b/plugins/two-node/evals/threat-model-tnf/cases/case-002-credential-rotation-script/input.yaml @@ -0,0 +1,3 @@ +pr_input: "cluster-etcd-operator 1611" +repo: cluster-etcd-operator +org: openshift diff --git a/plugins/two-node/evals/threat-model-tnf/cases/case-003-mac-fencing-lookup/annotations.yaml b/plugins/two-node/evals/threat-model-tnf/cases/case-003-mac-fencing-lookup/annotations.yaml new file mode 100644 index 00000000..b62f1b9c --- /dev/null +++ b/plugins/two-node/evals/threat-model-tnf/cases/case-003-mac-fencing-lookup/annotations.yaml @@ -0,0 +1,16 @@ +description: > + Adds MAC-address based fencing credentials lookup. Introduces a new data flow + for credential resolution and modifies the fencing job's credential discovery + path. Tests DFD mapping for P5 and the credential pipeline. 
+has_shell_scripts: false +has_trust_boundary_crossing: true +expected_severities: + - High + - Medium +affected_dfd_elements: + - P5 + - DS2 + - DF4 +expected_mitre_techniques: + - T1552 + - T1078 diff --git a/plugins/two-node/evals/threat-model-tnf/cases/case-003-mac-fencing-lookup/input.yaml b/plugins/two-node/evals/threat-model-tnf/cases/case-003-mac-fencing-lookup/input.yaml new file mode 100644 index 00000000..34abba7b --- /dev/null +++ b/plugins/two-node/evals/threat-model-tnf/cases/case-003-mac-fencing-lookup/input.yaml @@ -0,0 +1,3 @@ +pr_input: "https://github.com/openshift/cluster-etcd-operator/pull/1600" +repo: cluster-etcd-operator +org: openshift diff --git a/plugins/two-node/evals/threat-model-tnf/cases/case-004-trivial-indentation-fix/annotations.yaml b/plugins/two-node/evals/threat-model-tnf/cases/case-004-trivial-indentation-fix/annotations.yaml new file mode 100644 index 00000000..7a2fc3ac --- /dev/null +++ b/plugins/two-node/evals/threat-model-tnf/cases/case-004-trivial-indentation-fix/annotations.yaml @@ -0,0 +1,9 @@ +description: > + Trivial indentation fix in nfsserver — not TNF-specific, no shell scripts + relevant to TNF. Should produce a report with minimal or no security findings. + Edge case testing the skill's handling of low-risk, non-TNF PRs. 
+has_shell_scripts: false +has_trust_boundary_crossing: false +expected_severities: [] +affected_dfd_elements: [] +expected_mitre_techniques: [] diff --git a/plugins/two-node/evals/threat-model-tnf/cases/case-004-trivial-indentation-fix/input.yaml b/plugins/two-node/evals/threat-model-tnf/cases/case-004-trivial-indentation-fix/input.yaml new file mode 100644 index 00000000..f883d7a3 --- /dev/null +++ b/plugins/two-node/evals/threat-model-tnf/cases/case-004-trivial-indentation-fix/input.yaml @@ -0,0 +1,3 @@ +pr_input: "https://github.com/ClusterLabs/resource-agents/pull/2168" +repo: resource-agents +org: ClusterLabs diff --git a/plugins/two-node/evals/threat-model-tnf/cases/case-005-tnf-retry-bugfix/annotations.yaml b/plugins/two-node/evals/threat-model-tnf/cases/case-005-tnf-retry-bugfix/annotations.yaml new file mode 100644 index 00000000..d9d3e7b2 --- /dev/null +++ b/plugins/two-node/evals/threat-model-tnf/cases/case-005-tnf-retry-bugfix/annotations.yaml @@ -0,0 +1,15 @@ +description: > + Bug fix gating dual-replica setup and adding retry logic in TNF pipeline. + Modifies P4 (Setup Job) behavior. Tests whether the skill correctly identifies + denial-of-service risk from retry logic changes and setup gate modifications. + Uses bare PR number format — tests repo auto-detection from CWD. 
+has_shell_scripts: false +has_trust_boundary_crossing: false +expected_severities: + - Medium + - Low +affected_dfd_elements: + - P4 + - P2 +expected_mitre_techniques: + - T1499 diff --git a/plugins/two-node/evals/threat-model-tnf/cases/case-005-tnf-retry-bugfix/input.yaml b/plugins/two-node/evals/threat-model-tnf/cases/case-005-tnf-retry-bugfix/input.yaml new file mode 100644 index 00000000..2ccb92a2 --- /dev/null +++ b/plugins/two-node/evals/threat-model-tnf/cases/case-005-tnf-retry-bugfix/input.yaml @@ -0,0 +1,3 @@ +pr_input: "1620" +repo: cluster-etcd-operator +org: openshift From 9cc0f5be2d66720e2cf420aeb6e6f6c8fc027f5a Mon Sep 17 00:00:00 2001 From: Douglas Hensel Date: Sun, 7 Jun 2026 16:56:43 -0400 Subject: [PATCH 2/7] Update evals README with detailed pipeline steps Co-Authored-By: Claude Opus 4.6 --- plugins/two-node/evals/README.md | 41 ++++++++++++++++++++++++++------ 1 file changed, 34 insertions(+), 7 deletions(-) diff --git a/plugins/two-node/evals/README.md b/plugins/two-node/evals/README.md index 57094e77..bf30e5a6 100644 --- a/plugins/two-node/evals/README.md +++ b/plugins/two-node/evals/README.md @@ -14,13 +14,15 @@ Claude Code plugin. ## Running Locally ```bash -# Install the eval harness plugin first +# Install the eval harness plugin /plugin marketplace add opendatahub-skills/agent-eval-harness -# Run an eval +# Run an existing eval /eval-run --model claude-opus-4-6 --config evals/cluster-diagnostic.yaml ``` +To create a new eval, see [Adding a New Eval](#adding-a-new-eval) below. + ## Running in CI Comment `/test eval-cluster-diagnostic` on a PR to trigger the eval job. @@ -43,8 +45,33 @@ evals/ ## Adding a New Eval -1. `/eval-analyze --skill --config evals/.yaml` -2. `/eval-dataset --config evals/.yaml` -3. `/eval-run --model claude-opus-4-6 --config evals/.yaml` -4. `/eval-review --run-id --config evals/.yaml` -5. Commit the config, analysis, and cases. Run artifacts are ephemeral. +1. 
**Analyze the skill** — reads SKILL.md, designs judges, writes the eval config + ``` + /eval-analyze --skill --config evals/.yaml + ``` + +2. **Generate test cases** — creates `input.yaml` + `annotations.yaml` per case + ``` + /eval-dataset --config evals/.yaml + ``` + +3. **Run the eval** — executes the skill against each case, scores with judges, generates HTML report + ``` + /eval-run --model claude-opus-4-6 --config evals/.yaml + ``` + +4. **Review results** — walk through cases, collect human feedback + ``` + /eval-review --run-id --config evals/.yaml + ``` + +5. **(Optional) Optimize** — auto-fix SKILL.md based on judge failures, re-run to verify + ``` + /eval-optimize --config evals/.yaml + ``` + +6. **Commit and CI** + - Commit `evals/.yaml`, `evals/.md`, and `evals//cases/` to this repo + - Add a CI entry in [openshift/release](https://github.com/openshift/release) + pointing `EVAL_CONFIG` to the yaml path + - PR reviewers can then trigger the eval with `/test eval-` From a82011c2d6841c7bc8d80d5d8ebf2c0af6184811 Mon Sep 17 00:00:00 2001 From: Douglas Hensel Date: Mon, 8 Jun 2026 08:54:31 -0400 Subject: [PATCH 3/7] Fix dataset.path to use repo-root-relative paths The eval harness resolves dataset.path from the repo root, not relative to the config file. Both configs were using short relative paths that broke when running from different working directories. 
Co-Authored-By: Claude Opus 4.6 --- plugins/two-node/evals/cluster-diagnostic.yaml | 2 +- plugins/two-node/evals/threat-model-tnf.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/plugins/two-node/evals/cluster-diagnostic.yaml b/plugins/two-node/evals/cluster-diagnostic.yaml index 4385e9b1..35d8c75a 100644 --- a/plugins/two-node/evals/cluster-diagnostic.yaml +++ b/plugins/two-node/evals/cluster-diagnostic.yaml @@ -26,7 +26,7 @@ mlflow: experiment: cluster-diagnostic-eval dataset: - path: cluster-diagnostic/cases + path: plugins/two-node/evals/cluster-diagnostic/cases schema: | Each case directory contains: - input.yaml: YAML file with: diff --git a/plugins/two-node/evals/threat-model-tnf.yaml b/plugins/two-node/evals/threat-model-tnf.yaml index 672480b4..2d5f3b1b 100644 --- a/plugins/two-node/evals/threat-model-tnf.yaml +++ b/plugins/two-node/evals/threat-model-tnf.yaml @@ -25,7 +25,7 @@ mlflow: experiment: threat-model-tnf-eval dataset: - path: threat-model-tnf/cases + path: plugins/two-node/evals/threat-model-tnf/cases schema: | Each case directory contains: - input.yaml: YAML file with fields: From 3a5ee91ea5b1f8ea5736c6b42a8d094f71829ed9 Mon Sep 17 00:00:00 2001 From: Douglas Hensel Date: Thu, 11 Jun 2026 11:33:19 -0400 Subject: [PATCH 4/7] Add game mode eval case and improve judges - Add case-006-game-quiz with quiz mode test case and answers - Add warning_classification judge for expected WARNING findings - Add game_mode_scoring judge for rating/score validation - Fix forbidden_recommendations to check 'shutdown -h' (not 'shutdown -h 1') - Update severity_classification description for clarity - Drop models.skill default (let CLI --model flag control it) - Simplify schema note to only exclude diagnose mode Co-Authored-By: Claude Opus 4.6 --- .../two-node/evals/cluster-diagnostic.yaml | 63 ++++++++++++++++--- .../cases/case-006-game-quiz/annotations.yaml | 5 ++ .../cases/case-006-game-quiz/answers.yaml | 6 ++ 
.../cases/case-006-game-quiz/input.yaml | 2 + 4 files changed, 69 insertions(+), 7 deletions(-) create mode 100644 plugins/two-node/evals/cluster-diagnostic/cases/case-006-game-quiz/annotations.yaml create mode 100644 plugins/two-node/evals/cluster-diagnostic/cases/case-006-game-quiz/answers.yaml create mode 100644 plugins/two-node/evals/cluster-diagnostic/cases/case-006-game-quiz/input.yaml diff --git a/plugins/two-node/evals/cluster-diagnostic.yaml b/plugins/two-node/evals/cluster-diagnostic.yaml index 35d8c75a..ec29c31e 100644 --- a/plugins/two-node/evals/cluster-diagnostic.yaml +++ b/plugins/two-node/evals/cluster-diagnostic.yaml @@ -14,7 +14,6 @@ runner: - plugins/two-node models: - skill: claude-opus-4-6 judge: claude-opus-4-6 hook: claude-sonnet-4-6 @@ -44,8 +43,7 @@ dataset: - 'should_reject' (bool): whether the procedure should be rejected (validate mode) Note: diagnose mode is excluded from eval because it requires live SSH - access to cluster nodes. Test validate and recovery-guide modes which - operate on text input against the knowledge base. + access to cluster nodes. inputs: tools: @@ -85,9 +83,9 @@ judges: - name: severity_classification description: | - For validate mode: checks that BLOCKER/WARNING/INFO severity is correctly - assigned. Sequential shutdown and pcs standby must be BLOCKER. ForceOff - must be INFO or WARNING, not BLOCKER. + For validate mode: checks that expected BLOCKER findings are present + and procedures with blockers are rejected. Sequential shutdown and + pcs standby must be BLOCKER. if: "annotations.get('mode') == 'validate'" check: | conversation = outputs.get("conversation", "") @@ -117,6 +115,30 @@ judges: return (True, f"Severity classification correct. Blockers found: {found_blockers}") + - name: warning_classification + description: | + For validate mode: checks that expected WARNING findings are present + in the output. Verifies the skill identifies non-blocking issues. 
+ if: "annotations.get('mode') == 'validate'" + check: | + conversation = outputs.get("conversation", "") + ann = outputs.get("annotations", {}) + expected_warnings = ann.get("expected_warnings", []) + + if not conversation: + return (False, "No conversation output found") + + if not expected_warnings: + return (True, "No warnings expected for this case") + + conv_lower = conversation.lower() + found = [w for w in expected_warnings if w.lower() in conv_lower] + missing = [w for w in expected_warnings if w.lower() not in conv_lower] + + if missing: + return (False, f"Expected warnings not found: {missing}. Found: {found}") + return (True, f"All expected warnings found: {found}") + - name: procedure_completeness description: | For recovery-guide mode: checks that the returned procedure includes @@ -171,13 +193,36 @@ judges: sec_lower = section.lower() if "pcs node standby" in sec_lower and "never" not in sec_lower and "do not" not in sec_lower: forbidden.append("pcs node standby recommended") - if "shutdown -h 1" in sec_lower and "never" not in sec_lower and "do not" not in sec_lower: + if "shutdown -h" in sec_lower and "never" not in sec_lower and "do not" not in sec_lower: forbidden.append("shutdown -h 1 recommended") if forbidden: return (False, f"Forbidden recommendations found: {forbidden}") return (True, "No forbidden procedures recommended") + - name: game_mode_scoring + description: | + For game mode: checks that the skill produces a score and a + final rating (Novice/Operator/Expert/TNF Master). 
+ if: "annotations.get('mode') == 'game'" + check: | + conversation = outputs.get("conversation", "") + + if not conversation: + return (False, "No conversation output found") + + conv_lower = conversation.lower() + ratings = ["novice", "operator", "expert", "tnf master"] + found_rating = [r for r in ratings if r in conv_lower] + + has_score = any(w in conv_lower for w in ["score", "points", "/"]) + + if not found_rating: + return (False, "No rating (Novice/Operator/Expert/TNF Master) found") + if not has_score: + return (False, "No score or points found in output") + return (True, f"Game completed with rating: {found_rating[0]}") + - name: knowledge_base_accuracy description: | LLM judge that evaluates whether the skill's response accurately @@ -210,9 +255,13 @@ thresholds: min_pass_rate: 1.0 severity_classification: min_pass_rate: 0.8 + warning_classification: + min_pass_rate: 0.8 procedure_completeness: min_pass_rate: 0.8 forbidden_recommendations: min_pass_rate: 1.0 + game_mode_scoring: + min_pass_rate: 1.0 knowledge_base_accuracy: min_mean: 3.5 diff --git a/plugins/two-node/evals/cluster-diagnostic/cases/case-006-game-quiz/annotations.yaml b/plugins/two-node/evals/cluster-diagnostic/cases/case-006-game-quiz/annotations.yaml new file mode 100644 index 00000000..3b953d22 --- /dev/null +++ b/plugins/two-node/evals/cluster-diagnostic/cases/case-006-game-quiz/annotations.yaml @@ -0,0 +1,5 @@ +mode: game +expected_blockers: [] +expected_warnings: [] +expected_scenario: null +should_reject: false diff --git a/plugins/two-node/evals/cluster-diagnostic/cases/case-006-game-quiz/answers.yaml b/plugins/two-node/evals/cluster-diagnostic/cases/case-006-game-quiz/answers.yaml new file mode 100644 index 00000000..44678d18 --- /dev/null +++ b/plugins/two-node/evals/cluster-diagnostic/cases/case-006-game-quiz/answers.yaml @@ -0,0 +1,6 @@ +game_mode: quiz +answer_correctly: true +difficulty_guidance: > + Answer TNF knowledge questions accurately based on the + 
cluster-knowledge-base content. Pick the most correct option + for each question. diff --git a/plugins/two-node/evals/cluster-diagnostic/cases/case-006-game-quiz/input.yaml b/plugins/two-node/evals/cluster-diagnostic/cases/case-006-game-quiz/input.yaml new file mode 100644 index 00000000..d7e8057b --- /dev/null +++ b/plugins/two-node/evals/cluster-diagnostic/cases/case-006-game-quiz/input.yaml @@ -0,0 +1,2 @@ +command_input: "game" +mode: game From c5b694bb3e3ecab36c905a08a060c44b3d77600a Mon Sep 17 00:00:00 2001 From: Douglas Hensel Date: Thu, 11 Jun 2026 15:21:14 -0400 Subject: [PATCH 5/7] Reframe evals README from testing to scoring MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Evals score skill quality on a spectrum (1-5), not pass/fail. Update terminology to reflect this: testing→scoring, test cases→scenarios, test input→scenario input. Add game mode to cluster-diagnostic case count. Co-Authored-By: Claude Opus 4.6 --- plugins/two-node/evals/README.md | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/plugins/two-node/evals/README.md b/plugins/two-node/evals/README.md index bf30e5a6..45bfdeb6 100644 --- a/plugins/two-node/evals/README.md +++ b/plugins/two-node/evals/README.md @@ -1,14 +1,18 @@ # Evaluation Configs -Automated quality testing for two-node plugin skills using the +Automated quality scoring for two-node plugin skills using the [agent-eval-harness](https://github.com/opendatahub-io/agent-eval-harness) Claude Code plugin. +Evals measure skill quality on a spectrum (judges score 1-5, not +pass/fail) — they catch regressions and drift, not exact-match +correctness. 
+ ## Available Evals | Config | Skill | Modes Tested | Cases | |--------|-------|--------------|-------| -| `cluster-diagnostic.yaml` | `two-node:cluster-diagnostic` | validate, recovery-guide | 5 | +| `cluster-diagnostic.yaml` | `two-node:cluster-diagnostic` | validate, recovery-guide, game | 6 | | `threat-model-tnf.yaml` | `threat-model:tnf` | PR analysis | 5 | ## Running Locally @@ -39,7 +43,7 @@ evals/ └── / └── cases/ └── case-NNN-/ - ├── input.yaml # Test input + ├── input.yaml # Scenario input └── annotations.yaml # Expected outcomes ``` @@ -50,7 +54,7 @@ evals/ /eval-analyze --skill --config evals/.yaml ``` -2. **Generate test cases** — creates `input.yaml` + `annotations.yaml` per case +2. **Generate scenarios** — creates `input.yaml` + `annotations.yaml` per case ``` /eval-dataset --config evals/.yaml ``` From d7c18572832b9a241b8f16387e366a95e4fa831c Mon Sep 17 00:00:00 2001 From: Douglas Hensel Date: Tue, 16 Jun 2026 09:01:39 -0400 Subject: [PATCH 6/7] Fix eval judge budget threshold and report filename matching Raise cluster-diagnostic budget to $8 to cover full 6-scenario run cost. Relax threat-model report filename match from "THREAT-MODEL" to "THREAT" to handle naming variants (THREAT-REPORT, THREAT-ANALYSIS). 
Co-Authored-By: Claude Opus 4.6 --- plugins/two-node/evals/cluster-diagnostic.yaml | 4 ++-- plugins/two-node/evals/threat-model-tnf.yaml | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/plugins/two-node/evals/cluster-diagnostic.yaml b/plugins/two-node/evals/cluster-diagnostic.yaml index ec29c31e..34449944 100644 --- a/plugins/two-node/evals/cluster-diagnostic.yaml +++ b/plugins/two-node/evals/cluster-diagnostic.yaml @@ -6,7 +6,7 @@ execution: mode: case arguments: "{command_input}" timeout: 300 - max_budget_usd: 3.0 + max_budget_usd: 8.0 runner: type: claude-code @@ -79,7 +79,7 @@ judges: - name: budget_check builtin: cost_budget arguments: - max_cost_usd: 3.0 + max_cost_usd: 8.0 - name: severity_classification description: | diff --git a/plugins/two-node/evals/threat-model-tnf.yaml b/plugins/two-node/evals/threat-model-tnf.yaml index 2d5f3b1b..ae628262 100644 --- a/plugins/two-node/evals/threat-model-tnf.yaml +++ b/plugins/two-node/evals/threat-model-tnf.yaml @@ -88,7 +88,7 @@ judges: description: Verify that the main threat model report markdown file was generated check: | files = outputs.get("files", {}) - reports = [k for k in files if "THREAT-MODEL" in k and k.endswith(".md")] + reports = [k for k in files if "THREAT" in k.upper() and k.endswith(".md")] if not reports: return (False, "No threat model report file found") return (True, f"Report generated: {reports[0]}") @@ -97,7 +97,7 @@ judges: description: Verify all required report sections are present in the generated report check: | files = outputs.get("files", {}) - reports = {k: v for k, v in files.items() if "THREAT-MODEL" in k and k.endswith(".md")} + reports = {k: v for k, v in files.items() if "THREAT" in k.upper() and k.endswith(".md")} if not reports: return (False, "No report file found") content = list(reports.values())[0] @@ -122,7 +122,7 @@ judges: check: | import re files = outputs.get("files", {}) - reports = {k: v for k, v in files.items() if "THREAT-MODEL" in k 
and k.endswith(".md")} + reports = {k: v for k, v in files.items() if "THREAT" in k.upper() and k.endswith(".md")} if not reports: return (False, "No report file found") content = list(reports.values())[0] @@ -137,7 +137,7 @@ judges: check: | import re files = outputs.get("files", {}) - reports = {k: v for k, v in files.items() if "THREAT-MODEL" in k and k.endswith(".md")} + reports = {k: v for k, v in files.items() if "THREAT" in k.upper() and k.endswith(".md")} if not reports: return (False, "No report file found") content = list(reports.values())[0] @@ -154,7 +154,7 @@ judges: check: | import re files = outputs.get("files", {}) - reports = {k: v for k, v in files.items() if "THREAT-MODEL" in k and k.endswith(".md")} + reports = {k: v for k, v in files.items() if "THREAT" in k.upper() and k.endswith(".md")} if not reports: return (False, "No report file found") content = list(reports.values())[0] From 39a8caade0154ab228857a9cb372ebff18ebacff Mon Sep 17 00:00:00 2001 From: Douglas Hensel Date: Thu, 18 Jun 2026 07:47:10 -0400 Subject: [PATCH 7/7] fixing linter errors --- plugins/two-node/evals/README.md | 17 +++++++++++------ plugins/two-node/evals/cluster-diagnostic.md | 4 ++++ plugins/two-node/evals/cluster-diagnostic.yaml | 2 +- plugins/two-node/evals/threat-model-tnf.md | 4 +++- 4 files changed, 19 insertions(+), 8 deletions(-) diff --git a/plugins/two-node/evals/README.md b/plugins/two-node/evals/README.md index 45bfdeb6..ecf3f815 100644 --- a/plugins/two-node/evals/README.md +++ b/plugins/two-node/evals/README.md @@ -36,7 +36,7 @@ The CI workflow is defined in ## Directory Structure -``` +```text evals/ ├── .yaml # Eval config (judges, thresholds, schema) ├── .md # Cached skill analysis @@ -50,27 +50,32 @@ evals/ ## Adding a New Eval 1. **Analyze the skill** — reads SKILL.md, designs judges, writes the eval config - ``` + + ```bash /eval-analyze --skill --config evals/.yaml ``` 2. 
**Generate scenarios** — creates `input.yaml` + `annotations.yaml` per case - ``` + + ```bash /eval-dataset --config evals/.yaml ``` 3. **Run the eval** — executes the skill against each case, scores with judges, generates HTML report - ``` + + ```bash /eval-run --model claude-opus-4-6 --config evals/.yaml ``` 4. **Review results** — walk through cases, collect human feedback - ``` + + ```bash /eval-review --run-id --config evals/.yaml ``` 5. **(Optional) Optimize** — auto-fix SKILL.md based on judge failures, re-run to verify - ``` + + ```bash /eval-optimize --config evals/.yaml ``` diff --git a/plugins/two-node/evals/cluster-diagnostic.md b/plugins/two-node/evals/cluster-diagnostic.md index 345a131a..eb442663 100644 --- a/plugins/two-node/evals/cluster-diagnostic.md +++ b/plugins/two-node/evals/cluster-diagnostic.md @@ -44,11 +44,13 @@ adds complexity. ## Inputs Each test case has `input.yaml` with: + - `command_input`: Full argument string (e.g., `validate "cordon, drain, shutdown"`, `recovery-guide full-shutdown`) - `mode`: Which mode is being tested (`validate`, `recovery-guide`, `game`) And `annotations.yaml` with expected outcomes: + - `expected_blockers`: List of BLOCKER findings expected (validate mode) - `expected_warnings`: List of WARNING findings expected - `expected_scenario`: Scenario name (recovery-guide mode) @@ -73,11 +75,13 @@ All output is conversational — the skill writes nothing to disk. 
Judges use ## Quality Criteria **Deterministic** (code-checkable): + - Severity classification matches knowledge base table - Never recommends pcs standby, sequential shutdown, or shutdown -h - Recovery procedures include bash commands and verification steps **LLM judgment** (requires reasoning): + - Response accurately reflects TNF architecture facts - Failure mode explanations reference correct root causes - Recovery procedures match validated bare metal test results diff --git a/plugins/two-node/evals/cluster-diagnostic.yaml b/plugins/two-node/evals/cluster-diagnostic.yaml index 34449944..02ef9263 100644 --- a/plugins/two-node/evals/cluster-diagnostic.yaml +++ b/plugins/two-node/evals/cluster-diagnostic.yaml @@ -194,7 +194,7 @@ judges: if "pcs node standby" in sec_lower and "never" not in sec_lower and "do not" not in sec_lower: forbidden.append("pcs node standby recommended") if "shutdown -h" in sec_lower and "never" not in sec_lower and "do not" not in sec_lower: - forbidden.append("shutdown -h 1 recommended") + forbidden.append("shutdown -h recommended") if forbidden: return (False, f"Forbidden recommendations found: {forbidden}") diff --git a/plugins/two-node/evals/threat-model-tnf.md b/plugins/two-node/evals/threat-model-tnf.md index 4dc11d23..39a7df7b 100644 --- a/plugins/two-node/evals/threat-model-tnf.md +++ b/plugins/two-node/evals/threat-model-tnf.md @@ -37,7 +37,7 @@ suggested_judges: description: "Cumulative findings tracker was appended" --- -## Skill Analysis +# Skill Analysis The `threat-model:tnf` skill performs security threat analysis on GitHub PRs affecting the TNF (Two-Node Fencing) OpenShift topology. 
It combines three approaches: @@ -86,6 +86,7 @@ It also appends to a cumulative findings tracker at `$WORKSPACE/.claude/skills/t ## Quality Criteria A **good** report: + - Correctly identifies all affected DFD elements from the code paths in the PR - Applies STRIDE systematically to each element (all 6 categories for processes, T/I/D for stores and flows) - Assigns accurate severity levels matching MITRE/OWASP standards @@ -94,6 +95,7 @@ A **good** report: - Maps findings to correct MITRE techniques (T1059 for injection, T1552 for credentials, T1611 for container escape) A **bad** report: + - Misses affected DFD elements or assigns wrong elements to code paths - Has incomplete STRIDE matrix (missing categories or missing rationale) - Over/under-rates severity (e.g., calling a minor code quality issue "Critical")