diff --git a/.archon/commands/e2e-echo-command.md b/.archon/commands/e2e-echo-command.md new file mode 100644 index 0000000000..7d67fa3e2c --- /dev/null +++ b/.archon/commands/e2e-echo-command.md @@ -0,0 +1,13 @@ +--- +description: E2E test command — echoes back the user message +argument-hint: +--- + +# E2E Echo Command + +You are a simple echo agent for testing. Your ONLY job is to repeat back the user's message. + +User message: $ARGUMENTS + +Respond with EXACTLY this format and nothing else: +command-echo: <user message> diff --git a/.archon/scripts/echo-args.py b/.archon/scripts/echo-args.py new file mode 100644 index 0000000000..a4f565218c --- /dev/null +++ b/.archon/scripts/echo-args.py @@ -0,0 +1,7 @@ +"""Simple script node test — echoes input as JSON (uv/Python runtime).""" +import json +import sys +from datetime import datetime, timezone + +input_val = sys.argv[1] if len(sys.argv) > 1 else "no-input" +print(json.dumps({"echoed": input_val, "timestamp": datetime.now(timezone.utc).isoformat()})) diff --git a/.archon/test-fixtures/mcp/e2e-filesystem.json b/.archon/test-fixtures/mcp/e2e-filesystem.json new file mode 100644 index 0000000000..57e9fad3e4 --- /dev/null +++ b/.archon/test-fixtures/mcp/e2e-filesystem.json @@ -0,0 +1,6 @@ +{ + "filesystem": { + "command": "npx", + "args": ["-y", "@modelcontextprotocol/server-filesystem", "/tmp"] + } +} diff --git a/.archon/workflows/defaults/archon-gsd.yaml b/.archon/workflows/defaults/archon-gsd.yaml new file mode 100644 index 0000000000..b1f7bd31bf --- /dev/null +++ b/.archon/workflows/defaults/archon-gsd.yaml @@ -0,0 +1,1216 @@ +name: archon-gsd +description: | + Use when: User wants rigorous spec-driven development inspired by GSD (Get Shit Done). + Triggers: "gsd", "rigorous dev", "spec driven", "gsd workflow", "get shit done", + "structured development with verification", "full rigor".
+ Does: Parallel research (4 agents) -> requirements extraction -> interactive discussion -> + plan with AI checker (revision loop) -> execution with progress tracking -> + goal-backward verification + code review -> human UAT -> PR creation. + NOT for: Quick fixes, simple one-off tasks, PR reviews, bug triage. + Use for substantial features that benefit from structured planning and verification. + + Inspired by GSD (Get Shit Done). Takes a feature description, PRD, or GitHub issue and runs + a full spec-driven pipeline with human approval gates at key decision points: + 1. RESEARCH: 4 parallel agents explore stack, features, architecture, and pitfalls + 2. REQUIREMENTS: Structured extraction with unique IDs, tiered priority + 3. DISCUSS: Interactive session to lock decisions into CONTEXT document + 4. PLAN: Detailed task plan + AI plan-checker revision loop (8 dimensions, max 3 cycles) + 5. EXECUTE: Ralph-style fresh-context implementation with per-task validation + 6. VERIFY: Goal-backward verification + code review (parallel) + 7. UAT: Human acceptance testing with iterative fix loop + 8. FINALIZE: Push, create PR, summary + +provider: claude +interactive: true + +nodes: + # ================================================================= + # PHASE 1: PARALLEL RESEARCH + # Four agents explore the codebase from different angles simultaneously. + # Each writes findings to $ARTIFACTS_DIR/research/ for persistence. + # ================================================================= + + - id: research-stack + model: sonnet + context: fresh + prompt: | + # GSD Research Agent: Stack & Technology + + You are one of four parallel research agents. Your focus: **technology stack, + dependencies, and development environment**. + + **User's request**: $ARGUMENTS + + ## Your Mission + + Explore the codebase and produce a focused research report on: + + 1. **Language & Runtime** -- What language(s), runtime version, build tools? + 2. 
**Package Manager & Dependencies** -- What key deps are installed? Versions? + 3. **Framework & Libraries** -- Web framework, ORM, test framework, etc. + 4. **Dev Environment** -- How to run, test, lint, build? What scripts exist? + 5. **CI/CD** -- Any CI config? What does the pipeline look like? + 6. **Relevance to Request** -- How does the stack constrain or enable what the user wants? + + ## How to Research + + 1. Read CLAUDE.md (or README.md) for project overview + 2. Read package.json / Cargo.toml / pyproject.toml / go.mod (whatever applies) + 3. Check for config files: tsconfig, eslint, prettier, docker, CI configs + 4. Note exact versions of critical dependencies + + ## Output + + Write your findings to `$ARTIFACTS_DIR/research/stack.md`: + + ```bash + mkdir -p "$ARTIFACTS_DIR/research" + ``` + + Format as a structured markdown document with the sections above. + Include exact file paths and version numbers. Be specific, not generic. + + Also output a brief summary (5-10 lines) to stdout for the synthesis agent. + + - id: research-features + model: sonnet + context: fresh + prompt: | + # GSD Research Agent: Features & Existing Functionality + + You are one of four parallel research agents. Your focus: **existing features, + functionality, and how they relate to the user's request**. + + **User's request**: $ARGUMENTS + + ## Your Mission + + Explore the codebase and produce a focused research report on: + + 1. **Existing Features** -- What does the application currently do? + 2. **Related Functionality** -- What existing code is closest to what the user wants? + 3. **Data Model** -- What data structures, DB tables, API endpoints already exist? + 4. **UI Components** -- What UI exists? Pages, components, patterns? + 5. **Extension Points** -- Where can existing code be extended rather than replaced? + 6. **Gaps** -- What's missing that the user's request requires? + + ## How to Research + + 1. Read CLAUDE.md for architecture overview + 2. 
Search for code related to the user's request (grep for keywords) + 3. Read the most relevant files thoroughly -- note exact function names, types, exports + 4. Check for existing tests that cover related functionality + 5. Map out the data flow for the closest existing feature + + ## Output + + Write your findings to `$ARTIFACTS_DIR/research/features.md`: + + ```bash + mkdir -p "$ARTIFACTS_DIR/research" + ``` + + Format with exact file:line references. For each existing feature found, note: + - Where it lives (file paths) + - What it does (brief description) + - How it could be reused or extended for the new request + + Also output a brief summary (5-10 lines) to stdout for the synthesis agent. + + - id: research-architecture + model: sonnet + context: fresh + prompt: | + # GSD Research Agent: Architecture & Patterns + + You are one of four parallel research agents. Your focus: **code architecture, + design patterns, and structural conventions**. + + **User's request**: $ARGUMENTS + + ## Your Mission + + Explore the codebase and produce a focused research report on: + + 1. **Project Structure** -- Directory layout, module organization, package boundaries + 2. **Design Patterns** -- What patterns are used? (DI, MVC, repository, etc.) + 3. **Coding Conventions** -- Naming, file organization, import patterns, error handling + 4. **Interfaces & Abstractions** -- Key interfaces, type patterns, extension points + 5. **Data Flow** -- How does data move through the system? Request lifecycle? + 6. **Architectural Constraints** -- What rules must new code follow? + + ## How to Research + + 1. Read CLAUDE.md for explicit architecture rules + 2. Study the directory structure (`ls` key directories) + 3. Read 2-3 representative files to understand patterns + 4. Look for interfaces/types that define contracts + 5. Check for dependency injection patterns, service registration, etc. 
+ + ## Output + + Write your findings to `$ARTIFACTS_DIR/research/architecture.md`: + + ```bash + mkdir -p "$ARTIFACTS_DIR/research" + ``` + + Include a "Patterns to Follow" section with concrete code snippets from the codebase + that new code should mirror. Note anti-patterns to avoid. + + Also output a brief summary (5-10 lines) to stdout for the synthesis agent. + + - id: research-pitfalls + model: sonnet + context: fresh + prompt: | + # GSD Research Agent: Pitfalls & Risks + + You are one of four parallel research agents. Your focus: **potential problems, + risks, gotchas, and things that could go wrong**. + + **User's request**: $ARGUMENTS + + ## Your Mission + + Explore the codebase and produce a focused research report on: + + 1. **Known Issues** -- Check git log for recent bug fixes in related areas + 2. **Technical Debt** -- Any TODO/FIXME/HACK comments in relevant code? + 3. **Fragile Areas** -- Code with complex logic, many dependencies, or poor test coverage + 4. **Breaking Change Risk** -- What existing functionality could break? + 5. **Performance Concerns** -- Any hot paths, N+1 queries, large data sets? + 6. **Security Considerations** -- Auth, input validation, data exposure risks + 7. **Testing Gaps** -- Areas with no tests that the change would touch + + ## How to Research + + 1. `git log --oneline -30` for recent changes and bug fixes + 2. `grep -r "TODO\|FIXME\|HACK\|XXX"` in relevant directories + 3. Look for complex functions (long, deeply nested, many parameters) + 4. Check test coverage -- which files have test files, which don't? + 5. Review error handling patterns in the areas that would change + + ## Output + + Write your findings to `$ARTIFACTS_DIR/research/pitfalls.md`: + + ```bash + mkdir -p "$ARTIFACTS_DIR/research" + ``` + + Rank pitfalls by severity (CRITICAL / HIGH / MEDIUM / LOW). + For each, suggest a specific mitigation strategy. + + Also output a brief summary (5-10 lines) to stdout for the synthesis agent. 
+ + # ================================================================= + # RESEARCH SYNTHESIS + # Combines all four research reports into a unified summary. + # ================================================================= + + - id: synthesize-research + model: sonnet + depends_on: [research-stack, research-features, research-architecture, research-pitfalls] + context: fresh + prompt: | + # GSD Research Synthesis + + Four research agents explored the codebase in parallel. Synthesize their findings + into a unified research summary. + + ## Research Outputs + + **Stack Research**: $research-stack.output + **Features Research**: $research-features.output + **Architecture Research**: $research-architecture.output + **Pitfalls Research**: $research-pitfalls.output + + ## Your Task + + 1. Read all four detailed reports from `$ARTIFACTS_DIR/research/`: + - `$ARTIFACTS_DIR/research/stack.md` + - `$ARTIFACTS_DIR/research/features.md` + - `$ARTIFACTS_DIR/research/architecture.md` + - `$ARTIFACTS_DIR/research/pitfalls.md` + + 2. Synthesize into `$ARTIFACTS_DIR/research/SUMMARY.md` with these sections: + + ```markdown + # Research Summary + + ## User's Request + {Restated understanding of what the user wants} + + ## Key Findings + - {Most important discovery from each research area} + + ## What Already Exists + - {Existing code that can be extended -- file paths} + + ## Recommended Approach + - {High-level approach based on all research} + - {Prefer extending existing code over creating new} + + ## Critical Risks + - {Top 3-5 risks with mitigations} + + ## Open Questions + - {Questions that need user input before planning} + ``` + + 3. Output the full summary to stdout so downstream nodes can access it. + + # ================================================================= + # PHASE 2: REQUIREMENTS + # Extract structured requirements from the user's request + research. 
+ # ================================================================= + + - id: requirements + model: sonnet + depends_on: [synthesize-research] + context: fresh + prompt: | + # GSD Requirements Extraction + + Extract structured requirements from the user's request and research findings. + + **User's request**: $ARGUMENTS + **Research summary**: $synthesize-research.output + + ## Your Task + + 1. Read the full research summary from `$ARTIFACTS_DIR/research/SUMMARY.md` + 2. Read any referenced research files for detail + + 3. Write `$ARTIFACTS_DIR/requirements.md` with this structure: + + ```markdown + # Requirements + + ## Overview + {One paragraph summary of what we're building and why} + + ## V1 Requirements (Must Have) + | ID | Requirement | Source | Acceptance Criteria | + |----|------------|--------|---------------------| + | REQ-01 | {requirement} | {user request / research finding} | {testable criterion} | + | REQ-02 | ... | ... | ... | + + ## V2 Requirements (Future) + | ID | Requirement | Rationale for Deferral | + |----|------------|----------------------| + | REQ-F01 | {requirement} | {why not in v1} | + + ## Out of Scope + - {Explicitly excluded items and why} + + ## Constraints + - {Technical constraints from stack research} + - {Architectural constraints from architecture research} + - {Risk constraints from pitfalls research} + + ## Success Criteria + - [ ] {Specific, testable criterion mapped to REQ-XX} + - [ ] All validation passes + - [ ] No regressions in existing tests + ``` + + 4. Present the requirements to the user. Ask them to review: + - Are the V1 requirements correct and complete? + - Should anything move between V1 / V2 / Out of Scope? + - Are the acceptance criteria testable? + - Any missing constraints? 
+ + # ================================================================= + # GATE: Requirements Approval + # ================================================================= + + - id: requirements-gate + approval: + message: | + Review the requirements above. + - Are the V1 requirements correct and complete? + - Should anything move between tiers? + - Approve to proceed to discussion phase, or provide feedback to revise. + capture_response: true + depends_on: [requirements] + + # ================================================================= + # PHASE 3: DISCUSS — Lock Decisions + # Interactive session where user and AI discuss approach and lock + # key decisions into a CONTEXT document. + # ================================================================= + + - id: discuss + depends_on: [requirements-gate] + loop: + prompt: | + # GSD Discussion Phase + + You are in an interactive discussion to lock down key decisions before planning. + Your goal: resolve all ambiguity and record decisions in a CONTEXT document. + + **User's request**: $ARGUMENTS + **User's requirements feedback**: $requirements-gate.output + **User's latest input**: $LOOP_USER_INPUT + + --- + + ## If this is the FIRST iteration (no user input yet): + + 1. Read the requirements: `$ARTIFACTS_DIR/requirements.md` + 2. Read the research summary: `$ARTIFACTS_DIR/research/SUMMARY.md` + 3. If the user provided feedback at the requirements gate, incorporate it: + update `$ARTIFACTS_DIR/requirements.md` with any changes. + + 4. Create `$ARTIFACTS_DIR/context.md` with this initial structure: + + ```markdown + # Context & Decisions + + ## Locked Decisions + {None yet -- decisions will be recorded as the discussion progresses} + + ## Open Questions + - {Question 1 from research summary} + - {Question 2} + + ## Deferred Ideas + {Ideas acknowledged but explicitly deferred to future work} + ``` + + 5. Present the open questions and key architectural decisions that need resolution. 
+ For each decision, present options with concrete tradeoffs (not just "option A vs B" + but what each means for the codebase with file:line references). + + ## If the user has provided input (subsequent iterations): + + 1. Read `$ARTIFACTS_DIR/context.md` and `$ARTIFACTS_DIR/requirements.md` + 2. Process the user's input: + - If they made a decision -> Record it under "Locked Decisions" with rationale + - If they asked a question -> Research the answer with evidence from the codebase + - If they deferred something -> Move it to "Deferred Ideas" + - If they changed requirements -> Update `$ARTIFACTS_DIR/requirements.md` + + 3. Update `$ARTIFACTS_DIR/context.md` with new decisions. + + 4. Present remaining open questions, or if all questions are resolved: + + ``` + ## All Decisions Locked + + Locked decisions: + - D-01: {decision} -- {rationale} + - D-02: {decision} -- {rationale} + + Open questions: None remaining. + + Say "ready" to proceed to the planning phase. + ``` + + **CRITICAL**: NEVER output CONTEXT_LOCKED unless the user's + LATEST message contains an EXPLICIT phrase like "ready", "proceed", "let's plan", + or "go ahead". Questions, feedback, and decisions are NOT approval signals. + + until: CONTEXT_LOCKED + max_iterations: 15 + interactive: true + gate_message: | + Answer the questions above, make decisions, ask for more exploration, + or say "ready" when all decisions are locked and you want to proceed to planning. + + # ================================================================= + # PHASE 4a: PLAN — Create Detailed Plan + # Creates a GSD-style task plan with verification criteria. + # ================================================================= + + - id: create-plan + model: sonnet + depends_on: [discuss] + context: fresh + prompt: | + # GSD Plan Creation + + Create a detailed implementation plan from the locked requirements and decisions. 
+ This plan will be verified by a plan-checker agent and then executed by an + implementation agent with NO prior context -- it must be completely self-contained. + + **User's request**: $ARGUMENTS + + ## Step 1: Load All Context + + Read these files (they contain all decisions from prior phases): + - `$ARTIFACTS_DIR/requirements.md` -- what to build + - `$ARTIFACTS_DIR/context.md` -- locked decisions + - `$ARTIFACTS_DIR/research/SUMMARY.md` -- research findings + - `$ARTIFACTS_DIR/research/architecture.md` -- patterns to follow + - `$ARTIFACTS_DIR/research/pitfalls.md` -- risks to mitigate + + Also read CLAUDE.md for project conventions. + + ## Step 2: Read Every File You'll Reference + + Before writing the plan, read EVERY file you plan to reference: + - Files that will be modified + - Pattern files to follow + - Test files for test patterns + - Config files that affect the work + + Verify: file paths exist, function names are correct, patterns match reality. + + ## Step 3: Write the Plan + + Write to `$ARTIFACTS_DIR/plan.md` using this structure: + + ```markdown + # Implementation Plan + + ## Overview + {What we're building, why, and key decisions from CONTEXT} + + ## Requirements Coverage + | REQ ID | Requirement | Covered By Task(s) | + |--------|------------|-------------------| + | REQ-01 | {text} | Task 1, Task 3 | + + ## Task List + + ### Task 1: {ACTION VERB} {specific target} + **Type**: auto + **Files**: {files to create or modify} + **Read First**: {files to read for context/patterns} + **Action**: {Concrete, specific implementation steps -- no vague language. + Include exact function signatures, type definitions, import paths.} + **Verify**: {Specific command to verify this task works} + **Acceptance Criteria**: + - {grep-verifiable or test-verifiable condition} + **Requirements**: REQ-01, REQ-03 + + ### Task 2: ... 
+ + ## Wave Grouping + | Wave | Tasks | Rationale | + |------|-------|-----------| + | 1 | Task 1, Task 2 | {No dependencies between them} | + | 2 | Task 3 | {Depends on Task 1 output} | + + ## Testing Strategy + | Test File | Test Cases | Validates | + |-----------|-----------|-----------| + | {path} | {cases} | REQ-XX | + + ## Validation Commands + 1. Type check: {command} + 2. Lint: {command} + 3. Tests: {command} + 4. Full validation: {command} + + ## Risks & Mitigations + | Risk | Impact | Mitigation | From | + |------|--------|------------|------| + | {risk} | HIGH/MED/LOW | {strategy} | pitfalls research | + ``` + + ## Plan Quality Rules + + - Every V1 requirement MUST be covered by at least one task + - Every task MUST have a Verify command and Acceptance Criteria + - No vague actions: "align X with Y", "update as needed", "ensure consistency" are BANNED + - Every file path MUST be verified by reading the file + - Tasks should be ordered by dependency (wave grouping) + - Each task must be completable in a single agent session + + ## Step 4: Source Coverage Audit + + After writing the plan, verify coverage: + - Every REQ-XX from requirements.md has a covering task + - Every locked decision from context.md is reflected in the plan + - Every critical risk from pitfalls has a mitigation in the plan + + If ANY item is uncovered, add tasks to cover it. Do NOT proceed with gaps. + + ## Step 5: Output + + Output the plan summary: task count, wave count, requirements coverage percentage. + + # ================================================================= + # PHASE 4b: PLAN CHECKER — Verify and Revise + # AI-driven plan verification loop inspired by GSD's 8-dimension checker. + # Each iteration: check the plan, if issues found revise it, re-check. + # Max 3 check-revise cycles. Signals completion when plan passes. 
+ # ================================================================= + + - id: check-and-revise + depends_on: [create-plan] + loop: + prompt: | + # GSD Plan Checker & Reviser + + You are a plan quality gatekeeper. Your job: verify the plan against 8 dimensions, + revise if issues are found, and signal completion only when the plan passes. + + **User's request**: $ARGUMENTS + + ## Step 1: Load State + + Read the plan and check history: + - `$ARTIFACTS_DIR/plan.md` -- the current plan + - `$ARTIFACTS_DIR/requirements.md` -- requirements to verify against + - `$ARTIFACTS_DIR/context.md` -- locked decisions to verify against + - `$ARTIFACTS_DIR/plan-check.md` -- previous check results (may not exist yet) + - Read CLAUDE.md for project conventions + + ## Step 2: Check Against 8 Dimensions + + Evaluate the plan on each dimension. For each, assign PASS or FAIL with evidence: + + ### Dimension 1: Requirement Coverage + Every V1 REQ-XX has at least one task covering it. No orphan requirements. + + ### Dimension 2: Task Completeness + Every task has: Files, Read First, Action, Verify, Acceptance Criteria. + No empty or placeholder fields. + + ### Dimension 3: Task Specificity + No vague actions. Ban: "align", "ensure", "update as needed", "make consistent", + "refactor", "clean up" without concrete steps. Every action must specify WHAT to + write/change with enough detail for an agent with zero context. + + ### Dimension 4: Dependency Ordering + Tasks are sequenced correctly. No task references output from a later task. + Wave grouping respects dependencies. + + ### Dimension 5: File Scope + No excessive overlap between tasks modifying the same file. + If multiple tasks touch one file, the ordering is clear and non-conflicting. + + ### Dimension 6: Context Fit + Each task is completable in a single agent session. No mega-tasks that + would require splitting. Estimated complexity is reasonable. + + ### Dimension 7: Gap Detection + No missing implementation steps. 
The plan, executed in order, produces a + working feature. Look for: missing imports, missing type definitions, + missing test setup, missing config changes. + + ### Dimension 8: Verification Coverage + Every task has a concrete Verify command. The testing strategy covers + all requirements. Acceptance criteria are grep-verifiable or test-verifiable. + + ## Step 3: Write Check Results + + Write results to `$ARTIFACTS_DIR/plan-check.md`: + + ```markdown + # Plan Check - Iteration {N} + + ## Results + | Dimension | Status | Evidence | + |-----------|--------|----------| + | Requirement Coverage | PASS/FAIL | {detail} | + | ... | ... | ... | + + ## Issues Found + - severity: blocker/warning + dimension: {N} + description: {what's wrong} + task: {which task} + fix_hint: {how to fix} + + ## Verdict: PASSED / ISSUES FOUND + ``` + + ## Step 4: Act on Results + + **If all 8 dimensions PASS:** + - Write "## VERIFICATION PASSED" to plan-check.md + - Output: "Plan verified across all 8 dimensions." + - Signal: PLAN_VERIFIED + + **If any dimension FAILS:** + - Read the plan file + - For each issue, apply the fix directly to `$ARTIFACTS_DIR/plan.md` + - Log what was changed + - Do NOT signal completion -- the next iteration will re-check + + **Stall detection:** If the issue count is not decreasing compared to + the previous check (read from plan-check.md), note this in the output. + After 3 cycles without improvement, signal PLAN_VERIFIED + anyway with a warning about remaining issues. + + until: PLAN_VERIFIED + max_iterations: 6 + fresh_context: true + + # ================================================================= + # GATE: Plan Approval + # Human reviews the verified plan before execution begins. + # ================================================================= + + - id: plan-gate + approval: + message: | + The plan has been verified by the AI plan-checker. + Review the plan at $ARTIFACTS_DIR/plan.md and the check results at $ARTIFACTS_DIR/plan-check.md. 
+ Approve to begin execution, or provide feedback to revise. + capture_response: true + depends_on: [check-and-revise] + + # ================================================================= + # PHASE 5a: EXECUTE — Setup + # Incorporate any human feedback from the plan gate, prepare environment. + # ================================================================= + + - id: execute-setup + depends_on: [plan-gate] + bash: | + set -eo pipefail # pipefail: a failed install must not be hidden by the exit status of "| tail -3" + + # Read plan + PLAN_FILE="$ARTIFACTS_DIR/plan.md" + if [ ! -f "$PLAN_FILE" ]; then + echo "ERROR: No plan file found at $PLAN_FILE" + exit 1 + fi + + # Install dependencies if needed + if [ -f "bun.lock" ] || [ -f "bun.lockb" ]; then + echo "Installing dependencies..." + bun install --frozen-lockfile 2>&1 | tail -3 + elif [ -f "package-lock.json" ]; then + npm ci 2>&1 | tail -3 + elif [ -f "yarn.lock" ]; then + yarn install --frozen-lockfile 2>&1 | tail -3 + elif [ -f "pnpm-lock.yaml" ]; then + pnpm install --frozen-lockfile 2>&1 | tail -3 + fi + + # Initialize progress tracking + if [ ! -f "$ARTIFACTS_DIR/progress.txt" ]; then + echo "# GSD Execution Progress" > "$ARTIFACTS_DIR/progress.txt" + echo "Started: $(date -u +%Y-%m-%dT%H:%M:%SZ)" >> "$ARTIFACTS_DIR/progress.txt" + echo "---" >> "$ARTIFACTS_DIR/progress.txt" + fi + + echo "BRANCH=$(git branch --show-current)" + echo "GIT_ROOT=$(git rev-parse --show-toplevel)" + echo "PLAN_FILE=$PLAN_FILE" + + echo "=== PLAN_START ===" + cat "$PLAN_FILE" + echo "" + echo "=== PLAN_END ===" + + TASK_COUNT=$(grep -c "^### Task [0-9]" "$PLAN_FILE" || true) + echo "TASK_COUNT=${TASK_COUNT:-0}" + + # Show human feedback if any + echo "=== HUMAN_FEEDBACK ===" + echo "$PLAN_GATE_OUTPUT" + echo "=== END_FEEDBACK ===" + + # ================================================================= + # PHASE 5b: EXECUTE — Task-by-Task Loop (Ralph Pattern) + # Fresh context each iteration. Reads plan from disk. One task per + # iteration. Validates before committing. Tracks progress.
+ # ================================================================= + + - id: execute + depends_on: [execute-setup] + idle_timeout: 600000 + model: claude-opus-4-6[1m] + loop: + prompt: | + # GSD Execution Agent + + You are an autonomous coding agent in a FRESH session -- no memory of previous + iterations. Your job: read the plan, implement ONE task, validate, commit, track. + + **Golden Rule**: If validation fails, fix it before committing. Never commit broken code. + + --- + + ## Phase 0: CONTEXT -- Load State + + Setup context: + $execute-setup.output + + **User's original request**: $ARGUMENTS + + --- + + ### 0.1 Read Current State (from disk -- not from context above) + + The context above is a snapshot from before the loop started. Previous iterations + may have changed things. **You MUST re-read from disk:** + + 1. Read the plan file at `$ARTIFACTS_DIR/plan.md` + 2. Read progress tracking at `$ARTIFACTS_DIR/progress.txt` + 3. Read CLAUDE.md for project conventions + 4. Read `$ARTIFACTS_DIR/context.md` for locked decisions + 5. Check git state: `git log --oneline -10` and `git status` + + ### 0.2 Determine What's Done + + Cross-reference: + - Tasks marked COMPLETED in progress.txt + - Git commits from previous iterations + - Files that already exist / have been modified + + --- + + ## Phase 1: SELECT -- Pick Next Task + + From the plan, identify tasks by `### Task N:` headers. + Find the first task NOT marked COMPLETED in progress.txt. + + **If ALL tasks are complete** -> Skip to Phase 5 (Completion). + + Announce: + ``` + -- Task Selected ------------------------------------------------ + Task: {N} -- {task title} + Action: {CREATE / UPDATE} + File: {file path} + ----------------------------------------------------------------- + ``` + + --- + + ## Phase 2: IMPLEMENT -- Execute the Task + + 1. Read every file listed in "Read First" + 2. Read the file you're about to change (if it exists) + 3. Make changes following the plan EXACTLY + 4. 
Follow the patterns shown in the task's "Read First" files and in the "Patterns to Follow" section of `$ARTIFACTS_DIR/research/architecture.md` + 5. Type-check after each file change if applicable + + --- + + ## Phase 3: VALIDATE -- Verify the Task + + Run the task's specific Verify command from the plan. + Then run the project's general validation: + + ```bash + # Task-specific verify first, then general validation + bun run type-check 2>&1 || true + bun run lint 2>&1 || true + bun run test 2>&1 || true + ``` + + If validation fails: diagnose, fix, re-run (up to 3 attempts). + If unfixable after 3 attempts: note in progress tracking, do NOT commit broken code. + + --- + + ## Phase 4: COMMIT -- Save Changes + + ```bash + git add -A + git diff --cached --stat + git commit -m "$(cat <<'COMMITEOF' + {type}({scope}): {task description} + + GSD Task {N}: {brief details} + Requirements: {REQ-XX IDs covered} + COMMITEOF + )" + ``` + + Update progress tracking in `$ARTIFACTS_DIR/progress.txt`: + ``` + ## Task {N}: {title} -- COMPLETED + Date: {ISO date} + Files: {list of files changed} + Commit: {short hash} + Verify: {PASS/FAIL} + --- + ``` + + --- + + ## Phase 5: COMPLETE -- Check All Tasks + + If ALL tasks are done: + 1. Run full validation suite + 2. Push: `git push -u origin HEAD` + 3. Write final status to progress.txt + 4. Signal: ALL_TASKS_COMPLETE + + If tasks remain: report status and end normally. The loop starts a fresh iteration. + + until: ALL_TASKS_COMPLETE + max_iterations: 20 + fresh_context: true + + # ================================================================= + # PHASE 6a: VERIFY -- Goal-Backward Verification + # Verifies the implementation achieves the GOAL, not just that tasks + # were completed. This is GSD's key insight: verify from the goal + # backward, not from the task list forward. + # ================================================================= + + - id: verify-goals + model: sonnet + depends_on: [execute] + context: fresh + prompt: | + # GSD Goal-Backward Verification + + You are a verification agent.
Your job is NOT to check if tasks were completed -- + it's to verify that the GOAL was achieved. Work backward from the requirements, + not forward from the task list. + + **User's original request**: $ARGUMENTS + + ## Step 1: Load Context + + Read these files: + - `$ARTIFACTS_DIR/requirements.md` -- what we set out to build + - `$ARTIFACTS_DIR/context.md` -- decisions that constrain the solution + - `$ARTIFACTS_DIR/plan.md` -- what was planned + - `$ARTIFACTS_DIR/progress.txt` -- what was executed + - CLAUDE.md for project conventions + + ## Step 2: Three-Level Artifact Verification + + For EACH file that was created or modified (from progress.txt): + + ### Level 1: EXISTS + Does the file exist? `test -f {path}` + + ### Level 2: SUBSTANTIVE + Is it a real implementation, not a stub? + - Read the file + - Check: minimum meaningful content, expected patterns present + - No placeholder TODOs, no empty function bodies, no "implement me" comments + + ### Level 3: WIRED + Is it connected to the rest of the system? + - Is it imported by other code? `grep -r "import.*{name}" --include="*.ts"` + - Is it registered/configured where needed? + - Can the feature be reached by a user action? + + Produce a status matrix: + ``` + | File | Exists | Substantive | Wired | Status | + |------|--------|-------------|-------|--------| + | {path} | YES/NO | YES/NO | YES/NO | VERIFIED / ORPHANED / STUB / MISSING | + ``` + + ## Step 3: Requirement Verification + + For EACH V1 requirement in requirements.md: + - Is it satisfied by the implementation? (Read the actual code, don't trust progress.txt) + - Can you prove it with a specific test or grep? 
+ + ## Step 4: Behavioral Verification + + Run the full test suite: + ```bash + bun run validate 2>&1 || (bun run type-check && bun run lint && bun run test) + ``` + + ## Step 5: Write Verification Report + + Write to `$ARTIFACTS_DIR/verification.md`: + + ```markdown + # Verification Report + + ## Goal Achievement + | Requirement | Status | Evidence | + |-------------|--------|----------| + | REQ-01 | VERIFIED / PARTIAL / MISSING | {proof} | + + ## Artifact Matrix + {The status matrix from Step 2} + + ## Behavioral Tests + - Test suite: PASS / FAIL + - Type check: PASS / FAIL + - Lint: PASS / FAIL + + ## Gaps Found + {List any gaps, or "No gaps found"} + + ## Verdict: PASSED / GAPS_FOUND + ``` + + Output the full verification report to stdout. + + # ================================================================= + # PHASE 6b: CODE REVIEW (parallel with goal verification) + # ================================================================= + + - id: code-review + model: sonnet + depends_on: [execute] + context: fresh + prompt: | + # GSD Code Review + + Review all code changes for quality, security, and convention compliance. + + **User's original request**: $ARGUMENTS + + ## Step 1: Gather Changes + + ```bash + git log --oneline --no-merges $(git merge-base HEAD $BASE_BRANCH)..HEAD + git diff --stat $(git merge-base HEAD $BASE_BRANCH)..HEAD + git diff $(git merge-base HEAD $BASE_BRANCH)..HEAD + ``` + + Read CLAUDE.md for project conventions. + Read `$ARTIFACTS_DIR/plan.md` for intent context. + + ## Step 2: Review Each Changed File + + For each file in the diff: + + 1. **Convention Compliance** -- Does it follow patterns from CLAUDE.md? + 2. **Type Safety** -- Proper types? No unnecessary `any`? Correct interfaces? + 3. **Error Handling** -- Errors caught and handled? No silent swallowing? + 4. **Security** -- Input validation? No injection risks? Auth checks present? + 5. **Testing** -- New code has tests? Tests are meaningful (not just coverage)? + 6. 
**Performance** -- No N+1 queries? No unnecessary work? Efficient algorithms? + 7. **Naming** -- Clear, consistent naming following project conventions? + + ## Step 3: Fix Critical Issues + + If you find CRITICAL or HIGH severity issues: + - Fix them directly (edit the files) + - Run validation after fixes + - Commit fixes: + ```bash + git add -A && git commit -m "fix: address code review findings" 2>/dev/null || true + ``` + + ## Step 4: Write Review Report + + Write to `$ARTIFACTS_DIR/review.md`: + + ```markdown + # Code Review Report + + ## Summary + - Files reviewed: {count} + - Issues found: {count by severity} + - Issues fixed: {count} + + ## Findings + | Severity | File | Issue | Status | + |----------|------|-------|--------| + | CRITICAL | {path:line} | {description} | FIXED / REMAINING | + | HIGH | ... | ... | ... | + | MEDIUM | ... | ... | ... | + + ## Convention Compliance + {Assessment of CLAUDE.md adherence} + + ## Recommendation + {READY / NEEDS_FIXES with specific remaining items} + ``` + + Output the full review report to stdout. + + # ================================================================= + # GATE: User Acceptance Testing (UAT) + # Human reviews verification + code review results and tests manually. + # ================================================================= + + - id: uat-gate + approval: + message: | + Goal verification and code review are complete. + Review the reports at: + - $ARTIFACTS_DIR/verification.md (goal-backward verification) + - $ARTIFACTS_DIR/review.md (code review) + + Test the implementation yourself. Approve if satisfied, or describe issues to fix. + capture_response: true + depends_on: [verify-goals, code-review] + + # ================================================================= + # PHASE 7: UAT FIX LOOP + # Address human feedback from UAT. Iterates until human approves. 
+ # ================================================================= + + - id: fix-uat + depends_on: [uat-gate] + loop: + prompt: | + # GSD UAT Fix Loop + + The human has tested the implementation and provided feedback. + + **Human's feedback**: $LOOP_USER_INPUT + **UAT gate response**: $uat-gate.output + + --- + + ## Step 1: Read Context + + - Read `$ARTIFACTS_DIR/plan.md` for original intent + - Read `$ARTIFACTS_DIR/requirements.md` for acceptance criteria + - Read CLAUDE.md for conventions + + ## Step 2: Process Feedback + + **If there is no user feedback yet** (first iteration): + - Read `$ARTIFACTS_DIR/verification.md` and `$ARTIFACTS_DIR/review.md` + - Present a summary of the verification and review results + - If the UAT gate response contains specific feedback, treat it as the first round + - If the gate response is just "approved" or similar, signal completion: + UAT_PASSED + - Otherwise, address the feedback and report what was fixed + + **If the user EXPLICITLY approved** ("approved", "looks good", "ship it"): + - Output: "UAT passed. Proceeding to finalize." + - Signal: UAT_PASSED + + **If the user provided specific issues:** + 1. Read the relevant files + 2. Fix each issue + 3. Run validation: + ```bash + bun run validate 2>&1 || (bun run type-check && bun run lint && bun run test) + ``` + 4. Commit fixes: + ```bash + git add -A && git commit -m "$(cat <<'EOF' + fix: address UAT feedback + + Changes: + - {fix 1} + - {fix 2} + EOF + )" 2>/dev/null || true + ``` + + **CRITICAL**: NEVER emit UAT_PASSED unless the user's + latest message EXPLICITLY approves. Bug reports and feedback are NOT approval. + + ## Step 3: Report + + ``` + ## UAT Feedback Addressed + + Changes made: + - {fix 1} + - {fix 2} + + Validation: {PASS / FAIL} + + Test again and approve, or provide more feedback. + ``` + until: UAT_PASSED + max_iterations: 10 + interactive: true + gate_message: | + Test the fixes. Approve if satisfied, or describe remaining issues. 
+ + # ================================================================= + # PHASE 8: FINALIZE — Push, PR, Summary + # ================================================================= + + - id: finalize + model: sonnet + depends_on: [fix-uat] + context: fresh + prompt: | + # GSD Finalize + + The implementation has passed UAT. Push changes and create a PR. + + **User's original request**: $ARGUMENTS + + ## Step 1: Final Validation + + ```bash + bun run validate 2>&1 || (bun run type-check && bun run lint && bun run test && bun run format:check) + ``` + + ## Step 2: Push + + ```bash + git push -u origin HEAD 2>&1 || true + ``` + + ## Step 3: Gather Summary Data + + ```bash + git log --oneline --no-merges $(git merge-base HEAD $BASE_BRANCH)..HEAD + git diff --stat $(git merge-base HEAD $BASE_BRANCH)..HEAD + ``` + + Read these for the PR body: + - `$ARTIFACTS_DIR/requirements.md` + - `$ARTIFACTS_DIR/plan.md` + - `$ARTIFACTS_DIR/verification.md` + - `$ARTIFACTS_DIR/review.md` + - `$ARTIFACTS_DIR/progress.txt` + + ## Step 4: Create PR + + ```bash + gh pr view HEAD --json url 2>/dev/null || echo "NO_PR" + ``` + + If no PR exists, check for a PR template: + ```bash + cat .github/pull_request_template.md 2>/dev/null || echo "NO_TEMPLATE" + ``` + + Create the PR with `gh pr create --draft --base $BASE_BRANCH`. + The body should include: + - Summary of what was built + - Requirements coverage table + - Verification results + - Key decisions from context.md + - Files changed summary + + Use a HEREDOC for the body. 
+ + ## Step 5: Output Final Report + + ``` + =============================================================== + GSD WORKFLOW -- COMPLETE + =============================================================== + + Feature: {from plan} + Branch: {branch name} + PR: {url} + + -- Requirements Coverage ------------------------------------------ + {table from verification.md} + + -- Tasks Completed ------------------------------------------------ + {from progress.txt} + + -- Commits --------------------------------------------------------- + {git log output} + + -- Files Changed --------------------------------------------------- + {git diff --stat output} + + -- Verification ---------------------------------------------------- + Goal verification: {PASSED/GAPS_FOUND} + Code review: {READY/NEEDS_FIXES} + UAT: PASSED + + -- Artifacts ------------------------------------------------------- + Research: $ARTIFACTS_DIR/research/ + Requirements: $ARTIFACTS_DIR/requirements.md + Context: $ARTIFACTS_DIR/context.md + Plan: $ARTIFACTS_DIR/plan.md + Plan Check: $ARTIFACTS_DIR/plan-check.md + Progress: $ARTIFACTS_DIR/progress.txt + Verification: $ARTIFACTS_DIR/verification.md + Code Review: $ARTIFACTS_DIR/review.md + =============================================================== + ``` diff --git a/.archon/workflows/e2e-all-nodes.yaml b/.archon/workflows/e2e-all-nodes.yaml index a3962b9740..cf534d3a05 100644 --- a/.archon/workflows/e2e-all-nodes.yaml +++ b/.archon/workflows/e2e-all-nodes.yaml @@ -1,8 +1,9 @@ # E2E smoke test — all node types -# Verifies: bash, prompt, script, structured output, model override, $nodeId.output refs +# Verifies: bash, prompt, script (bun), structured output, workflow-level model, effort control, $nodeId.output refs name: e2e-all-nodes description: "Comprehensive E2E test exercising bash, prompt, script, and structured output nodes." provider: claude +model: haiku nodes: # 1. 
Bash node — no AI, runs shell, stdout captured as output @@ -13,14 +14,10 @@ nodes: - id: prompt-simple prompt: "The bash node returned: $bash-check.output — confirm you received it by saying 'received'. Say nothing else." depends_on: [bash-check] + allowed_tools: [] + idle_timeout: 60000 - # 3. Prompt with model override — verifies model selection - - id: prompt-haiku - prompt: "Say 'haiku-ok' and nothing else." - model: haiku - depends_on: [bash-check] - - # 4. Structured output node — verifies output_format translation + # 3. Structured output node — verifies output_format translation - id: structured prompt: "Classify the text 'hello world' as either 'greeting' or 'math'." output_format: @@ -32,20 +29,25 @@ nodes: required: ["category"] additionalProperties: false depends_on: [prompt-simple] + allowed_tools: [] + idle_timeout: 60000 - # 5. Bash node using $nodeId.output from structured node + # 4. Bash node using $nodeId.output from structured node - id: bash-read-output bash: "echo 'Structured output category: $structured.output'" depends_on: [structured] - # 6. Script node (bun runtime) — verifies script execution - - id: script-echo + # 5. Script node (bun runtime) — verifies script execution + - id: script-bun script: echo-args runtime: bun depends_on: [bash-check] + timeout: 30000 - # 7. Prompt with effort control — verifies effort passes through to SDK + # 6. Prompt with effort control — verifies effort passes through to SDK - id: prompt-effort prompt: "Say 'effort-ok' and nothing else." 
effort: low depends_on: [bash-check] + allowed_tools: [] + idle_timeout: 60000 diff --git a/.archon/workflows/e2e-claude-smoke.yaml b/.archon/workflows/e2e-claude-smoke.yaml index e4b0f776a4..9b5c3a5295 100644 --- a/.archon/workflows/e2e-claude-smoke.yaml +++ b/.archon/workflows/e2e-claude-smoke.yaml @@ -1,13 +1,19 @@ # E2E smoke test — Claude provider -# Verifies: provider selection, sendQuery, structured output, tool use +# Verifies: provider selection, sendQuery, structured output, tool use, +# command node, workflow-level model, node-level model override name: e2e-claude-smoke -description: "E2E smoke test for Claude provider. Runs a simple prompt + structured output node." +description: "E2E smoke test for Claude provider. Tests prompt, structured output, tool use, command node, and model overrides." provider: claude +model: haiku nodes: + # 1. Simple prompt — verifies basic sendQuery - id: simple prompt: "What is 2+2? Answer with just the number, nothing else." + allowed_tools: [] + idle_timeout: 60000 + # 2. Structured output — verifies output_format translation - id: structured prompt: "Classify this input as 'math' or 'text': '2+2=4'" output_format: @@ -16,8 +22,26 @@ nodes: category: type: string enum: ["math", "text"] + required: ["category"] + additionalProperties: false + allowed_tools: [] + idle_timeout: 60000 depends_on: [simple] + # 3. Tool use — verifies agent can use tools - id: tool-use - prompt: "Read the file packages/providers/package.json and tell me the package name. Answer with just the name." + prompt: "Read the file package.json and tell me the 'name' field value. Answer with just the name, nothing else." + allowed_tools: [Read] + idle_timeout: 60000 depends_on: [simple] + + # 4. Command node — verifies command file loading + - id: command-test + command: e2e-echo-command + idle_timeout: 60000 + depends_on: [simple] + + # 5. 
Bash node reads structured output field + - id: verify-structured + bash: "echo 'category=$structured.output.category'" + depends_on: [structured] diff --git a/.archon/workflows/e2e-codex-smoke.yaml b/.archon/workflows/e2e-codex-smoke.yaml index 6650f92215..b8d2025311 100644 --- a/.archon/workflows/e2e-codex-smoke.yaml +++ b/.archon/workflows/e2e-codex-smoke.yaml @@ -3,10 +3,12 @@ name: e2e-codex-smoke description: "E2E smoke test for Codex provider. Runs a simple prompt + structured output node." provider: codex +model: gpt-5.1-codex-mini nodes: - id: simple prompt: "What is 2+2? Answer with just the number, nothing else." + idle_timeout: 60000 - id: structured prompt: "Classify this input as 'math' or 'text': '2+2=4'. Return JSON only." @@ -18,4 +20,5 @@ nodes: enum: ["math", "text"] required: ["category"] additionalProperties: false + idle_timeout: 60000 depends_on: [simple] diff --git a/.archon/workflows/e2e-deterministic.yaml b/.archon/workflows/e2e-deterministic.yaml new file mode 100644 index 0000000000..f4a55ae766 --- /dev/null +++ b/.archon/workflows/e2e-deterministic.yaml @@ -0,0 +1,56 @@ +# E2E smoke test — deterministic nodes (no AI, no API calls) +# Verifies: bash nodes, script nodes (bun + uv), $nodeId.output substitution, +# when conditions, trigger_rule join semantics +name: e2e-deterministic +description: "Pure DAG engine test. Exercises bash, script (bun/uv), conditions, and trigger rules with zero API calls." 
+ +nodes: + # Layer 0 — parallel deterministic nodes + - id: bash-echo + bash: "echo '{\"status\":\"ok\",\"value\":42}'" + + - id: script-bun + script: echo-args + runtime: bun + timeout: 30000 + + - id: script-python + script: echo-args + runtime: uv + timeout: 30000 + + # Layer 1 — test $nodeId.output substitution from bash + - id: bash-read-output + bash: "echo 'upstream-status: $bash-echo.output'" + depends_on: [bash-echo] + + # Layer 1 — conditional branches (only one should run) + - id: branch-true + bash: "echo 'branch-true-ran'" + depends_on: [bash-echo] + when: "$bash-echo.output.status == 'ok'" + + - id: branch-false + bash: "echo 'branch-false-ran'" + depends_on: [bash-echo] + when: "$bash-echo.output.status == 'fail'" + + # Layer 2 — trigger_rule merge (one_success: branch-false will be skipped) + - id: merge-node + bash: "echo 'merge-ok: true=$branch-true.output false=$branch-false.output'" + depends_on: [branch-true, branch-false] + trigger_rule: one_success + + # Layer 3 — final verification: collect all outputs + - id: verify-all + bash: | + echo '=== E2E Deterministic Results ===' + echo 'bash-echo: $bash-echo.output' + echo 'script-bun: $script-bun.output' + echo 'script-python: $script-python.output' + echo 'bash-read-output: $bash-read-output.output' + echo 'branch-true: $branch-true.output' + echo 'merge-node: $merge-node.output' + echo '=== ALL PASSED ===' + depends_on: [bash-read-output, script-bun, script-python, merge-node] + trigger_rule: all_success diff --git a/.archon/workflows/e2e-mixed-providers.yaml b/.archon/workflows/e2e-mixed-providers.yaml index 6922056e50..2b2a86ec87 100644 --- a/.archon/workflows/e2e-mixed-providers.yaml +++ b/.archon/workflows/e2e-mixed-providers.yaml @@ -5,20 +5,27 @@ description: "Tests Claude and Codex providers in the same workflow with cross-p # Default provider is claude provider: claude +model: haiku nodes: # 1. Claude node — default provider - id: claude-node prompt: "Say 'claude-ok' and nothing else." 
+ allowed_tools: [] + idle_timeout: 60000 # 2. Codex node — provider override - id: codex-node prompt: "Say 'codex-ok' and nothing else." provider: codex + model: gpt-5.1-codex-mini + idle_timeout: 60000 # 3. Claude node reads Codex output — cross-provider ref - id: claude-reads-codex prompt: "The codex node said: '$codex-node.output'. Confirm you received it by saying 'cross-provider-ok'. Say nothing else." + allowed_tools: [] + idle_timeout: 60000 depends_on: [codex-node] # 4. Bash node verifies both outputs diff --git a/.archon/workflows/e2e-skills-mcp.yaml b/.archon/workflows/e2e-skills-mcp.yaml new file mode 100644 index 0000000000..c6f7f0e087 --- /dev/null +++ b/.archon/workflows/e2e-skills-mcp.yaml @@ -0,0 +1,52 @@ +# E2E smoke test — Claude advanced features (skills, MCP, effort, systemPrompt) +# Verifies: skills injection, MCP server loading, effort control, custom system prompt +name: e2e-skills-mcp +description: "Tests Claude-specific advanced features: skills injection, MCP server, effort control, and systemPrompt." +provider: claude +model: haiku + +nodes: + # 1. Skills injection — verifies AgentDefinition wrapping + - id: skill-test + prompt: "Confirm your skill loading status. If the E2E test skill is loaded, follow its instructions." + skills: + - e2e-test-skill + allowed_tools: [Read] + idle_timeout: 60000 + + # 2. MCP server — verifies MCP config loading and tool availability + - id: mcp-test + prompt: "You have a filesystem MCP server available. Use it to list the contents of /tmp. Report what you find briefly." + mcp: .archon/test-fixtures/mcp/e2e-filesystem.json + idle_timeout: 60000 + depends_on: [skill-test] + + # 3. Effort control — verifies effort passes through to SDK + - id: effort-test + prompt: "Say 'effort-ok' and nothing else." + effort: low + allowed_tools: [] + idle_timeout: 60000 + depends_on: [skill-test] + + # 4. Custom system prompt — verifies systemPrompt injection + - id: system-prompt-test + prompt: "What is your role? 
Answer in 5 words or fewer." + systemPrompt: "You are a smoke test validator. Always start your response with 'VALIDATOR:'" + allowed_tools: [] + idle_timeout: 60000 + depends_on: [skill-test] + + # 5. Context shared — verifies session continuity + - id: context-shared-setup + prompt: "Remember the secret code: ORANGE-42. Say 'stored' and nothing else." + allowed_tools: [] + idle_timeout: 60000 + depends_on: [skill-test] + + - id: context-shared-verify + prompt: "What was the secret code I told you to remember? Say just the code, nothing else." + context: shared + allowed_tools: [] + idle_timeout: 60000 + depends_on: [context-shared-setup] diff --git a/packages/core/src/orchestrator/orchestrator-agent.test.ts b/packages/core/src/orchestrator/orchestrator-agent.test.ts index ab8165ca7e..8d120e46f4 100644 --- a/packages/core/src/orchestrator/orchestrator-agent.test.ts +++ b/packages/core/src/orchestrator/orchestrator-agent.test.ts @@ -1099,6 +1099,12 @@ describe('workflow dispatch routing — interactive flag', () => { expect(mockExecuteWorkflow).toHaveBeenCalled(); expect(mockDispatchBackgroundWorkflow).not.toHaveBeenCalled(); + + // Verify parentConversationId is passed so resume-after-approval works + const callArgs = mockExecuteWorkflow.mock.calls[0] as unknown[]; + // executeWorkflow is called with 11 positional args; index 10 is parentConversationId + expect(callArgs).toHaveLength(11); + expect(callArgs[10]).toBe('conv-1'); }); test('calls dispatchBackgroundWorkflow for non-interactive workflow on web', async () => { diff --git a/packages/core/src/orchestrator/orchestrator-agent.ts b/packages/core/src/orchestrator/orchestrator-agent.ts index d5eb9397b3..c579c1cdb7 100644 --- a/packages/core/src/orchestrator/orchestrator-agent.ts +++ b/packages/core/src/orchestrator/orchestrator-agent.ts @@ -293,7 +293,10 @@ async function dispatchOrchestratorWorkflow( workflow, userMessage, conversation.id, - codebase.id + codebase.id, + undefined, // issueContext + undefined, // 
isolationContext + conversation.id // parentConversationId — enables resume after approval gate ); } else { await dispatchBackgroundWorkflow( diff --git a/packages/docs-web/src/content/docs/guides/approval-nodes.md b/packages/docs-web/src/content/docs/guides/approval-nodes.md index 42ebc48fec..e6c02aeec9 100644 --- a/packages/docs-web/src/content/docs/guides/approval-nodes.md +++ b/packages/docs-web/src/content/docs/guides/approval-nodes.md @@ -55,9 +55,10 @@ to the user on whatever platform they're using (CLI, Slack, GitHub, etc.). On th block the worktree path guard (no other workflow can start on the same path). 4. **Approve**: The user approves, which writes a `node_completed` event for the approval node and transitions the run to resumable. Natural-language - messages (recommended) and the CLI auto-resume immediately. The explicit - `/workflow approve` command records the approval; send a follow-up message - to resume. + messages (recommended), the CLI, and the Web UI all auto-resume immediately. + The explicit `/workflow approve` slash command records the approval and also + auto-resumes on the Web UI; on other platforms it requires a follow-up + message to trigger resume. 5. **Reject**: The user rejects. - **Without `on_reject`**: The workflow is cancelled immediately. - **With `on_reject`**: The executor runs the `on_reject.prompt` via AI (with @@ -227,3 +228,7 @@ PR #871). When approved, the run transitions through `failed` status briefly so that `findResumableRun` picks it up — this avoids duplicating resume logic. The `metadata.approval_response` field distinguishes approved-then-resumed from genuinely-failed runs. + +Interactive loop gates follow a different path: the run stays `paused`, the +approve endpoint auto-dispatches to the orchestrator, and the natural-language +resume path (`getPausedWorkflowRun`) handles the transition. 
diff --git a/packages/docs-web/src/content/docs/guides/authoring-workflows.md b/packages/docs-web/src/content/docs/guides/authoring-workflows.md index c4fdfc7830..1141b88696 100644 --- a/packages/docs-web/src/content/docs/guides/authoring-workflows.md +++ b/packages/docs-web/src/content/docs/guides/authoring-workflows.md @@ -978,7 +978,7 @@ When the workflow reaches `review-gate`, it pauses and notifies you. Approve or - **Natural language** (recommended): Just type your response in the conversation — the system detects the paused workflow and auto-resumes - **CLI**: `bun run cli workflow approve <run-id>` or `bun run cli workflow reject <run-id>` -- **Explicit command**: `/workflow approve <run-id>` or `/workflow reject <run-id>` (records approval; send a follow-up message to resume) +- **Explicit command**: `/workflow approve <run-id>` or `/workflow reject <run-id>` (auto-resumes on Web UI; on other platforms, send a follow-up message after approving) - **Web UI**: Click the Approve/Reject buttons on the dashboard card - **API**: `POST /api/workflows/runs/<run-id>/approve` or `/reject` diff --git a/packages/server/src/routes/api.ts b/packages/server/src/routes/api.ts index 1684a9b773..06f2c21c0f 100644 --- a/packages/server/src/routes/api.ts +++ b/packages/server/src/routes/api.ts @@ -1874,26 +1874,52 @@ export function registerApiRoutes( step_name: approval.nodeId, data: { node_output: nodeOutput, approval_decision: 'approved' }, }); + await workflowEventDb.createWorkflowEvent({ + workflow_run_id: runId, + event_type: 'approval_received', + step_name: approval.nodeId, + data: { decision: 'approved', comment }, + }); + // Transition to 'failed' so findResumableRunByParentConversation picks it up. + // Clear any prior rejection state. 
+ await workflowDb.updateWorkflowRun(runId, { + status: 'failed', + metadata: { approval_response: 'approved', rejection_reason: '', rejection_count: 0 }, + }); + return c.json({ + success: true, + message: `Workflow approved: ${run.workflow_name}.`, + }); } - await workflowEventDb.createWorkflowEvent({ - workflow_run_id: runId, - event_type: 'approval_received', - step_name: approval.nodeId, - data: { decision: 'approved', comment }, - }); - // For interactive loops, store user input; for standard approvals, mark as approved - // and clear any rejection state. - const metadataUpdate = - approval.type === 'interactive_loop' - ? { loop_user_input: comment } - : { approval_response: 'approved', rejection_reason: '', rejection_count: 0 }; + // Interactive loop path: store user input, keep status 'paused' so getPausedWorkflowRun + // finds it, then auto-dispatch to orchestrator to resume without requiring a manual message. await workflowDb.updateWorkflowRun(runId, { - status: 'failed', - metadata: metadataUpdate, + metadata: { loop_user_input: comment }, + }); + // Auto-resume: inject the approval as a message into the parent conversation. + // The orchestrator's natural-language approval path writes approval_received and + // dispatches the resumed workflow. + const parentConvDbId = run.parent_conversation_id ?? run.conversation_id; + const parentConv = await conversationDb.getConversationById(parentConvDbId); + if (!parentConv?.platform_conversation_id) { + // Can't auto-dispatch — surface the failure so the user can resume manually. + getLog().error( + { runId, parentConvDbId, workflowName: run.workflow_name }, + 'api.workflow_run_approve_interactive_loop_no_parent_conv' + ); + return apiError( + c, + 500, + 'Workflow approved but could not auto-resume: parent conversation not found. ' + + 'Send a message to continue the workflow.' 
+ ); + } + void dispatchToOrchestrator(parentConv.platform_conversation_id, comment).catch(err => { + getLog().error({ err, runId }, 'api.workflow_run_approve_interactive_loop_dispatch_failed'); }); return c.json({ success: true, - message: `Workflow approved: ${run.workflow_name}. Send a message to continue the workflow.`, + message: `Workflow approved and resuming: ${run.workflow_name}.`, }); } catch (error) { getLog().error({ err: error, runId }, 'api.workflow_run_approve_failed'); diff --git a/packages/server/src/routes/api.workflow-runs.test.ts b/packages/server/src/routes/api.workflow-runs.test.ts index 41bee85003..f7ad11dc38 100644 --- a/packages/server/src/routes/api.workflow-runs.test.ts +++ b/packages/server/src/routes/api.workflow-runs.test.ts @@ -1251,6 +1251,136 @@ describe('POST /api/workflows/runs/:runId/approve', () => { data: { node_output: '', approval_decision: 'approved' }, }); }); + + test('transitions standard approval run to failed status with cleared rejection metadata', async () => { + mockGetWorkflowRun.mockResolvedValueOnce(MOCK_PAUSED_RUN); + const { app } = makeApp(); + await app.request('/api/workflows/runs/run-paused-1/approve', { + method: 'POST', + body: JSON.stringify({ comment: 'LGTM' }), + headers: { 'Content-Type': 'application/json' }, + }); + expect(mockUpdateWorkflowRun).toHaveBeenCalledWith('run-paused-1', { + status: 'failed', + metadata: { approval_response: 'approved', rejection_reason: '', rejection_count: 0 }, + }); + }); +}); + +// --------------------------------------------------------------------------- +// Tests: POST /api/workflows/runs/:runId/approve — interactive_loop branch +// --------------------------------------------------------------------------- + +const MOCK_LOOP_RUN: MockWorkflowRun = { + ...MOCK_RUNNING_RUN, + id: 'run-loop-1', + status: 'paused', + conversation_id: 'worker-conv-uuid', + parent_conversation_id: 'parent-conv-uuid', + metadata: { + approval: { + type: 'interactive_loop', + nodeId: 
'loop-gate', + message: 'Please provide feedback', + }, + }, +}; + +describe('POST /api/workflows/runs/:runId/approve — interactive_loop branch', () => { + beforeEach(() => { + mockGetWorkflowRun.mockReset(); + mockUpdateWorkflowRun.mockReset(); + mockCreateWorkflowEvent.mockReset(); + mockGetConversationById.mockReset(); + mockHandleMessage.mockReset(); + }); + + test('keeps run status paused and stores loop_user_input in metadata', async () => { + mockGetWorkflowRun.mockResolvedValueOnce(MOCK_LOOP_RUN); + mockGetConversationById.mockResolvedValueOnce({ + id: 'parent-conv-uuid', + platform_conversation_id: 'web-parent-abc', + }); + const { app } = makeApp(); + const response = await app.request('/api/workflows/runs/run-loop-1/approve', { + method: 'POST', + body: JSON.stringify({ comment: 'Looks great, continue' }), + headers: { 'Content-Type': 'application/json' }, + }); + expect(response.status).toBe(200); + // Must NOT call node_completed — executor writes that on actual loop exit + const nodeCompletedCall = mockCreateWorkflowEvent.mock.calls.find( + (c: unknown[]) => (c[0] as Record).event_type === 'node_completed' + ); + expect(nodeCompletedCall).toBeUndefined(); + // Status must stay paused — not transition to 'failed' + expect(mockUpdateWorkflowRun).toHaveBeenCalledWith('run-loop-1', { + metadata: { loop_user_input: 'Looks great, continue' }, + }); + const callArg = mockUpdateWorkflowRun.mock.calls[0][1] as Record; + expect(callArg).not.toHaveProperty('status'); + // Message must indicate auto-resuming + const body = (await response.json()) as { success: boolean; message: string }; + expect(body.message).toContain('resuming'); + expect(body.message).not.toContain('Send a message'); + }); + + test('dispatches to parent conversation when parent_conversation_id is set', async () => { + mockGetWorkflowRun.mockResolvedValueOnce(MOCK_LOOP_RUN); + mockGetConversationById.mockResolvedValueOnce({ + id: 'parent-conv-uuid', + platform_conversation_id: 
'web-parent-abc', + }); + const { app } = makeApp(); + await app.request('/api/workflows/runs/run-loop-1/approve', { + method: 'POST', + body: JSON.stringify({ comment: 'proceed' }), + headers: { 'Content-Type': 'application/json' }, + }); + // Allow fire-and-forget microtask to flush + await new Promise(resolve => setTimeout(resolve, 0)); + expect(mockHandleMessage).toHaveBeenCalledWith( + expect.anything(), + 'web-parent-abc', + 'proceed', + expect.anything() + ); + }); + + test('falls back to conversation_id when parent_conversation_id is null', async () => { + const runNullParent = { + ...MOCK_LOOP_RUN, + parent_conversation_id: null, + conversation_id: 'worker-conv-uuid', + }; + mockGetWorkflowRun.mockResolvedValueOnce(runNullParent); + mockGetConversationById.mockResolvedValueOnce({ + id: 'worker-conv-uuid', + platform_conversation_id: 'web-worker-abc', + }); + const { app } = makeApp(); + await app.request('/api/workflows/runs/run-loop-1/approve', { + method: 'POST', + body: JSON.stringify({ comment: 'go' }), + headers: { 'Content-Type': 'application/json' }, + }); + expect(mockGetConversationById).toHaveBeenCalledWith('worker-conv-uuid'); + }); + + test('returns 500 when parent conversation cannot be resolved', async () => { + mockGetWorkflowRun.mockResolvedValueOnce(MOCK_LOOP_RUN); + mockGetConversationById.mockResolvedValueOnce(null); + const { app } = makeApp(); + const response = await app.request('/api/workflows/runs/run-loop-1/approve', { + method: 'POST', + body: JSON.stringify({ comment: 'proceed' }), + headers: { 'Content-Type': 'application/json' }, + }); + expect(response.status).toBe(500); + const body = (await response.json()) as { error: string }; + expect(body.error).toContain('could not auto-resume'); + expect(mockHandleMessage).not.toHaveBeenCalled(); + }); }); // ---------------------------------------------------------------------------