From 6c9bef82063a9215a8b2e596bc380ff7b3c92eaf Mon Sep 17 00:00:00 2001 From: Catherine Date: Mon, 18 May 2026 18:22:06 +0800 Subject: [PATCH 1/2] docs(boot): add commit message guardrails Document local and range-based commit message checks for new project scaffolds so generated repos reject attribution and tool-advertising trailers before they reach protected history. --- commands/boot-new.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/commands/boot-new.md b/commands/boot-new.md index df48920..0c8f287 100644 --- a/commands/boot-new.md +++ b/commands/boot-new.md @@ -131,6 +131,8 @@ This command creates the **context infrastructure** that enables smooth vibe cod - **Mandatory Hooks**: `trailing-whitespace`, `end-of-file-fixer`, `check-yaml`. - **Language Hooks**: `ruff` (Python), `fmt` (Rust), `prettier` (Web). - **Go hooks**: `gofumpt`, `goimports`, `go-mod-tidy`, `go-vet`, `golangci-lint`. Use the **active fork** `TekWizely/pre-commit-golang` — the older `dnephin/pre-commit-golang` is archived (upstream abandoned ~2023) and MUST NOT be used for new projects. If the active fork's hook set is insufficient, prefer a `local` hook that calls `make lint` directly rather than pulling in an archived dep. + - **Commit message policy**: generate a local `commit-msg` hook that runs `scripts/check-commit-message.sh` and rejects AI attribution, AI co-author trailers, generated-by/generated-with signatures, vendor/tool advertising phrases, and AI-agent names anywhere in commit messages. The blocklist must include at least: `Cursor`, `cursoragent@cursor.com`, `Claude`, `Codex`, `ChatGPT`, `OpenAI`, `Anthropic`, `Gemini`, `Copilot`, `Windsurf`, `Aider`, `Devin`, `Replit Agent`, `Qodo`, `Tabnine`, `Amazon Q`, `CodeWhisperer`, `Sourcegraph Cody`, `Cline`, `Roo Code`, `OpenCode`, `Kilo Code`, `Trae`, `Continue.dev`, `Codeium`, `Augment Code`, and `Zed AI`. The hook must fail closed with a clear error; it must not rewrite the message silently. 
+ - **Commit range policy**: generate `scripts/check-commit-range.sh` for CI/pre-push use. It must inspect every commit in a provided rev-list range and reject the same forbidden AI attribution, advertising, and agent-name patterns. This protects against `git commit --no-verify`, injected `git commit --trailer`, and local hook drift. Commit history is not an advertising surface for tool vendors. 3. **Scaffold Project Rules**: - Copy `.cursor/rules/*.mdc` templates into the project's `.cursor/rules/`. - Generate `AGENTS.md` at project root for local cross-tool compatibility. @@ -151,8 +153,9 @@ This command creates the **context infrastructure** that enables smooth vibe cod 3. **Bootstrap CI (Minimal)**: - **Action**: Generate `.github/workflows/ci.yml`. - **Logic**: "Fail Fast". - - **Steps**: Checkout -> Setup Cache -> Lint (Strict) -> Test (Unit). + - **Steps**: Checkout with full history -> Check commit messages with `scripts/check-commit-range.sh` -> Setup Cache -> Lint (Strict) -> Test (Unit). - **Constraint**: Pin versions (for example `ubuntu-24.04` or the project-approved runner image, and pinned actions). Do not describe floating `latest` runner tags as pinned. + - **GitHub Actions range check**: use `fetch-depth: 0`. For pull requests, check `origin/${GITHUB_BASE_REF}..HEAD`; for pushes, check `${{ github.event.before }}..HEAD` and fall back to `HEAD` when the before SHA is all zeros. This status check is mandatory for protected branches. ### Phase 7: UI Target Scaffold (web | desktop | mobile) From b23d4726bff742750182bf9e24524dd45a5cfd96 Mon Sep 17 00:00:00 2001 From: Catherine Date: Mon, 18 May 2026 18:22:19 +0800 Subject: [PATCH 2/2] feat(workflow): add orchestrator command Add workflow-orchestrator as the monitored workflow entrypoint and long-running session owner. 
The command records local agent state, coordinates role spawning, requires provider approval, and routes models by quality, cost, speed, context, prior outcomes, and reviewer independence. Update workflow docs, reviewer policy, metadata inference, and generated-output tests so the new command exports through the normal target pipeline. --- README.md | 6 +- commands/boot-workflow.md | 7 +- commands/workflow-boss.md | 11 +- commands/workflow-orchestrator.md | 255 ++++++++++++++++++++++++++++++ commands/workflow-reviewer.md | 13 ++ scripts/agent-surface.mjs | 7 +- tests/agent-surface.test.mjs | 55 +++++-- 7 files changed, 331 insertions(+), 23 deletions(-) create mode 100644 commands/workflow-orchestrator.md diff --git a/README.md b/README.md index 11140d8..652e78a 100644 --- a/README.md +++ b/README.md @@ -137,7 +137,11 @@ Canonical workflow state lives under `.agent-surface/workflows/<run_id>/`. Cursor-specific `.cursor/.workflow/` files are compatibility artifacts only; canonical `.agent-surface` state wins when they disagree. -Start with `/flow` when the right path is unclear. Use `workflow-doctor` before acting on an existing run, and `workflow-close` to archive metrics and unresolved risks after an accepted or aborted run. +Start with `/flow` when the right path is unclear. For formal workflow mode, start `workflow-orchestrator`; it is the long-running monitor session that spawns BOSS, worker, reviewer, judger, rescue, QA, and close roles until the run has no remaining work or automation must stop. Use `workflow-doctor` before acting on an existing run, and `workflow-close` to archive metrics and unresolved risks after an accepted or aborted run. + +`workflow-boss` should queue the largest coherent batch it can specify safely; `workflow-reviewer` gates the batch across AC compliance, security, docs, dependency hygiene, tests, config, and other QA risks before anything reaches `ship-commit`. 
+ +`workflow-orchestrator` chooses the next role, provider, model, and launch shape from the ledger, local availability, cost, speed, context, prior outcomes, and required model independence. It records local `agents.json` state and keeps monitoring until the remaining task queue is finished, closed, quarantined, aborted, or handed to a human. It does not replace BOSS, worker, reviewer, judger, rescue, or close role-file ownership. Workflow-aware workers currently include `dev-feature`, `dev-fix`, `dev-chore`, and `dev-refactor`. diff --git a/commands/boot-workflow.md b/commands/boot-workflow.md index 5f3df97..2bb7a60 100644 --- a/commands/boot-workflow.md +++ b/commands/boot-workflow.md @@ -41,14 +41,19 @@ User invokes command (e.g., "dev-refactor", "arch-roadmap") ↑ ↑ ↓ ↓ ↓ [ops-report] ─→ [dev-fix] ──────┴─ [dev-refactor] ←┘ ↑ ↑ - [qa-sec] + [qa-sec] ``` +For formal workflow mode, start `workflow-orchestrator`. It is the long-running monitor that spawns each role session, chooses provider/model/role assignments from quality, cost, speed, context, prior outcomes, and review independence, and stays active until the run is closed, quarantined, aborted, human-blocked, or has no remaining task work. `boot-workflow` remains the workflow map, ledger contract, and role-file reference. + ### Structured workflow (v3 — validated run ledger) For medium/high-risk tasks requiring formal spec → implement → verify → review cycles. **A round processes a batch of tasks**, not a single task. ```text +[workflow-orchestrator] [long-running monitor: choose role + provider/model, spawn, heartbeat, rehydrate] + | + v [workflow-boss] [validated run: run_id + lock + base commit/tree + filescope + tasks[] + batch_policy] | v diff --git a/commands/workflow-boss.md b/commands/workflow-boss.md index 9518e76..0674f34 100644 --- a/commands/workflow-boss.md +++ b/commands/workflow-boss.md @@ -16,7 +16,8 @@ If essential repo context is missing, run `boot-context` first. 
- **The worker burns through the queue** in dependency order, stopping only on (a) a blocker on the current task, (b) self-assessed context pressure, or (c) the queue going empty. - **The reviewer reviews the batch**, emitting per-task verdicts plus an aggregate status. - **Single-task runs are still legal** — a queue of one uses the same v3 ledger shape. -- **Don't over-batch.** Group tasks that share FILESCOPE, contract, or test surface. Don't queue 30 unrelated chores into one BOSS — that defeats reviewer focus. +- **Maximize coherent batching.** Assign as many related tasks as can be specified safely in one round, constrained by shared FILESCOPE, route, dependency chain, risk, context budget, and verification cost. +- **Don't over-batch.** Group tasks that share FILESCOPE, contract, or test surface. Don't queue unrelated chores or mixed-risk work into one BOSS — that defeats reviewer focus. - **Use workflow only when it earns its overhead.** Direct small edits do not need six roles. Use this path for medium/high-risk, multi-step, cross-file, security-sensitive, or ambiguity-heavy work. - **The run ledger is the source of truth.** `run.json` plus `events.ndjson` records state transitions; role files are artifacts, not state authority. @@ -24,7 +25,7 @@ If essential repo context is missing, run `boot-context` first. ### Phase 0: Workflow bootstrap (role files) -- `workflow-boss` is the entrypoint that starts or refreshes workflow mode. +- `workflow-orchestrator` is the monitored workflow entrypoint. `workflow-boss` is the first spec-owning role it spawns, and it may still be invoked directly for manual respec or recovery. - Use the project-local workflow folder: `.agent-surface/workflows/<run_id>/` - Write the active-run pointer at `.agent-surface/workflows/current.json`. - Treat `.cursor/.workflow/` as a Cursor compatibility surface only. It may mirror latest role files, but it is not canonical. 
@@ -72,11 +73,13 @@ If essential repo context is missing, run `boot-context` first. Build a list of 1 to N tasks. Each task is independently reviewable, verifiable, and revertable. - Sort tasks by dependency. Use `depends_on: [task_id, …]` when a task requires the output of an earlier one. +- Prefer a full useful queue over a tiny handoff. If more related tasks fit the same route, FILESCOPE, and verification surface, include them instead of leaving obvious follow-up tasks for a later BOSS round. - Prefer one concern, one testable behavior, one reversible patch per task. Estimate by filescope size, dependency count, and verify cost, not line count. +- Split the run instead of batching when tasks require different worker routes, unrelated filescopes, incompatible risk levels, or separate human approvals. - Each task gets its own `plan` (3–8 steps), `ac`, and `verify` commands. - Don't pre-commit the worker to a specific implementation — leave room for judgment, but pin the contract. - Each task must be isolatable: require either `patch_required: true` (default) or `commit_required: true` if the user wants per-task commits. Without patch/commit isolation, `MERGE_PARTIAL` is disabled. -- Default `max_tasks_per_round` to 3 unless the task queue is tiny or low-risk. Larger batches require a short justification. +- Default `max_tasks_per_round` to 5 for coherent low/medium-risk queues. Use fewer for high-risk, ambiguous, cross-cutting, dependency, data, security, or approval-gated work. Larger batches require a short justification tied to shared filescope and cheap verification. Keep the process deterministic: route by explicit `workflow.next_command` plus the task queue, not free-form agent debate. @@ -110,7 +113,7 @@ Define when the worker should stop the round: - `stop_on`: subset of `["blocker", "context_pressure", "queue_empty", "max_tasks_cap", "drift_check"]` (default: all five). 
- `context_pressure_threshold_pct`: heuristic for context budget (default: 70), backed by concrete counters. -- `max_tasks_per_round`: default 3. Use 5 only for low-risk small tasks; justify anything higher. +- `max_tasks_per_round`: default 5 for coherent low/medium-risk queues. Use 1–3 for high-risk or unclear work; justify anything higher than 5. - `drift_check_every`: re-read BOSS spec after this many completed tasks (default: 5). Catches scope drift before it metastasizes. - `timeout_budget_ms`: max elapsed runner time for a round before handoff. - Context pressure counters: files opened, total bytes read, commands run, verify cycles, log bytes, failed attempts, elapsed time, and model-reported context usage when available. diff --git a/commands/workflow-orchestrator.md b/commands/workflow-orchestrator.md new file mode 100644 index 0000000..45d1c44 --- /dev/null +++ b/commands/workflow-orchestrator.md @@ -0,0 +1,255 @@ +## OBJECTIVE + +Run the long-running workflow monitor without replacing workflow roles, role-file ownership, or ledger authority. + +`workflow-orchestrator` is the normal workflow-mode entrance and session owner. It starts or rehydrates the active run, spawns each downstream role, chooses provider/model/role assignments, records heartbeats, and keeps control until the task queue is finished or automation must stop. + +The monitor may plan, spawn, observe, and hand off to approved headless role sessions. It is not BOSS, worker, reviewer, judger, rescue, close, or ship. It invokes those commands as managed role sessions, then validates their artifacts and advances by the ledger. + +Direct role commands remain valid for manual recovery, debugging, or user-directed single-role work. They are not the normal start point for a monitored workflow run. 
+ +## ROLE HIERARCHY + +- First level: `workflow-orchestrator` monitors workflow state, starts or resumes managed role sessions, selects role/provider/model assignments, records local agent state, verifies handoff artifacts exist, and advances only by the ledger's next command. +- Managed role sessions: `workflow-boss`, `dev-feature`, `dev-fix`, `dev-chore`, `dev-refactor`, `workflow-reviewer`, `workflow-judger`, `workflow-rescue`, `workflow-close`, and workflow-aware QA routes such as `qa-trace` or `qa-review`. +- The monitor must not directly write `boss.json`, `worker.json`, `reviewer.json`, `judger.json`, or `rescue.json` on behalf of those roles. +- The monitor must not directly implement code changes, review patches, adjudicate failures, rescue a run, ship work, or bypass `run.json.workflow_next_command` / artifact `workflow.next_command`. +- The monitor must not exit while `run.json` still has remaining, active, rework, or deferred task IDs that can be advanced automatically. +- Ledger artifacts are authoritative. Model/provider sessions are cache and may be discarded, respawned, or marked stale. + +Durable workflow authority lives in `.agent-surface/workflows/<run_id>/run.json`, `events.ndjson`, canonical round artifacts, role files, patch manifests, and evidence refs. The monitor registry is local execution state only. + +## AUTHORIZATION GATE + +- Reading workflow artifacts and writing `.agent-surface/workflows/<run_id>/agents.json` are local monitor actions. +- Provider availability probes, model probes, headless launches, live installs, or any command that contacts an external service require explicit user approval in the current turn or durable project policy. +- If approval is missing, write a launch plan or dry-run summary and stop before invoking the provider. +- Do not pass secrets, `.env` contents, cookies, credentials, or raw customer data to external role sessions. 
Prompts must be scoped to task specs, filescope, role contracts, hashes, and redacted evidence refs. +- Networked Node-based CLIs must run with TLS verification enabled; strip inherited `NODE_TLS_REJECT_UNAUTHORIZED` instead of trusting the parent shell. + +## MONITOR DUTIES + +1. Resolve `.agent-surface/workflows/current.json`, `run.json`, and the latest role files. If no valid active run exists, spawn `workflow-boss` to create one. +2. Run `workflow-doctor` or equivalent structural validation before launching any role. +3. Validate `run_id`, round, branch, lock, parent hashes, active task IDs, file scope, `run.json.workflow_next_command`, and artifact `workflow.next_command` before launching any role. +4. Start by invoking `workflow-boss` unless an active run is already valid. Do not synthesize BOSS artifacts from monitor memory. +5. Spawn or reuse a managed role session only when its `run_id`, command, role class, task scope, file scope, and base tree match the ledger. +6. Choose the role, provider, model, session shape, and write access from the ledger route, task risk, local availability, quota, context size, quality/cost/speed tradeoff, prior outcomes, approval state, and whether the role needs source writes. +7. Feed each role only the task spec, filescope, role contract, relevant role files, and redacted evidence refs it needs. +8. Require heartbeat evidence for long-running sessions: current task, files touched, command running, last check result, current blocker, and latest artifact path. +9. After a role writes its artifact, validate the file exists, update monitor state, and follow the next command from the ledger. +10. Mark lost, mismatched, or superseded sessions as `stale` or `closed`. Stale session memory must never override a fresh role artifact. +11. Continue until `workflow-close` finishes the run, automation reaches `requires_human: true`, or the run is explicitly quarantined or aborted. Do not stop just because one role session returned. 
+ +## AGENT REGISTRY + +When the monitor spawns or reuses agents, keep `.agent-surface/workflows/<run_id>/agents.json`. This file is project-local workflow state and must not be committed. + +Minimum shape: + +```json +{ + "schema_version": "workflow.monitor.v1", + "run_id": "same run_id as run.json", + "agents": [ + { + "agent_id": "stable monitor-local id", + "command": "workflow-boss|dev-feature|dev-fix|dev-chore|dev-refactor|workflow-reviewer|workflow-judger|workflow-rescue|workflow-close|qa-trace|qa-review", + "role_class": "boss|worker|reviewer|judger|rescue|closer|qa", + "provider": "codex|claude-code|gemini|cursor|grok|ollama-codex|ollama-claude|ollama-opencode|other", + "model": "provider model id", + "task_ids": [], + "filescope": [], + "session_ref": "sub-agent id, process id, tty id, or external session handle", + "prompt_ref": "path to the exact redacted prompt file", + "log_ref": "path or evidence ref", + "approval_ref": "user message, durable policy, or null for local-only sessions", + "selection_snapshot": { + "quality_tier": "frontier|strong|economy|experimental|unknown", + "cost_tier": "premium|standard|economy|unknown", + "speed_tier": "fast|balanced|slow|unknown", + "context_window_tokens": 0, + "max_output_tokens": 0, + "tool_fit": ["terminal", "apply_patch", "structured_output"], + "independence_group": "openai|anthropic|google|deepseek|moonshot|zai|minimax|other", + "selection_reason": "why this model is assigned to this role", + "source_refs": ["provider probe, pricing page, leaderboard snapshot, or local benchmark ref"] + }, + "outcome_summary": { + "role_result": "pending|accepted|partial|rejected|failed|stale", + "wall_time_seconds": 0, + "estimated_cost_usd": null, + "tokens_in": null, + "tokens_out": null, + "blind_spots": [], + "lesson": "short reusable lesson or null" + }, + "status": "running|idle|done|failed|stale|closed", + "started_at": "ISO-8601 timestamp", + "last_heartbeat_at": "ISO-8601 timestamp" + } + ] +} +``` + +## SPAWN AND 
REHYDRATE + +1. Read the ledger first: `current.json`, `run.json`, `boss.json`, latest downstream role file, patch manifests, and redacted evidence refs. +2. Choose the next managed role from `run.json.workflow_next_command` and artifact `workflow.next_command`; do not infer the route from mtime, chat history, or provider session state. +3. Pick a provider/model only after checking local availability, quota, approval state, risk, context size, independence from adjacent roles, and whether the role needs write access. +4. Launch the role with a strict prompt containing role command, allowed output file, exact `run_id`, round, task IDs, filescope, parent artifact hashes, allowed checks, and the rule that ledger artifacts are authoritative. +5. If the agent loses context, exits, or drifts from scope, respawn and rehydrate from role files and evidence refs. +6. After each role exits, re-read `run.json`, the new role artifact, and `events.ndjson`; then spawn the next role if work remains. +7. If ledger evidence is missing or inconsistent, stop and hand off to the user or `workflow-rescue`; do not reconstruct state from chat memory. + +## EXIT CONDITIONS + +The orchestrator remains active until one of these terminal conditions is reached: + +- `workflow-close` writes a closed or aborted run and clears `current.json`. +- `run.json.status` becomes `closed`, `quarantined`, or `aborted`. +- A role artifact sets `requires_human: true` or `workflow.next_command: null` for a human-required state. +- No active, rework, deferred, or remaining task IDs can be advanced, and the next command is `workflow-close`. + +Non-terminal role completion is not an orchestrator exit condition. It is a handoff point to validate artifacts, update `agents.json`, and spawn the next role. + +Headless launch patterns are examples. Verify flags locally before use and satisfy the authorization gate before running them. 
+ +```bash +env -u NODE_TLS_REJECT_UNAUTHORIZED grok -m grok-build --cwd "$repo" --prompt-file "$prompt_file" --output-format json --max-turns 12 +env -u NODE_TLS_REJECT_UNAUTHORIZED grok agent headless -m grok-build --cwd "$repo" --prompt-file "$prompt_file" --output-format json +env -u NODE_TLS_REJECT_UNAUTHORIZED cursor agent --workspace "$repo" --model gpt-5 --print --output-format json "$(cat "$prompt_file")" +env -u NODE_TLS_REJECT_UNAUTHORIZED PATH=/opt/homebrew/bin:$PATH GEMINI_CLI_TRUST_WORKSPACE=true gemini --prompt "$(cat "$prompt_file")" --output-format json --model gemini-3-pro-preview +ollama launch codex --model kimi-k2.6:cloud +ollama launch opencode --model minimax-m2.7:cloud +ollama launch claude --model glm-5.1:cloud -- -p "$(cat "$prompt_file")" +``` + +Cursor and Grok headless modes can write files and run shell commands. Give them the same role contract, filescope, and evidence requirements as any other managed role. Keep provider logs local or redact them before referencing evidence. + +## HEARTBEATS AND STALE SESSIONS + +- Update `agents.json` after spawn, heartbeat, handoff, failure, and close. +- A heartbeat should name the current task IDs, files touched, command running, most recent check status, latest artifact/evidence refs, and blocker if present. +- Treat a session as stale when `run_id`, role, task scope, filescope, branch, base tree, or round no longer matches the ledger. +- Kill or close stale sessions when the provider supports it. Otherwise record status `stale` and stop feeding it new context. +- Repeated task rejection routes to `workflow-judger`; the monitor must not loop the same worker indefinitely. +- Human intervention is required if a lock, artifact hash, patch manifest, or role owner contract is inconsistent. + +## PROVIDER NOTES + +Probe provider availability before each monitored run. Treat display names, leaderboard labels, and old notes as hints, not launchable IDs. 
+ +| Provider | Compact note | +| --- | --- | +| Cursor | May need `HTTP_PROXY=http://127.0.0.1:7890` and `HTTPS_PROXY=http://127.0.0.1:7890`. If it reports no account models, do not assign it workflow roles. | +| Gemini | Requires compatible Node. User reports Node v26 / npm 11.12.1 works; Node v18 can fail. Use `GEMINI_CLI_TRUST_WORKSPACE=true` or `--skip-trust` for headless runs. | +| Grok | Build model id is `grok-build`; use with caution and verify local CLI behavior before assigning write-scoped roles. | +| Ollama Cloud | Verify model IDs before use. Direct `ollama run` supports `--think false --hidethinking` when evidence logs must not include reasoning traces. | + +## MODEL ROUTING + +No external model is a durable default. Re-probe provider IDs, quota, context size, write capability, and observed behavior before each monitored run, then record the probe command and result in local evidence. + +Optimize the workflow, not a single role. Model choice is a trade-off across: + +- **Quality**: coding, tool use, reasoning, instruction following, and repo-scale consistency. +- **Cost**: token price, prompt-cache behavior, long-context surcharges, subscription quota, and retry rate. +- **Speed**: TTFT, generation throughput, provider queueing, and CLI startup overhead. +- **Context**: usable context window, max output, compaction behavior, and whether reasoning tokens consume budget. +- **Tool fit**: shell/write support, structured output, patch application, file search, MCP, and provider-specific headless reliability. +- **Independence**: reviewer and judger should normally use a different provider family or model class from the worker, so the review is less likely to share the worker's blind spots. +- **Privacy and approval**: local-only constraints, external-service approval, and whether prompts contain sensitive code or evidence. 
+- **Observed outcomes**: prior accept/reject rates, failure modes, drift incidents, cost, latency, and user corrections from this repo. + +Do not always choose the highest-quality model. Use expensive frontier models where failure cost is high; use cheaper or faster models for bounded worker tasks, routine chores, shadow reviews, and exploration. A premium model that creates perfect code too slowly or expensively can still be the wrong orchestration choice. + +Do not assign the same model family to worker and reviewer for non-trivial work when an approved alternative exists. If the worker used `gpt-5.5`, prefer Claude, Gemini, DeepSeek, Kimi, GLM, or another independent reviewer candidate; if the worker used Claude, prefer OpenAI, Gemini, DeepSeek, Kimi, GLM, or another independent reviewer candidate. If forced to reuse the same family, record the reason and require stricter evidence validation. + +Match model signals to the role instead of collapsing everything into one leaderboard rank: + +- Implementation workers: prefer coding, SWE-bench, Terminal-Bench, Code Arena, tool-use, patch fidelity, speed, and cost. +- Reviewers/judgers: prefer reasoning, GPQA/HLE-style hard-question scores, evidence discipline, tool-use, and model-family independence from the worker. +- BOSS/planning: prefer instruction following, context reliability, reasoning, repo-scale consistency, and low tendency to over-specify implementation. +- QA/security/research: prefer search, BrowseComp/SimpleQA, domain benchmarks, tool calling, and conservative uncertainty handling. +- UI/visual tasks: prefer vision/MMMU/MMMU-Pro/ScreenSpot-style signals and multimodal launch support. +- Long runs: penalize high TTFT/latency, weak heartbeats, quota fragility, expensive retries, and models that degrade under context pressure even if nominal context is large. 
+ +| Role class | Selection rule | Guardrail | +| --- | --- | --- | +| `boss` | Highest-trust available planner with enough context for repo evidence, task queue, filescope, policy surface, and batching. | Must not write implementation files. Avoid overpaying for tiny scopes; if no approved external launch exists, use the current local agent or direct `workflow-boss`. | +| `worker` | Coding-capable model with the best cost/speed/quality balance for the scoped patch. | Require patch isolation, evidence capture, and hard stop on blocker or scope drift. Prefer economy/standard candidates for low-risk narrow tasks; escalate to frontier only for hard implementation. | +| `reviewer` / `judger` | Independent strong reasoning model with enough context for BOSS, worker output, patch manifests, and evidence refs. | Prefer a different provider family from the worker. Prefer log/hash validation over raw provider summaries. Missing evidence means fail closed. | +| `rescue` | Senior model or human path chosen for the specific failure mode and blind spot observed. | PATCH mode is proposed-only unless the user explicitly authorizes takeover. Prefer a model family not used in the failed worker/reviewer loop. | +| `qa` | Domain-specific reviewer or cheaper shadow model only after the BOSS/reviewer route requires it. | Security or network-sensitive probes still require explicit approval. Use multiple cheap QA passes only when they add independent coverage. | + +Seed candidate matrix for May 2026. Treat it as a starting point, not truth. Refresh launch IDs, pricing, quota, context, and benchmark rows before a real run. + +| Candidate | Current strengths | Trade-off / caution | Good default roles | +| --- | --- | --- | --- | +| `gpt-5.5` | Strong all-purpose frontier choice: high coding/reasoning/tool/vision signals, Terminal-Bench leadership, 1M-class context, broad hosted tools. 
| Premium output cost, slow TTFT in public snapshots, and bad reviewer independence if it also wrote the patch. | BOSS for complex runs, hard worker, independent reviewer for non-OpenAI workers, rescue. | +| `gpt-5.4` | Strong balanced OpenAI model: much faster and cheaper than `gpt-5.5`, strong coding/tool signals, 1M-class context. | Lower ceiling than `gpt-5.5`; still too expensive for throwaway bulk work. | Balanced BOSS, worker, fast reviewer when worker was non-OpenAI. | +| `gpt-5.3-codex` / `gpt-5.2-codex` | Coding-specialized, fast, strong Terminal-Bench/SWE-style signals for patch loops. | Smaller context than GPT-5.4/5.5 and sparse non-coding benchmark coverage; availability may be Codex-specific. | Worker, patch repair, focused code review for non-OpenAI workers. | +| `claude-opus-4-7` | Strong SWE-bench Pro/Verified, reasoning, MCP/tool-orchestration, vision, and code-quality signals; fast TTFT versus several frontier peers. | Premium cost; weaker search signal; avoid same-family worker/reviewer pairing. | Hard worker, reviewer/judger against OpenAI or economy workers, rescue. | +| `claude-sonnet-4-6` | Cheaper than Opus, solid OS/browser/visual-agent and coding baseline, useful Claude-family diversity. | 200K context and weaker frontier coding/reasoning than Opus. | Standard worker, UI/browser QA, reviewer for low/medium-risk non-Claude workers. | +| `gemini-3.1-pro-preview` | Highest Code Arena-style signal in the provided snapshot, strong GPQA/MMMU/HLE/BrowseComp and multimodal/tool support, 1M input context. | Preview model; high latency; long-context leaderboard signal can lag nominal context; tool score is not top-tier. | Long-context BOSS, research/QA, reviewer diversity, multimodal or tool-heavy worker after probe. | +| `gemini-3-flash` | Very fast and cheap with 1M-class context and strong math signal. | Coding/tool/review quality is materially lower than frontier models. | Cheap triage, summarization, low-risk QA, broad queue scanning. 
| +| `deepseek-v4-pro` / provider `pro-max` label | Low cost for 1M-class context, strong math/reasoning/domain and decent SWE/coding signals; useful non-Western model-family diversity. | Very high latency in public snapshot, no multimodal in that row, pricing labels/discounts change, quality must be proven locally. | Economy worker for large-context tasks, shadow reviewer, second opinion before escalating to premium models. | +| `deepseek-v4-flash` / provider `flash-max` label | Extremely cheap, 1M-class context, useful for bulk exploration. | Lower coding/tool/search signals than Pro; latency still non-trivial; not a sole high-risk reviewer. | Low-risk worker, bulk scan, cheap shadow QA. | +| `kimi-k2.6` | Cheap coding/tool candidate with good search, SciCode, GPQA, SWE-bench Verified, and BrowseComp signals; useful independent reviewer versus OpenAI/Claude. | 262K context and high latency in public snapshot; verify launch ID and local harness behavior. | Worker, chore/refactor, second-opinion reviewer, research QA. | +| `glm-5.1` | Cheap/fast coding-reasoning candidate with decent MCP/tool and SWE-bench Pro signals; good model-family diversity. | 200K context, no multimodal in snapshot, and possible long-loop behavior; require heartbeat stops. | Worker, reviewer diversity, rescue second opinion. | +| `glm-5` | Very fast, cheap, high Code Arena among economy choices. | Weaker coding/tool/reasoning than GLM-5.1 and no multimodal in snapshot. | Low-risk worker, docs/chore, cheap exploratory pass. | +| `qwen3.5-397b` / `qwen3.6-plus` | Cheap long-context family with useful finance/legal/health/domain signals and reasonable GPQA. | Weaker coding/tool signals and slow latency in snapshot; not primary implementation reviewer. | Domain QA, research pass, low-risk worker if provider is already approved. | +| `minimax-m2.7` | Very cheap and fast enough for small bounded tasks. | Lower agentic-terminal/code confidence; do not use as sole reviewer for high-risk changes. 
| Low-risk docs/chore worker, cheap shadow QA. | + +## MODEL LEARNING LOOP + +The orchestrator must learn from each run without treating anecdotes as permanent truth. + +1. Before launch, record a `selection_snapshot` in `agents.json`: model id, provider, role, quality/cost/speed/context/tool-fit tiers, independence group, selection reason, approval ref, and source refs. +2. During the run, record heartbeats with wall time, context pressure, commands, checks, retries, and drift/blocker notes. +3. After role completion, update `outcome_summary`: accepted/partial/rejected/failed, wall time, estimated cost or token counters when available, evidence quality, reviewer findings, blind spots, and a one-line lesson. +4. On future assignments, prefer models with good local outcomes for the same role and risk class, but reserve some low-risk tasks for exploration when the queue has enough slack. +5. Promote a model only after repeated local wins: clean patch, low rework, good evidence, acceptable cost, and no repeated blind spots. +6. Demote a model for repeated scope drift, missing evidence, over-eager PASS verdicts, broken tool use, excessive latency, excessive cost, or repeated user/reviewer corrections. +7. Keep model lessons local to the repo/run unless the user asks to publish or distribute them. Do not commit `agents.json`. + +## OUTPUT HYGIENE + +- Strip `NODE_TLS_REJECT_UNAUTHORIZED` from spawned networked agents unless a role explicitly proves a temporary local TLS exception is required. +- Keep proxy settings explicit per launch. Do not rely on hidden parent-shell proxy state. +- Keep raw provider logs local. When recording evidence, store the final answer, command, exit code, and redacted diagnostics instead of full thought-bearing JSON. +- For direct Ollama probes, prefer `ollama run --think=false --hidethinking ...` when only the final answer is needed. 
+- Gemini JSON includes token counters for thoughts in `stats`; those are usage metrics, not chain-of-thought text. Record counts only when useful. +- Grok JSON can include thought-like fields. Do not paste raw JSON into workflow artifacts without redaction. + +## DISTRIBUTION + +`workflow-orchestrator` is a default-exported command. It must move through the normal agent-surface pipeline instead of manual copies: + +```bash +node scripts/agent-surface.mjs build --target all +npm run check:commands +npm run check:generated +node scripts/agent-surface.mjs install --target codex --scope user --dry-run +node scripts/agent-surface.mjs install --target gemini-cli --scope user --dry-run +node scripts/agent-surface.mjs install --target cursor --scope user --dry-run +``` + +For live distribution, dry-run every supported user-scope target from the README first, then install only after explicit user approval. Do not edit generated target surfaces directly. + +Useful local probes: + +```bash +env -u NODE_TLS_REJECT_UNAUTHORIZED cursor agent models +env -u NODE_TLS_REJECT_UNAUTHORIZED cursor agent status +env -u NODE_TLS_REJECT_UNAUTHORIZED PATH=/opt/homebrew/bin:$PATH gemini --help +env -u NODE_TLS_REJECT_UNAUTHORIZED grok models +env -u NODE_TLS_REJECT_UNAUTHORIZED grok -m grok-build -p "Reply with OK only." --output-format json --max-turns 1 +ollama show kimi-k2.6:cloud +ollama show deepseek-v4-pro:cloud +ollama show glm-5.1:cloud +ollama show minimax-m2.7:cloud +``` + +Do not claim any provider or model ID is verified unless the current worker ran the probe and recorded the output as local evidence. 
diff --git a/commands/workflow-reviewer.md b/commands/workflow-reviewer.md index 1e9b0fd..5341061 100644 --- a/commands/workflow-reviewer.md +++ b/commands/workflow-reviewer.md @@ -47,6 +47,19 @@ For each entry in `worker.tasks_processed`, run this checklist independently: - Evidence binding: verify command evidence records `cmd`, `cwd`, command class, timeout, exit code, start time, duration, tree hash, stdout/stderr refs, hashes, and redaction status. - Patch isolation: verify each completed task has `patch_ref`, `patch_hash`, `pre_tree_hash`, `post_tree_hash`, `name_status_ref`, and `applies_cleanly: true` from `agent-surface workflow patch verify`. +## CROSS-DOMAIN QA CHECKLIST (completed batch) + +After the per-task checklist, review the completed batch as a codebase change, not just as a set of AC boxes: + +- Security and privacy: validate auth/authz, injection, path traversal, unsafe shell/process use, secret exposure, sensitive logs, unsafe defaults, and evidence redaction when touched. +- Documentation: confirm README, adapter docs, command help, schemas, examples, and operational notes match changed behavior. Missing docs for public behavior, config, CLI, security posture, or workflow semantics are at least `major`. +- Dependencies: confirm dependency additions/updates were approved, justified, reflected in lockfiles, and covered by vulnerability/license expectations. Phantom deps or unreviewed lockfile churn are `major` or `blocker`. +- Tests and gates: verify tests cover meaningful behavior and failure paths, not just snapshots or mock choreography. Deleted, weakened, skipped, or sabotaged gates are `blocker`. +- Config, CI, and deployment: inspect workflow YAML, package scripts, generated manifests, install paths, permissions, and release packaging when touched. +- Compatibility: check public CLI/API/config contracts, generated target paths, migration behavior, backward compatibility, and stale artifact cleanup. 
+- Maintainability: flag unnecessary complexity, duplicated policy, broad refactors hidden inside feature work, TODO placeholders, and scope creep outside the BOSS filescope. +- Observability and operations: when behavior affects installs, automation, security, or deployments, confirm errors, logs, dry-runs, backups, and recovery paths remain understandable. + After per-task review, also check **batch-level invariants**: - Worker handoff integrity: `worker.workflow.run_id` matches `boss.workflow.run_id`. Mismatch = stale worker = REJECT entire batch. diff --git a/scripts/agent-surface.mjs b/scripts/agent-surface.mjs index 22296b9..8bbb709 100755 --- a/scripts/agent-surface.mjs +++ b/scripts/agent-surface.mjs @@ -1974,7 +1974,7 @@ function parseCommand(file, text) { risk: commandRiskFromName(name), packs: ["default"], default_export: true, - approval_classes: [], + approval_classes: commandApprovalClassesFromName(name), description: null, }; const frontmatterErrors = []; @@ -2533,6 +2533,11 @@ function commandRiskFromName(name) { return "safe"; } +function commandApprovalClassesFromName(name) { + if (name === "workflow-orchestrator") return ["network"]; + return []; +} + async function directories(base) { if (!(await exists(base))) return []; diff --git a/tests/agent-surface.test.mjs b/tests/agent-surface.test.mjs index 951f2b0..08350da 100644 --- a/tests/agent-surface.test.mjs +++ b/tests/agent-surface.test.mjs @@ -67,12 +67,12 @@ assert.match(run(["check", "commands"]), /commands check: ok/); const inventory = run(["inventory"]); assert.match(inventory, /^rules: 11$/m); -assert.match(inventory, /^commands: 61$/m); +assert.match(inventory, /^commands: 62$/m); assert.match(inventory, /^schemas: 11$/m); const defaultRegistry = JSON.parse(run(["commands", "--json"])); assert.equal(defaultRegistry.pack, "default"); -assert.equal(defaultRegistry.count, 59); +assert.equal(defaultRegistry.count, 60); assert.equal(defaultRegistry.commands.some((command) => command.name === 
"boot-facade"), false); const flowCommand = defaultRegistry.commands.find((command) => command.name === "flow"); assert.ok(flowCommand); @@ -95,6 +95,11 @@ assert.equal(qaTraceCommand.risk, "security-sensitive"); const workflowDoctorCommand = defaultRegistry.commands.find((command) => command.name === "workflow-doctor"); assert.ok(workflowDoctorCommand); assert.equal(workflowDoctorCommand.risk, "safe"); +const workflowOrchestratorCommand = defaultRegistry.commands.find((command) => command.name === "workflow-orchestrator"); +assert.ok(workflowOrchestratorCommand); +assert.equal(workflowOrchestratorCommand.risk, "writes"); +assert.deepEqual(workflowOrchestratorCommand.approval_classes, ["network"]); +assert.equal(workflowOrchestratorCommand.metadata_source, "inferred"); const shipCommands = JSON.parse(run(["commands", "--phase", "ship", "--json"])); assert.equal(shipCommands.filters.phase, "ship"); assert.equal(shipCommands.commands.every((command) => command.phase === "ship"), true); @@ -102,10 +107,10 @@ const writeCommands = JSON.parse(run(["commands", "--risk", "writes", "--json"]) assert.equal(writeCommands.filters.risk, "writes"); assert.equal(writeCommands.commands.some((command) => command.name === "dev-feature"), true); const allRegistry = JSON.parse(run(["commands", "--pack", "all", "--json"])); -assert.equal(allRegistry.count, 61); +assert.equal(allRegistry.count, 62); assert.equal(allRegistry.commands.some((command) => command.name === "boot-facade"), true); const destructiveRegistry = JSON.parse(run(["commands", "--pack", "destructive", "--json"])); -assert.equal(destructiveRegistry.count, 60); +assert.equal(destructiveRegistry.count, 61); assert.equal(destructiveRegistry.commands.some((command) => command.name === "ops-nuke"), true); assert.equal(destructiveRegistry.commands.some((command) => command.name === "boot-facade"), false); @@ -140,14 +145,26 @@ for (const scenario of ["python-source", "python-tooling", "rust-source", "go-ci run(["build", 
"--target", "all"]); const generated = files(path.join(root, "dist")); -assert.equal(generated.length, 625); +assert.equal(generated.length, 635); assertGeminiTomlParses(); assert.equal(generated.some((file) => file.endsWith(path.join("dist", "claude-code", ".claude", "commands", "flow", "flow.md"))), true); +assert.equal( + generated.some((file) => file.endsWith(path.join("dist", "claude-code", ".claude", "commands", "workflow", "orchestrator.md"))), + true, +); assert.equal(generated.some((file) => file.endsWith(path.join("dist", "claude-code", ".claude", "commands", "boot", "facade.md"))), false); assert.equal(generated.some((file) => file.endsWith(path.join("dist", "codex", ".agents", "skills", "flow", "SKILL.md"))), true); +assert.equal( + generated.some((file) => file.endsWith(path.join("dist", "codex", ".agents", "skills", "workflow-orchestrator", "SKILL.md"))), + true, +); assert.equal(generated.some((file) => file.endsWith(path.join("dist", "codex", ".agents", "skills", "flow", "agents", "openai.yaml"))), true); assert.equal(generated.some((file) => file.endsWith(path.join("dist", "codex", ".codex", "AGENTS.md"))), true); assert.equal(generated.some((file) => file.endsWith(path.join("dist", "codex", ".agents", "skills", "boot-facade", "SKILL.md"))), false); +assert.equal( + generated.some((file) => file.endsWith(path.join("dist", "gemini-cli", ".gemini", "commands", "workflow", "orchestrator.toml"))), + true, +); assert.equal(generated.some((file) => file.endsWith(path.join("dist", "gemini-cli", ".gemini", "commands", "boot", "facade.toml"))), false); assert.equal(generated.some((file) => file.endsWith(path.join("dist", "gemini-cli", ".gemini", "commands", "ops", "nuke.toml"))), false); assert.equal(generated.some((file) => file.endsWith(path.join("dist", "gemini-cli", ".gemini", "GEMINI.md"))), true); @@ -164,15 +181,15 @@ assert.equal(generated.some((file) => file.endsWith(path.join("dist", "vscode", assert.equal(generated.some((file) => 
file.endsWith(path.join("dist", "opencode", ".config", "opencode", "AGENTS.md"))), true); assert.equal(generated.some((file) => file.endsWith(path.join("dist", "trae", ".trae", "user_rules.md"))), true); const generatedCheck = run(["check", "generated"]); -assert.match(generatedCheck, /claude-code: generated outputs 120 ok/); -assert.match(generatedCheck, /kilo: generated outputs 71 ok/); +assert.match(generatedCheck, /claude-code: generated outputs 122 ok/); +assert.match(generatedCheck, /kilo: generated outputs 72 ok/); assert.match(generatedCheck, /copilot: generated outputs 1 ok/); assert.match(generatedCheck, /generated check: ok/); const copilotGeneratedCheck = run(["check", "generated", "--target", "copilot"]); assert.match(copilotGeneratedCheck, /copilot: generated outputs 1 ok/); const allPackBuild = run(["build", "--target", "gemini-cli", "--pack", "all"]); -assert.match(allPackBuild, /gemini-cli: 125 outputs rendered \(pack: all\)/); +assert.match(allPackBuild, /gemini-cli: 127 outputs rendered \(pack: all\)/); assertGeminiTomlParses(); const facadeToml = readFileSync(path.join(root, "dist", "gemini-cli", ".gemini", "commands", "boot", "facade.toml"), "utf8"); const nukeToml = readFileSync(path.join(root, "dist", "gemini-cli", ".gemini", "commands", "ops", "nuke.toml"), "utf8"); @@ -201,6 +218,7 @@ assert.match(clinePlan, /^target: cline$/m); assert.match(clinePlan, /^pack: default$/m); assert.match(clinePlan, /^root source: explicit --dest$/m); assert.match(clinePlan, /\.clinerules\/workflows\/workflow-boss\.md <- commands\/workflow-boss\.md/); +assert.match(clinePlan, /\.clinerules\/workflows\/workflow-orchestrator\.md <- commands\/workflow-orchestrator\.md/); assert.match(clinePlan, /\.clinerules\/agent-surface\.md <- rules\/\*\.mdc/); assert.match(clinePlan, /\.agent-surface\/cline-manifest\.json/); @@ -258,30 +276,30 @@ const liveDest = "/tmp/agent-surface-live"; rmSync(liveDest, { recursive: true, force: true }); const liveInstall = run(["install", 
"--target", "cline", "--dest", liveDest]); assert.match(liveInstall, /^installed:$/m); -assert.match(liveInstall, /wrote: 60/); +assert.match(liveInstall, /wrote: 61/); assert.match(readFileSync(path.join(liveDest, ".clinerules", "workflows", "workflow-boss.md"), "utf8"), /^## OBJECTIVE/); const liveManifest = JSON.parse(readFileSync(path.join(liveDest, ".agent-surface", "cline-manifest.json"), "utf8")); assert.equal(liveManifest.pack, "default"); -assert.equal(liveManifest.managed.length, 60); +assert.equal(liveManifest.managed.length, 61); assert.equal(liveManifest.managed[0].managed_by, "agent-surface"); rmSync(liveDest, { recursive: true, force: true }); const claudeLiveDest = "/tmp/agent-surface-claude-live"; rmSync(claudeLiveDest, { recursive: true, force: true }); const claudeLiveInstall = run(["install", "--target", "claude-code", "--dest", claudeLiveDest]); -assert.match(claudeLiveInstall, /wrote: 120/); +assert.match(claudeLiveInstall, /wrote: 122/); assert.match(readFileSync(path.join(claudeLiveDest, ".claude", "commands", "workflow", "boss.md"), "utf8"), /^## OBJECTIVE/); const claudeLiveManifest = JSON.parse(readFileSync(path.join(claudeLiveDest, ".agent-surface", "claude-code-manifest.json"), "utf8")); -assert.equal(claudeLiveManifest.managed.length, 120); +assert.equal(claudeLiveManifest.managed.length, 122); rmSync(claudeLiveDest, { recursive: true, force: true }); const codexLiveDest = "/tmp/agent-surface-codex-live"; rmSync(codexLiveDest, { recursive: true, force: true }); const codexLiveInstall = run(["install", "--target", "codex", "--dest", codexLiveDest]); -assert.match(codexLiveInstall, /wrote: 119/); +assert.match(codexLiveInstall, /wrote: 121/); assert.match(readFileSync(path.join(codexLiveDest, ".agents", "skills", "workflow-boss", "SKILL.md"), "utf8"), /^---\nname: workflow-boss\n/); const codexLiveManifest = JSON.parse(readFileSync(path.join(codexLiveDest, ".agent-surface", "codex-manifest.json"), "utf8")); 
-assert.equal(codexLiveManifest.managed.length, 119); +assert.equal(codexLiveManifest.managed.length, 121); rmSync(codexLiveDest, { recursive: true, force: true }); const unmanagedDest = "/tmp/agent-surface-unmanaged"; @@ -505,17 +523,22 @@ const globalProjectScope = status(["install", "--target", "cursor", "--scope", " assert.notEqual(globalProjectScope.status, 0); assert.match(globalProjectScope.stderr, /supports --scope user only/); -const clineUserScope = status(["install", "--target", "cline", "--scope", "user", "--dry-run"]); +const userScopeHome = "/tmp/agent-surface-user-scope-home"; +rmSync(userScopeHome, { recursive: true, force: true }); +mkdirSync(userScopeHome, { recursive: true }); +const userScopeEnv = { ...process.env, HOME: userScopeHome }; +const clineUserScope = status(["install", "--target", "cline", "--scope", "user", "--dry-run"], { env: userScopeEnv }); assert.equal(clineUserScope.status, 0, `${clineUserScope.stdout}${clineUserScope.stderr}`); assert.match(clineUserScope.stdout, /Documents\/Cline\/Workflows\/workflow-boss\.md <- commands\/workflow-boss\.md/); -const kiloUserScope = status(["install", "--target", "kilo", "--scope", "user", "--dry-run"]); +const kiloUserScope = status(["install", "--target", "kilo", "--scope", "user", "--dry-run"], { env: userScopeEnv }); assert.equal(kiloUserScope.status, 0, `${kiloUserScope.stdout}${kiloUserScope.stderr}`); assert.match(kiloUserScope.stdout, /\.config\/kilo\/commands\/workflow-boss\.md <- commands\/workflow-boss\.md/); assert.match(kiloUserScope.stdout, /\.config\/kilo\/AGENTS\.md <- rules\/\*\.mdc/); assert.match(kiloUserScope.stdout, /\.config\/kilo\/rules\/00-precedence-and-safety\.md <- rules\/00-precedence-and-safety\.mdc/); assert.match(kiloUserScope.stdout, /\.config\/kilo\/rules\/14-lang-shell\.md <- rules\/14-lang-shell\.mdc/); assert.match(kiloUserScope.stdout, /\.config\/kilo\/kilo\.jsonc instructions \+= \.\/rules\/00-precedence-and-safety\.md, .*\.\/rules\/14-lang-shell\.md/); 
+rmSync(userScopeHome, { recursive: true, force: true }); const invalidKiloDest = "/tmp/agent-surface-kilo-invalid"; rmSync(invalidKiloDest, { recursive: true, force: true });