diff --git a/README.md b/README.md index 209e9ec..a2bb869 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,19 @@ It is built for useful, bounded work: data analysis, document generation, API wo > > **Platform note:** HyperAgent requires hardware virtualization: Linux with KVM, Azure Linux with MSHV, Windows with WHP, or WSL2 with KVM. It does not currently run on macOS [because of this Hyperlight issue](https://github.com/hyperlight-dev/hyperlight/issues/45). +## Quick Install + +```bash +# Authenticate with GitHub (Copilot access required) +gh auth login + +# Install and run +npm install -g @hyperlight-dev/hyperagent +hyperagent +``` + +Requires Node.js 22+ and hardware virtualization (Linux/KVM, Windows/WHP, Azure Linux/MSHV, or WSL2/KVM). For Docker, building from source, and full prerequisites, see [Install and Run](#install-and-run) below. + ## Why HyperAgent? Most agent CLIs are powerful because they can touch your machine directly: shell commands, file edits, network calls, local tools, credentials, and long-lived process state. That is useful, but it also means a bad instruction, hallucinated command, or prompt-injected webpage can become real host activity very quickly. @@ -121,14 +134,14 @@ User prompt The sandbox has no direct filesystem, network, shell, or process access. Capabilities are added deliberately: -| Capability | How it is exposed | -| ---------------- | ---------------------------------------------------------------- | -| Files | `fs-read` and `fs-write` plugins with path jails | -| HTTP | `fetch` plugin with domain allowlists and SSRF checks | +| Capability | How it is exposed | +| ---------------- | ------------------------------------------------------------------------- | +| Files | `fs-read` and `fs-write` plugins with path jails | +| HTTP | `fetch` plugin with domain allowlists and SSRF checks | | Bash commands | `execute_bash` — sandboxed pure-JS interpreter (ls, grep, jq, curl, etc.) | -| Reusable code | `ha:*` system and user modules | -| External systems | MCP servers exposed as typed `host:mcp-*` modules | -| Bigger jobs | Profiles that raise limits; profile tools can enable plugin sets | +| Reusable code | `ha:*` system and user modules | +| External systems | MCP servers exposed as typed `host:mcp-*` modules | +| Bigger jobs | Profiles that raise limits; profile tools can enable plugin sets | ## Built-In Modules diff --git a/src/agent/approach-resolver.ts b/src/agent/approach-resolver.ts index 1023053..9d8f37e 100644 --- a/src/agent/approach-resolver.ts +++ b/src/agent/approach-resolver.ts @@ -13,10 +13,23 @@ import { loadPatterns } from "./pattern-loader.js"; import { loadModule, type ModuleHints } from "./module-store.js"; // ── MCP server name → CLI setup command mapping ────────────────────── -// Maps an MCP server name (as declared in skills) to the CLI flag that -// configures it. Used by formatGuidance() to show actionable hints. +// Maps an MCP server name (the key written into ~/.hyperagent/config.json +// under `mcpServers`) to the CLI flag that configures it. When a skill +// requires a server that is NOT yet configured, formatGuidance() uses +// this table to recommend the specific shortcut to the LLM (and through +// it, to the user). Servers absent from this table fall back to the +// generic "edit ~/.hyperagent/config.json" guidance — we deliberately +// don't suggest a `--mcp-setup-${name}` flag for unsupported names +// because that flag does NOT exist and would set the user up for failure. +// +// Keep in sync with the setup functions in src/agent/mcp/setup-commands.ts +// and the CLI cases in src/agent/cli-parser.ts. const MCP_SETUP_COMMANDS: Record = { "fabric-rti-mcp": "--mcp-setup-fabric-rti", + everything: "--mcp-setup-everything", + github: "--mcp-setup-github", + filesystem: "--mcp-setup-filesystem", + workiq: "--mcp-setup-workiq", }; // ── Types ──────────────────────────────────────────────────────────── @@ -253,7 +266,42 @@ const GENERIC_GUIDANCE: MaterialisedGuidance = { export function formatGuidance(guidance: MaterialisedGuidance): string { const parts: string[] = ["--- TASK GUIDANCE ---"]; - // Anti-patterns and rules go FIRST — the LLM is most likely to follow + // ── Unconfigured MCP servers go FIRST as a blocker ────────────── + // A missing prerequisite is the single most important thing the LLM + // needs to surface to the user. Buried under "Rules:" or "Modules:" + // it gets ignored — the model has been observed to skip the hint and + // hallucinate config.json snippets or non-existent CLI flags instead. + // Promoting this block to the top, with strong "TELL THE USER" wording, + // measurably improves the user-visible recommendation quality. + const missingMcp = guidance.mcpStatus.filter((s) => !s.configured); + if (missingMcp.length > 0) { + parts.push("🛑 MISSING PREREQUISITES — tell the user how to fix this:"); + for (const s of missingMcp) { + const setupFlag = MCP_SETUP_COMMANDS[s.name]; + if (setupFlag) { + // Explicitly supported — recommend the specific shortcut. + parts.push( + ` ❌ MCP server "${s.name}" is required but not configured.`, + ` SHORTCUT: have the user run \`hyperagent ${setupFlag}\` ` + + `(exits after writing config, then restart hyperagent).`, + ); + } else { + // Not in the supported-shortcuts table — give honest generic + // guidance. DO NOT suggest a `--mcp-setup-${name}` flag here; + // that flag does not exist and the user will hit "unknown + // option" if you say it does. + parts.push( + ` ❌ MCP server "${s.name}" is required but not configured.`, + ` This server has no built-in setup shortcut. Tell the ` + + `user to add it to \`~/.hyperagent/config.json\` under ` + + `\`mcpServers\` (see https://modelcontextprotocol.io for ` + + `the entry shape), then restart hyperagent.`, + ); + } + } + } + + // Anti-patterns and rules — the LLM is most likely to follow // instructions at the top of the context injection. if (guidance.antiPatterns.length > 0) { parts.push("⚠️ DO NOT:"); @@ -278,15 +326,13 @@ export function formatGuidance(guidance: MaterialisedGuidance): string { `Plugins: ${guidance.plugins.join(", ")} — enable via manage_plugin or apply_profile`, ); } - if (guidance.mcpStatus.length > 0) { + // MCP server status block — only shows CONFIGURED servers here. + // Unconfigured ones are surfaced above as missing prerequisites. + const configuredMcp = guidance.mcpStatus.filter((s) => s.configured); + if (configuredMcp.length > 0) { parts.push("MCP Servers:"); - for (const s of guidance.mcpStatus) { - if (!s.configured) { - const setupFlag = MCP_SETUP_COMMANDS[s.name] ?? `--mcp-setup-${s.name}`; - parts.push( - ` ❌ ${s.name} — not configured. Run: hyperagent ${setupFlag}`, - ); - } else if (s.state === "connected") { + for (const s of configuredMcp) { + if (s.state === "connected") { parts.push( ` ✅ ${s.name} — connected (${s.toolCount ?? 0} tools). Import from host:mcp-${s.name}`, ); diff --git a/src/agent/index.ts b/src/agent/index.ts index 68291ef..af5bf6b 100644 --- a/src/agent/index.ts +++ b/src/agent/index.ts @@ -110,6 +110,8 @@ import { getUserSkillsDir, writeUserSkill, userSkillExists, + systemSkillExists, + validateSkillName, type SkillData, } from "./skill-writer.js"; import { validatePath } from "../../plugins/shared/path-jail.js"; @@ -709,8 +711,13 @@ const MAX_TOOL_HISTORY = 200; /** * Create the sandbox tool instance. Configuration is resolved from * environment variables (see shared/sandbox-tool.js for details). + * + * `debugLog` routes the sandbox's verbose lifecycle traces (the noisy + * `[sandbox] setPlugins`, `invalidateSandboxWithSave`, etc. lines) into + * `~/.hyperagent/logs/agent-debug-*.log` when --debug is on, keeping + * them OFF the user-facing terminal. */ -const sandbox = createSandboxTool(); +const sandbox = createSandboxTool({ debugLog }); /** * Session transcript recorder. Created at module level so the SIGINT @@ -2579,6 +2586,7 @@ async function getBashSandbox() { outputBufferKb: 2048, cpuTimeoutMs: 10000, wallClockTimeoutMs: 15000, + debugLog, }); // Load ONLY the bash module — no pptx/pdf/xlsx @@ -3950,6 +3958,129 @@ const applyProfileTool = defineTool("apply_profile", { }, }); +// ── Profile preview rendering ───────────────────────────────────────── +// +// `applyProfileImpl` shows a preview of which limits will change and +// which plugins will be enabled before prompting the user for approval. +// We render this preview in one of two formats: +// +// 1. Plain indented console block — the default. Works on any +// terminal, no markdown dependency, easy to grep in transcripts. +// +// 2. Markdown table via `renderMarkdown` — only when the user has +// opted in via `/markdown on`. marked-terminal renders the table +// with aligned columns and bold headers, which is much easier to +// scan than a flat list of `cpu: 1000ms → 2000ms` lines. +// +// The data is collected up-front into `LimitChange` records so the two +// renderers share a single source of truth; if you add a new limit to +// the schema, add it once to the `applyProfileImpl` collection block +// and both renderers pick it up automatically. + +/** A single before/after row in the profile-apply preview. */ +type LimitChange = { + /** Short display name shown in the leftmost column ("cpu", "heap", …). */ + readonly name: string; + /** Current value rendered with its unit (e.g. "1000ms", "16MB"). */ + readonly before: string; + /** Target value rendered with its unit (e.g. "2000ms", "32MB"). */ + readonly after: string; + /** True if the target value actually exceeds the current value. */ + readonly willChange: boolean; +}; + +/** + * Build a `LimitChange` row, annotating no-op rows with "(no change)" + * in the After column so the table stays rectangular without forcing + * the caller to assemble a sentinel string. + */ +function buildLimitChange( + name: string, + before: string, + after: string, + willChange: boolean, +): LimitChange { + return { + name, + before, + after: willChange ? after : `${after} (no change)`, + willChange, + }; +} + +/** + * Render the profile-apply preview to the terminal. + * + * Routes to the markdown-table renderer when `state.markdownEnabled` is + * true; otherwise emits the legacy indented plain-text block. Always + * starts with a leading newline so the preview is visually separated + * from whatever spinner / tool output came before. + */ +function renderProfilePreview( + profileLabel: string, + limitChanges: readonly LimitChange[], + pluginNames: readonly string[], +): void { + if (state.markdownEnabled) { + renderProfilePreviewMarkdown(profileLabel, limitChanges, pluginNames); + return; + } + renderProfilePreviewPlain(profileLabel, limitChanges, pluginNames); +} + +/** Legacy plain-text renderer — preserved verbatim for non-markdown sessions. */ +function renderProfilePreviewPlain( + profileLabel: string, + limitChanges: readonly LimitChange[], + pluginNames: readonly string[], +): void { + console.log(`\n ${C.warn("📋 Profile:")} ${C.tool(profileLabel)}`); + if (limitChanges.length > 0) { + console.log(` Limits:`); + for (const c of limitChanges) { + console.log(` ${c.name}: ${c.before} → ${c.after}`); + } + } + if (pluginNames.length > 0) { + console.log(` Plugins: ${pluginNames.join(", ")}`); + } else { + console.log(` Plugins: none`); + } +} + +/** + * Markdown-table renderer. Produces a GitHub-flavoured markdown table + * that marked-terminal converts to a unicode box-drawing table with + * bold/coloured headers. Falls back to the plain renderer if there + * are no limit changes to show (a one-row table looks awkward and the + * plain "Plugins:" line is fine on its own). + */ +function renderProfilePreviewMarkdown( + profileLabel: string, + limitChanges: readonly LimitChange[], + pluginNames: readonly string[], +): void { + if (limitChanges.length === 0) { + renderProfilePreviewPlain(profileLabel, limitChanges, pluginNames); + return; + } + const lines: string[] = []; + lines.push(`📋 **Profile:** \`${profileLabel}\``); + lines.push(""); + lines.push("| Limit | Before | After |"); + lines.push("|---|---|---|"); + for (const c of limitChanges) { + lines.push(`| ${c.name} | ${c.before} | ${c.after} |`); + } + lines.push(""); + lines.push( + pluginNames.length > 0 + ? `**Plugins:** ${pluginNames.join(", ")}` + : `**Plugins:** none`, + ); + process.stdout.write("\n" + renderMarkdown(lines.join("\n")) + "\n"); +} + /** Internal implementation for apply_profile — called under lock. */ async function applyProfileImpl( names: string[], @@ -3972,60 +4103,70 @@ async function applyProfileImpl( // Get current config for comparison const currentConfig = getEffectiveConfig(sandbox, state); - // Build a summary showing profile values vs current values - const limitChanges: string[] = []; + // Build a structured summary of profile values vs current values so + // we can render the preview either as a plain console block (default) + // or as a markdown table when the user has opted in to markdown via + // `/markdown on`. Keeping the data structured (instead of pre-joined + // strings) is what makes the table renderer possible. + const limitChanges: LimitChange[] = []; if (merged.limits.cpuTimeoutMs !== undefined) { - const current = currentConfig.cpuTimeoutMs; - const willChange = merged.limits.cpuTimeoutMs > current; limitChanges.push( - willChange - ? `cpu: ${current}ms → ${merged.limits.cpuTimeoutMs}ms` - : `cpu: ${merged.limits.cpuTimeoutMs}ms (already ≥)`, + buildLimitChange( + "cpu", + `${currentConfig.cpuTimeoutMs}ms`, + `${merged.limits.cpuTimeoutMs}ms`, + merged.limits.cpuTimeoutMs > currentConfig.cpuTimeoutMs, + ), ); } if (merged.limits.wallTimeoutMs !== undefined) { - const current = currentConfig.wallTimeoutMs; - const willChange = merged.limits.wallTimeoutMs > current; limitChanges.push( - willChange - ? `wall: ${current}ms → ${merged.limits.wallTimeoutMs}ms` - : `wall: ${merged.limits.wallTimeoutMs}ms (already ≥)`, + buildLimitChange( + "wall", + `${currentConfig.wallTimeoutMs}ms`, + `${merged.limits.wallTimeoutMs}ms`, + merged.limits.wallTimeoutMs > currentConfig.wallTimeoutMs, + ), ); } if (merged.limits.heapMb !== undefined) { - const current = currentConfig.heapMb; - const willChange = merged.limits.heapMb > current; limitChanges.push( - willChange - ? `heap: ${current}MB → ${merged.limits.heapMb}MB` - : `heap: ${merged.limits.heapMb}MB (already ≥)`, + buildLimitChange( + "heap", + `${currentConfig.heapMb}MB`, + `${merged.limits.heapMb}MB`, + merged.limits.heapMb > currentConfig.heapMb, + ), ); } if (merged.limits.scratchMb !== undefined) { - const current = currentConfig.scratchMb; - const willChange = merged.limits.scratchMb > current; limitChanges.push( - willChange - ? `scratch: ${current}MB → ${merged.limits.scratchMb}MB` - : `scratch: ${merged.limits.scratchMb}MB (already ≥)`, + buildLimitChange( + "scratch", + `${currentConfig.scratchMb}MB`, + `${merged.limits.scratchMb}MB`, + merged.limits.scratchMb > currentConfig.scratchMb, + ), ); } if (merged.limits.inputBufferKb !== undefined) { - const current = currentConfig.inputBufferKb; - const willChange = merged.limits.inputBufferKb > current; limitChanges.push( - willChange - ? `input: ${current}KB → ${merged.limits.inputBufferKb}KB` - : `input: ${merged.limits.inputBufferKb}KB (already ≥)`, + buildLimitChange( + "input", + `${currentConfig.inputBufferKb}KB`, + `${merged.limits.inputBufferKb}KB`, + merged.limits.inputBufferKb > currentConfig.inputBufferKb, + ), ); } if (merged.limits.outputBufferKb !== undefined) { - const current = currentConfig.outputBufferKb; - const willChange = merged.limits.outputBufferKb > current; limitChanges.push( - willChange - ? `output: ${current}KB → ${merged.limits.outputBufferKb}KB` - : `output: ${merged.limits.outputBufferKb}KB (already ≥)`, + buildLimitChange( + "output", + `${currentConfig.outputBufferKb}KB`, + `${merged.limits.outputBufferKb}KB`, + merged.limits.outputBufferKb > currentConfig.outputBufferKb, + ), ); } @@ -4040,18 +4181,7 @@ async function applyProfileImpl( ? merged.appliedProfiles[0] : merged.appliedProfiles.join(" + "); - console.log(`\n ${C.warn("📋 Profile:")} ${C.tool(profileLabel)}`); - if (limitChanges.length > 0) { - console.log(` Limits:`); - for (const change of limitChanges) { - console.log(` ${change}`); - } - } - if (pluginNames.length > 0) { - console.log(` Plugins: ${pluginNames.join(", ")}`); - } else { - console.log(` Plugins: none`); - } + renderProfilePreview(profileLabel, limitChanges, pluginNames); await drainAndWarn(rl); const approval = state.autoApprove @@ -5552,13 +5682,35 @@ const generateSkillTool = defineTool("generate_skill", { return { success: false, error: "No readline available" }; } - // Refuse silent overwrite — the LLM must opt in explicitly. - if (!params.overwrite && userSkillExists(params.name)) { + // Detect collisions with BOTH user skills and the bundled + // built-in skills under /skills/. + // + // Without this check the user-skill loader would silently + // shadow the curated built-in (see Bug 2: kql-expert) — the + // LLM hallucinates a half-formed replacement and the carefully + // authored bundled skill becomes unreachable. Require an + // explicit overwrite=true to make the destructive intent visible. + const systemSkillsDir = join(CONTENT_ROOT, "skills"); + const userExists = userSkillExists(params.name); + const systemExists = systemSkillExists(params.name, systemSkillsDir); + + if (!params.overwrite && userExists) { return { success: false, error: `Skill "${params.name}" already exists. Set overwrite=true to replace it.`, }; } + if (!params.overwrite && systemExists) { + return { + success: false, + error: + `Skill "${params.name}" is a built-in system skill. ` + + `Saving a user skill with this name would SHADOW the curated ` + + `built-in for this user only — almost always not what you want. ` + + `Pick a different name (e.g. "${params.name}-custom"), or set ` + + `overwrite=true to deliberately shadow the built-in.`, + }; + } // Build the typed payload — strip undefined fields so the YAML stays clean. const skillData: SkillData = { @@ -5588,9 +5740,26 @@ const generateSkillTool = defineTool("generate_skill", { // Surface the overwrite path explicitly — the LLM passing // `overwrite=true` is necessary but not sufficient. The user // gets a chance to refuse before we replace existing content. - const isOverwrite = - params.overwrite === true && userSkillExists(params.name); - if (isOverwrite) { + // Shadowing a built-in is louder because it's almost always + // a regrettable action (the curated skill becomes unreachable + // for this user only — see Bug 2). + const isOverwrite = params.overwrite === true && userExists; + const isShadowingBuiltin = + params.overwrite === true && systemExists && !userExists; + if (isShadowingBuiltin) { + console.log( + `\n ${C.err("⚠️ SHADOW built-in skill:")} ${C.tool(params.name)}`, + ); + console.log( + ` ${C.warn("This will OVERRIDE the curated built-in skill for THIS USER only.")}`, + ); + console.log( + ` ${C.warn("The bundled version stays on disk but stops loading.")}`, + ); + console.log( + ` ${C.warn("Consider using a different name unless you really mean to override.")}`, + ); + } else if (isOverwrite) { console.log( `\n ${C.warn("⚠️ Overwrite existing user skill:")} ${C.tool(params.name)}`, ); @@ -5611,9 +5780,11 @@ const generateSkillTool = defineTool("generate_skill", { } await drainAndWarn(rl); - const promptLabel = isOverwrite - ? ` ${C.dim("Overwrite skill? [y/n] ")}` - : ` ${C.dim("Save skill? [y/n] ")}`; + const promptLabel = isShadowingBuiltin + ? ` ${C.dim("Shadow built-in skill? [y/n] ")}` + : isOverwrite + ? ` ${C.dim("Overwrite skill? [y/n] ")}` + : ` ${C.dim("Save skill? [y/n] ")}`; const approval = state.autoApprove ? "y" : await promptUser(rl, promptLabel); @@ -5673,6 +5844,27 @@ const generateSkillTool = defineTool("generate_skill", { console.error( ` ${C.ok("📚 Skill saved:")} ${params.name} → ${skillPath}`, ); + + // Hot-reload the SDK's skill registry so the freshly-written + // skill is invocable on the very next turn — without this the + // SDK's `ensureSkillsLoaded()` cache stays stale until process + // restart and `/` lands as plain text, with the model + // improvising instead of using the SKILL.md body. Best-effort: + // a reload failure is logged but does NOT roll back the save — + // the file is on disk and `/skills reload` is one keystroke + // away as a manual fallback. + if (state.activeSession) { + try { + await state.activeSession.rpc.skills.reload(); + } catch (err: unknown) { + const msg = err instanceof Error ? err.message : String(err); + console.error( + ` ${C.warn("⚠️")} Skill saved but live reload failed (${msg}). ` + + `Run ${C.tool("/skills reload")} to make it invocable.`, + ); + } + } + return { success: true, skill: { @@ -5882,6 +6074,14 @@ function buildSessionConfig() { "mcp_server_info", "mcp_tool_info", "manage_mcp", + // SDK built-in "skill" tool — required so / actually + // injects the matching SKILL.md body into the conversation. The + // SDK loads skills from `skillDirectories` above, but the model + // can't invoke them unless the tool itself is whitelisted here. + // Without this, /kql-expert lands as raw text and the model + // improvises a generic greeting instead of using the curated + // guidance. + "skill", // Conditionally expose tuning tool ...(state.tuneEnabled ? ["llm_thought"] : []), ], @@ -6365,22 +6565,20 @@ async function processMessage( }; if (state.markdownEnabled && state.streamedText) { - // Markdown mode: output was buffered (not streamed). Render now - // if it looks like markdown; otherwise print as-is to avoid - // mangling plain prose with ANSI escapes. - let output: string; - if (looksLikeMarkdown(state.streamedText)) { - output = renderMarkdown(state.streamedText); - } else { - output = state.streamedText; - } - // Replace [[file:path]] markers before printing so ANSI codes + // Markdown mode: output was buffered (not streamed). The user has + // explicitly opted in via the default-on flag or `/markdown on` — + // render unconditionally. Earlier versions gated on looksLikeMarkdown() + // to avoid "mangling plain prose", but that produced inconsistent + // output (some turns rendered, others raw) and surprised users who + // had opted in. marked-terminal handles plain prose gracefully. + let output = renderMarkdown(state.streamedText); + // Replace [[file:path]] markers AFTER rendering so ANSI codes // from renderMarkdown don't split the markers. output = linkifyFiles(output, fsWriteBase, trackFile); console.log(output); } else if (!state.streamedContent && content) { - // Non-streamed fallback (rare) — render through markdown if enabled - if (state.markdownEnabled && looksLikeMarkdown(content)) { + // Non-streamed fallback (rare) — same opt-in logic as above. + if (state.markdownEnabled) { let rendered = renderMarkdown(content); rendered = linkifyFiles(rendered, fsWriteBase, trackFile); console.log(rendered); @@ -6651,7 +6849,9 @@ async function main(): Promise { if (state.markdownEnabled) { const mdRows = configRows.map(([k, v]) => `| ${k} | ${v} |`).join("\n"); const table = `| Setting | Value |\n|---------|-------|\n${mdRows}`; - console.log(` **Configuration:**`); + // Use ANSI bold directly — console.log doesn't pass through the + // markdown renderer, so `**foo**` would print raw asterisks. + console.log(` ${bold}Configuration:${reset}`); console.log(renderMarkdown(table)); } else { console.log(` ${bold}Configuration:${reset}`); @@ -6963,6 +7163,33 @@ async function main(): Promise { // suggestion acceptance is stale. if (trimmed.startsWith("/")) { pendingContinuation = null; + // Bug 3: rewrite "/skills " → "/" so the Copilot + // SDK's skill-invocation grammar matches. Previously the raw + // "/skills " string was forwarded; the SDK couldn't + // parse it as a skill invocation and the LLM interpreted it + // as natural language — sometimes calling generate_skill to + // SAVE a same-named skill, shadowing the built-in. + // + // We gate the rewrite on `validateSkillName(token) === null` so: + // • Subcommands (info|edit|delete|list|reload) are left alone + // because they live in RESERVED_SKILL_NAMES — the canonical + // list used by validateSkillName. Adding a new subcommand + // only requires updating RESERVED_SKILL_NAMES, not a parallel + // hardcoded set here that could drift (PR #151 review found + // `/skills reload` was getting rewritten to `/reload` before + // this fix). + // • Path-traversal-shaped tokens ("../etc", "a/b") are also + // rejected at the rewrite step — defence in depth even though + // downstream handlers re-validate. + const skillsParts = trimmed.split(/\s+/); + if ( + skillsParts[0] === "/skills" && + skillsParts.length === 2 && + skillsParts[1] && + validateSkillName(skillsParts[1].toLowerCase()) === null + ) { + trimmed = `/${skillsParts[1]}`; + } const handled = await handleSlashCommand(trimmed, rl); if (handled) continue; // Not handled — could be a skill (/). diff --git a/src/agent/markdown-renderer.ts b/src/agent/markdown-renderer.ts index b92cd57..9fd8644 100644 --- a/src/agent/markdown-renderer.ts +++ b/src/agent/markdown-renderer.ts @@ -39,16 +39,57 @@ hljsInstance.registerLanguage("kql", kqlLanguage); // // The @types/marked-terminal declaration mis-types the return as // `TerminalRenderer`; cast through `unknown` to the correct shape. -const localMarked = new Marked( - markedTerminal({ - // Indent code blocks for visual separation - tab: 2, - // Show URLs inline rather than as footnotes - showSectionPrefix: true, - // Convert HTML entities back to characters - unescape: true, - }) as unknown as MarkedExtension, -); +const terminalExt = markedTerminal({ + // Indent code blocks for visual separation + tab: 2, + // Render headings WITHOUT a literal `### ` prefix on screen. The + // ANSI bold + green colour applied by marked-terminal is enough to + // distinguish a heading from prose; keeping `###` visible defeats + // the whole point of terminal markdown rendering (users see raw + // markdown markers and assume rendering is broken — exactly the + // bug report this fixes). + showSectionPrefix: false, + // Convert HTML entities back to characters + unescape: true, +}) as unknown as MarkedExtension; + +// ── Patch: process inline tokens inside `text`-block tokens ───────── +// marked-terminal v7.3.0's stock `text` renderer reads `token.text` +// (the raw markdown source) instead of `parser.parseInline(token.tokens)`. +// That's correct for *leaf* inline text tokens (no children) but wrong for +// the *block-level* `text` tokens that marked emits as the body of tight +// list items. Result: `* **bold** rest` in a list rendered as the literal +// `* **bold** rest` instead of `* bold rest`. See the README's KQL +// Expert example — that bug is what surfaced this. +// +// Fix: wrap the `text` renderer so that whenever the token has a +// non-empty `tokens` array, we recurse via `parseInline` to get proper +// inline formatting (strong/em/code/links/…). Leaf tokens fall through +// to the original behaviour so escape-handling is preserved. +// +// This is a marked-terminal upstream bug; revisit when that ships a fix. +const patchedRenderer = terminalExt as unknown as { + renderer: { + text: ( + this: { parser: { parseInline: (t: unknown[]) => string } }, + token: unknown, + ) => string; + }; +}; +const originalText = patchedRenderer.renderer.text; +patchedRenderer.renderer.text = function (token: unknown): string { + if ( + typeof token === "object" && + token !== null && + Array.isArray((token as { tokens?: unknown[] }).tokens) && + (token as { tokens: unknown[] }).tokens.length > 0 + ) { + return this.parser.parseInline((token as { tokens: unknown[] }).tokens); + } + return originalText.call(this, token); +}; + +const localMarked = new Marked(terminalExt); /** * Render a markdown string as ANSI-formatted terminal output. diff --git a/src/agent/skill-writer.ts b/src/agent/skill-writer.ts index 7137240..3369b8f 100644 --- a/src/agent/skill-writer.ts +++ b/src/agent/skill-writer.ts @@ -53,7 +53,7 @@ const VALID_NAME_RE = /^[a-z][a-z0-9-]*$/; /** * Names that double as `/skills` subcommands — accepting them as skill - * names would let `/skills ` shadow `/skills info|edit|delete|list` + * names would let `/skills ` shadow `/skills info|edit|delete|list|reload` * and create confusing CLI behaviour. */ const RESERVED_SKILL_NAMES: ReadonlySet = new Set([ @@ -61,6 +61,7 @@ const RESERVED_SKILL_NAMES: ReadonlySet = new Set([ "edit", "delete", "list", + "reload", ]); /** @@ -380,3 +381,24 @@ export function userSkillExists(name: string): boolean { if (nameError) return false; return existsSync(join(getUserSkillsDir(), name, "SKILL.md")); } + +/** + * Check whether a built-in (system) skill with the given name exists. + * + * Used by `generate_skill` to detect when the LLM is about to silently + * shadow a curated skill — see Bug 2 in [`docs/BACKLOG.md`]. System + * skills live under `/skills//SKILL.md`; the caller + * supplies the root so this module stays decoupled from agent paths. + * + * @param name - Skill identifier (validated to kebab-case). + * @param systemSkillsDir - Absolute path to the bundled skills root + * (typically `join(CONTENT_ROOT, "skills")`). + */ +export function systemSkillExists( + name: string, + systemSkillsDir: string, +): boolean { + const nameError = validateSkillName(name); + if (nameError) return false; + return existsSync(join(systemSkillsDir, name, "SKILL.md")); +} diff --git a/src/agent/slash-commands.ts b/src/agent/slash-commands.ts index 61e0ac0..a0f7c33 100644 --- a/src/agent/slash-commands.ts +++ b/src/agent/slash-commands.ts @@ -52,6 +52,7 @@ import { readUserSkill, deleteUserSkill, userSkillExists, + systemSkillExists, getUserSkillsDir, validateSkillName, } from "./skill-writer.js"; @@ -321,15 +322,44 @@ export async function handleSlashCommand( case "/markdown": case "/md": { - // Toggle markdown rendering — buffers output instead of streaming - // and renders through marked-terminal for proper formatting. - state.markdownEnabled = !state.markdownEnabled; - // System prompt includes markdown-specific instructions (OUTPUT mode, - // FILE REFERENCES). Rebuild the session so the LLM gets the update. - state.sessionNeedsRebuild = true; - console.log( - ` 📝 Markdown rendering: ${state.markdownEnabled ? C.ok("ON") + C.dim(" (output buffered, not streamed)") : C.err("OFF") + C.dim(" (raw streaming)")}`, - ); + // Subcommand dispatcher — bare invocation is a pure status query + // that NEVER mutates state. Mutation requires an explicit verb + // (on/off/toggle). This avoids the "I just wanted to check" + // toggle-trap that flips state at inspection time. + const sub = (parts[1] ?? "").toLowerCase(); + const prev = state.markdownEnabled; + let next = prev; + let unknown = false; + + if (sub === "" || sub === "status") { + // Pure status query — leave state untouched. + } else if (sub === "on" || sub === "true" || sub === "1") { + next = true; + } else if (sub === "off" || sub === "false" || sub === "0") { + next = false; + } else if (sub === "toggle") { + next = !prev; + } else { + unknown = true; + } + + if (unknown) { + console.log( + ` ${C.warn("⚠️")} Usage: ${C.tool("/markdown [on|off|toggle|status]")} ${C.dim("(bare = status, never mutates)")}`, + ); + } else { + if (next !== prev) { + state.markdownEnabled = next; + // System prompt includes markdown-specific instructions + // (OUTPUT mode, FILE REFERENCES). Rebuild so the LLM + // gets the update on the next turn. + state.sessionNeedsRebuild = true; + } + const detail = state.markdownEnabled + ? C.ok("ON") + C.dim(" (output buffered, not streamed)") + : C.err("OFF") + C.dim(" (raw streaming)"); + console.log(` 📝 Markdown rendering: ${detail}`); + } console.log(); return true; } @@ -881,7 +911,10 @@ export async function handleSlashCommand( .map(([k, v, isOvr]) => `| ${k} | ${v}${isOvr ? ovrTag : ""} |`) .join("\n"); const table = `| Setting | Value |\n|---------|-------|\n${mdRows}`; - console.log(` **⚙️ Configuration:**`); + // Use C.label() rather than literal markdown bold — console.log + // does not pass through the markdown renderer, so `**foo**` would + // print raw asterisks. + console.log(` ${C.label("⚙️ Configuration:")}`); console.log(renderMarkdown(table)); } else { console.log(` ${C.label("⚙️ Configuration:")}`); @@ -2241,11 +2274,47 @@ export async function handleSlashCommand( return true; } - // /skills — invoke a skill (delegates to SDK). + // /skills reload — hot-reload the SDK's skill registry so + // freshly authored skills (via `/save-skill`, `generate_skill`, + // or an external editor on `~/.hyperagent/skills//SKILL.md`) + // become invocable without restarting the agent. The SDK's + // `ensureSkillsLoaded()` is one-shot per session, so without + // this lever any mid-session skill writes are dead until the + // process recycles — exactly the user-facing footgun this + // fixes. Calls the experimental `session.rpc.skills.reload()` + // RPC which clears the loaded-skills cache, re-scans every + // configured `skillDirectories` entry, and re-emits the + // `` block to the model on its next turn. + if (sub === "reload") { + if (!state.activeSession) { + console.log( + ` ${C.err("❌ No active session — start the agent first.")}`, + ); + return true; + } + try { + await state.activeSession.rpc.skills.reload(); + console.log( + ` ${C.ok("📚 Skills reloaded")} ${C.dim("— new skills are now invocable.")}`, + ); + } catch (err: unknown) { + const msg = err instanceof Error ? err.message : String(err); + console.log(` ${C.err("❌ Skill reload failed:")} ${msg}`); + } + return true; + } + + // /skills — bridge to "/" so the SDK's skill- + // invocation grammar matches (Bug 3). Previously this block + // printed "Invoking skill" and forwarded the raw "/skills " + // text to the SDK, which doesn't speak that grammar — the LLM + // saw it as natural language and sometimes mis-fired + // generate_skill. The REPL intake layer also rewrites this + // form, but we cover synthetic callers (--skill flag, suggested + // command auto-apply, /save-skill prompts) here as well. const skillArg = parts[1]?.trim(); if (skillArg && sub !== "list") { - console.log(` ${C.info("📚")} Invoking skill: ${C.tool(skillArg)}`); - return false; + return await handleSlashCommand(`/${skillArg}`, rl, deps); } // /skills (no args) or /skills list — list system + user skills, @@ -2313,7 +2382,7 @@ export async function handleSlashCommand( console.log(` ${C.dim(row.desc)}\n`); } console.log( - ` ${C.dim("Invoke: /skills · Manage: /skills info|edit|delete ")}`, + ` ${C.dim("Invoke: / · Manage: /skills info|edit|delete · Refresh: /skills reload")}`, ); } catch { console.log(" Error reading skills directory."); @@ -3055,15 +3124,30 @@ export async function handleSlashCommand( default: { // Check if it's a skill invocation — skills are handled by the SDK, // not by our slash command handler. Return false to pass through. + // + // System skills live under /skills/, user skills + // under getUserSkillsDir(). The SDK has both in skillDirectories + // so it can invoke either — we just need to recognise both here + // so the "Invoking skill" log fires and the input gets forwarded + // instead of reported as unknown. + // + // SECURITY: we MUST validate `skillName` before any fs operation — + // `cmd.slice(1)` is unsanitised user input and a literal + // `/../etc/passwd` would otherwise turn this into a filesystem + // probe via `join(skillsDir, skillName, ...)` (PR #151 review). + // `systemSkillExists` validates via `validateSkillName` first, + // rejecting empty / oversized / path-traversal / reserved names + // before touching disk. `userSkillExists` does the same. const __scDir = dirname(new URL(import.meta.url).pathname); const skillsDir = existsSync(join(__scDir, "skills")) ? join(__scDir, "skills") : resolve(__scDir, "../..", "skills"); try { - const { existsSync } = await import("node:fs"); const skillName = cmd.slice(1); // remove leading / - const skillPath = join(skillsDir, skillName, "SKILL.md"); - if (existsSync(skillPath)) { + if ( + systemSkillExists(skillName, skillsDir) || + userSkillExists(skillName) + ) { console.log(` ${C.info("📚")} Invoking skill: ${C.tool(skillName)}`); return false; // Let SDK handle it } diff --git a/src/agent/tool-gating.ts b/src/agent/tool-gating.ts index 1b08f42..e1b8c87 100644 --- a/src/agent/tool-gating.ts +++ b/src/agent/tool-gating.ts @@ -39,4 +39,11 @@ export const ALLOWED_TOOLS = new Set([ "mcp_server_info", // Detailed MCP server info + tool schemas "mcp_tool_info", // Focused MCP tool schema lookup "manage_mcp", // Connect/disconnect MCP servers + // ── SDK built-in tools we rely on ─────────────────────────────── + // The SDK exposes a "skill" tool when `skillDirectories` is set; the + // model calls it with `{ skill: "" }` to load the matching + // SKILL.md body into the conversation. Without this entry our + // onPreToolUse gate would block / invocations and the model + // sees only the raw "/kql-expert" string with no skill content. + "skill", ]); diff --git a/src/sandbox/tool.d.ts b/src/sandbox/tool.d.ts index f108e1d..4206b4e 100644 --- a/src/sandbox/tool.d.ts +++ b/src/sandbox/tool.d.ts @@ -64,6 +64,9 @@ export function parsePositiveInt( * @param {string|null} [options.timingLogPath] — Override HYPERAGENT_TIMING_LOG * @param {string|null} [options.codeLogPath] — Override HYPERAGENT_CODE_LOG * @param {boolean} [options.verbose] — Log lifecycle events (default: HYPERAGENT_DEBUG=1) + * @param {(msg: string) => void} [options.debugLog] — Sink for verbose lifecycle traces. + * When supplied, verbose output routes here (typically a debug-log file) instead of + * stderr — keeps `[sandbox]` lifecycle chatter out of the user-facing terminal. * @returns {SandboxTool} */ export function createSandboxTool(options?: { @@ -76,6 +79,7 @@ export function createSandboxTool(options?: { timingLogPath?: string | null | undefined; codeLogPath?: string | null | undefined; verbose?: boolean | undefined; + debugLog?: ((msg: string) => void) | undefined; }): SandboxTool; export type SandboxToolConfig = { /** diff --git a/src/sandbox/tool.js b/src/sandbox/tool.js index 45c0884..3d8164e 100644 --- a/src/sandbox/tool.js +++ b/src/sandbox/tool.js @@ -154,6 +154,9 @@ export function parsePositiveInt(raw, defaultVal) { * @param {string|null} [options.timingLogPath] — Override HYPERAGENT_TIMING_LOG * @param {string|null} [options.codeLogPath] — Override HYPERAGENT_CODE_LOG * @param {boolean} [options.verbose] — Log lifecycle events (default: HYPERAGENT_DEBUG=1) + * @param {(msg: string) => void} [options.debugLog] — Sink for verbose lifecycle traces. + * When supplied, verbose output routes here (typically a debug-log file) instead of + * stderr — keeps `[sandbox]` lifecycle chatter out of the user-facing terminal. * @returns {SandboxTool} */ export function createSandboxTool(options = {}) { @@ -215,6 +218,21 @@ export function createSandboxTool(options = {}) { /** Whether to log lifecycle messages (init, shutdown, etc.). */ const verbose = options.verbose ?? process.env.HYPERAGENT_DEBUG === "1"; + /** + * Sink for verbose lifecycle traces. Default: console.error + * (preserves standalone-usage behaviour, e.g. tests). When the host + * wires up a `debugLog` callback, verbose output routes there instead + * of mixing with the user-visible terminal stream. Either way the + * call sites still gate on `verbose` first so messages only fire + * when debugging is enabled. + * + * @param {string} msg + */ + const verboseLog = + typeof options.debugLog === "function" + ? options.debugLog + : (msg) => console.error(msg); + // ── Plugin Registrations ───────────────────────────────────── // // Plugins register host functions on the proto BEFORE loadRuntime(). @@ -393,7 +411,7 @@ export function createSandboxTool(options = {}) { async function autoSaveState() { if (!loadedSandbox) { if (verbose) { - console.error("[sandbox] autoSaveState: SKIP - no loadedSandbox"); + verboseLog("[sandbox] autoSaveState: SKIP - no loadedSandbox"); } return; } @@ -401,7 +419,7 @@ export function createSandboxTool(options = {}) { // Saving would overwrite the good stash from before the poison event. if (loadedSandbox.poisoned) { if (verbose) { - console.error( + verboseLog( "[sandbox] autoSaveState: SKIP - sandbox is POISONED, preserving existing stash", ); } @@ -411,7 +429,7 @@ export function createSandboxTool(options = {}) { // The guest has old/empty state; saving would overwrite the good stash. if (guestStateStale) { if (verbose) { - console.error( + verboseLog( "[sandbox] autoSaveState: SKIP - guest state is STALE (post-crash-recovery), preserving existing stash", ); } @@ -419,7 +437,7 @@ export function createSandboxTool(options = {}) { } if (!handlerCache.has("_save_state")) { if (verbose) { - console.error( + verboseLog( "[sandbox] autoSaveState: SKIP - _save_state handler not in cache. " + `handlerCache keys: [${[...handlerCache.keys()].join(", ")}]`, ); @@ -428,7 +446,7 @@ export function createSandboxTool(options = {}) { } try { if (verbose) { - console.error( + verboseLog( "[sandbox] autoSaveState: CALLING _save_state handler...", ); } @@ -443,7 +461,7 @@ export function createSandboxTool(options = {}) { ); if (verbose) { const stashSize = savedSharedState ? savedSharedState.size : 0; - console.error( + verboseLog( `[sandbox] autoSaveState: SUCCESS - result=${JSON.stringify(result)}, stash size=${stashSize} keys`, ); } @@ -451,7 +469,7 @@ export function createSandboxTool(options = {}) { // Only log failure in verbose mode — these are expected during // CPU timeouts and shouldn't clutter the UI. if (verbose) { - console.error( + verboseLog( `[sandbox] ⚠️ Failed to save shared-state: ${err.message ?? err}`, ); } @@ -472,13 +490,13 @@ export function createSandboxTool(options = {}) { async function autoRestoreState() { if (!loadedSandbox) { if (verbose) { - console.error("[sandbox] autoRestoreState: SKIP - no loadedSandbox"); + verboseLog("[sandbox] autoRestoreState: SKIP - no loadedSandbox"); } return false; } if (savedSharedState === null) { if (verbose) { - console.error( + verboseLog( "[sandbox] autoRestoreState: SKIP - savedSharedState is null", ); } @@ -486,7 +504,7 @@ export function createSandboxTool(options = {}) { } if (savedSharedState.size === 0) { if (verbose) { - console.error( + verboseLog( "[sandbox] autoRestoreState: SKIP - savedSharedState is empty", ); } @@ -494,7 +512,7 @@ export function createSandboxTool(options = {}) { } if (!handlerCache.has("_restore_state")) { if (verbose) { - console.error( + verboseLog( "[sandbox] autoRestoreState: SKIP - _restore_state handler not in cache. " + `handlerCache keys: [${[...handlerCache.keys()].join(", ")}]`, ); @@ -534,7 +552,7 @@ export function createSandboxTool(options = {}) { try { if (verbose) { - console.error( + verboseLog( `[sandbox] autoRestoreState: CALLING _restore_state handler... (stash has ${savedSharedState.size} keys: [${[...savedSharedState.keys()].join(", ")}])`, ); } @@ -548,7 +566,7 @@ export function createSandboxTool(options = {}) { }, ); if (verbose) { - console.error( + verboseLog( `[sandbox] autoRestoreState: SUCCESS - result=${JSON.stringify(result)}`, ); } @@ -569,7 +587,7 @@ export function createSandboxTool(options = {}) { // Clear stale flag so we don't keep trying every execution guestStateStale = false; } else if (verbose) { - console.error(`[sandbox] ⚠️ Failed to restore shared-state: ${msg}`); + verboseLog(`[sandbox] ⚠️ Failed to restore shared-state: ${msg}`); } return false; @@ -737,13 +755,13 @@ export function createSandboxTool(options = {}) { async function invalidateSandboxWithSave() { if (loadedSandbox !== null) { if (verbose) { - console.error( + verboseLog( "[sandbox] invalidateSandboxWithSave: loadedSandbox exists, saving state...", ); } await autoSaveState(); } else if (verbose) { - console.error( + verboseLog( "[sandbox] invalidateSandboxWithSave: loadedSandbox is null, nothing to save", ); } @@ -938,7 +956,7 @@ export function createSandboxTool(options = {}) { } if (verbose) { - console.error( + verboseLog( `[hyperlight] Plugin "${plugin.name}" registered modules: ${registeredModules.join(", ")}`, ); } @@ -956,7 +974,7 @@ export function createSandboxTool(options = {}) { jsSandbox = await proto.loadRuntime(); if (verbose) { - console.error("[hyperlight] Sandbox initialized"); + verboseLog("[hyperlight] Sandbox initialized"); } } @@ -1035,7 +1053,7 @@ export function createSandboxTool(options = {}) { // The next execute will recompile all handlers. if (loadedSandbox !== null) { if (verbose) { - console.error( + verboseLog( `[sandbox] registerHandler("${name}"): loadedSandbox exists, calling autoSaveState before invalidating...`, ); } @@ -1051,13 +1069,13 @@ export function createSandboxTool(options = {}) { currentSnapshot = null; if (verbose) { const stashSize = savedSharedState ? savedSharedState.size : 0; - console.error( + verboseLog( `[sandbox] registerHandler("${name}"): sandbox invalidated, stash preserved with ${stashSize} keys`, ); } } else { if (verbose) { - console.error( + verboseLog( `[sandbox] registerHandler("${name}"): loadedSandbox is null, no state to save`, ); } @@ -1324,7 +1342,7 @@ export function createSandboxTool(options = {}) { // Invalidate compiled state (same as registerHandler) if (loadedSandbox !== null) { if (verbose) { - console.error( + verboseLog( `[sandbox] editHandler("${name}"): loadedSandbox exists, calling autoSaveState before invalidating...`, ); } @@ -1339,7 +1357,7 @@ export function createSandboxTool(options = {}) { currentSnapshot = null; if (verbose) { const stashSize = savedSharedState ? savedSharedState.size : 0; - console.error( + verboseLog( `[sandbox] editHandler("${name}"): sandbox invalidated, stash preserved with ${stashSize} keys`, ); } @@ -1473,7 +1491,7 @@ export function createSandboxTool(options = {}) { if (loadedSandbox !== null) { if (verbose) { - console.error( + verboseLog( `[sandbox] editHandlerLines("${name}"): loadedSandbox exists, calling autoSaveState before invalidating...`, ); } @@ -1488,7 +1506,7 @@ export function createSandboxTool(options = {}) { currentSnapshot = null; if (verbose) { const stashSize = savedSharedState ? savedSharedState.size : 0; - console.error( + verboseLog( `[sandbox] editHandlerLines("${name}"): sandbox invalidated, stash preserved with ${stashSize} keys`, ); } @@ -1767,7 +1785,7 @@ export function createSandboxTool(options = {}) { savedSharedState.size > 0 ) { if (verbose) { - console.error( + verboseLog( "[sandbox] executeJavaScript: guest state is STALE, restoring from stash before execution...", ); } @@ -1780,7 +1798,7 @@ export function createSandboxTool(options = {}) { } guestStateStale = false; if (verbose) { - console.error( + verboseLog( "[sandbox] executeJavaScript: stash restored, guestStateStale cleared", ); } @@ -1791,7 +1809,7 @@ export function createSandboxTool(options = {}) { // This happens when a crash occurs before any state was saved. guestStateStale = false; if (verbose) { - console.error( + verboseLog( "[sandbox] executeJavaScript: guestStateStale cleared (no stash to restore)", ); } @@ -1968,7 +1986,7 @@ export function createSandboxTool(options = {}) { // Auto-restore shared state after recompile if we have a stash. if (verbose) { - console.error( + verboseLog( `[sandbox] executeJavaScript: after recompile, checking for state restore... ` + `savedSharedState=${savedSharedState !== null ? `Map(${savedSharedState.size})` : "null"}`, ); @@ -1984,12 +2002,12 @@ export function createSandboxTool(options = {}) { // Guest state now matches host stash — clear stale flag. guestStateStale = false; if (verbose) { - console.error( + verboseLog( "[sandbox] executeJavaScript: statePreserved=true after restore, guestStateStale cleared", ); } } else if (verbose) { - console.error( + verboseLog( "[sandbox] executeJavaScript: NO state restore (savedSharedState=" + (savedSharedState === null ? "null" @@ -2101,7 +2119,7 @@ export function createSandboxTool(options = {}) { // successful execution. Don't let autoSaveState overwrite it. guestStateStale = true; if (verbose) { - console.error( + verboseLog( "[sandbox] crash recovery: restored from snapshot, marked guest state as STALE", ); } @@ -2218,7 +2236,7 @@ export function createSandboxTool(options = {}) { async function setPlugins(plugins) { if (verbose) { const stashBefore = savedSharedState ? savedSharedState.size : 0; - console.error( + verboseLog( `[sandbox] setPlugins: called with ${plugins.length} plugins, stash has ${stashBefore} keys before invalidate`, ); } @@ -2229,7 +2247,7 @@ export function createSandboxTool(options = {}) { await invalidateSandboxWithSave(); if (verbose) { const stashAfter = savedSharedState ? savedSharedState.size : 0; - console.error( + verboseLog( `[sandbox] setPlugins: after invalidate, stash has ${stashAfter} keys`, ); } diff --git a/tests/approach-resolver.test.ts b/tests/approach-resolver.test.ts index 9f5cb1f..57d2e3c 100644 --- a/tests/approach-resolver.test.ts +++ b/tests/approach-resolver.test.ts @@ -261,15 +261,61 @@ describe("formatGuidance — MCP Servers section", () => { }; } - it("should show ❌ for unconfigured MCP server", () => { + it("should surface supported unconfigured MCP server as a top-level prerequisite", () => { + // fabric-rti-mcp is in MCP_SETUP_COMMANDS — guidance must recommend + // the specific `--mcp-setup-fabric-rti` shortcut, not a generic flag. const status: MCPServerStatus = { name: "fabric-rti-mcp", configured: false, }; const output = formatGuidance(makeGuidance({ mcpStatus: [status] })); - expect(output).toContain("MCP Servers:"); - expect(output).toContain("❌ fabric-rti-mcp — not configured"); + expect(output).toContain("🛑 MISSING PREREQUISITES"); + expect(output).toContain( + 'MCP server "fabric-rti-mcp" is required but not configured', + ); expect(output).toContain("hyperagent --mcp-setup-fabric-rti"); + // Must NOT leak into the regular "MCP Servers:" status block — + // unconfigured servers are surfaced as prerequisites, not status. + expect(output).not.toContain("MCP Servers:\n ❌"); + }); + + it("should give honest generic guidance for an unsupported MCP server", () => { + // "made-up-mcp" is NOT in MCP_SETUP_COMMANDS — the guidance MUST NOT + // hallucinate a `--mcp-setup-made-up-mcp` flag (it would not exist + // and would fail at the CLI parser). Honest fallback: point at + // ~/.hyperagent/config.json. + const status: MCPServerStatus = { + name: "made-up-mcp", + configured: false, + }; + const output = formatGuidance(makeGuidance({ mcpStatus: [status] })); + expect(output).toContain("🛑 MISSING PREREQUISITES"); + expect(output).toContain( + 'MCP server "made-up-mcp" is required but not configured', + ); + expect(output).toContain("~/.hyperagent/config.json"); + expect(output).not.toContain("--mcp-setup-made-up-mcp"); + }); + + it("missing prerequisites block precedes Rules/Modules", () => { + // Ordering matters — buried prerequisites get ignored. + const status: MCPServerStatus = { + name: "fabric-rti-mcp", + configured: false, + }; + const output = formatGuidance( + makeGuidance({ + mcpStatus: [status], + rules: ["always pre-filter with where"], + modules: ["doc-core"], + }), + ); + const blockerIdx = output.indexOf("🛑 MISSING PREREQUISITES"); + const rulesIdx = output.indexOf("Rules:"); + const modulesIdx = output.indexOf("Modules:"); + expect(blockerIdx).toBeGreaterThanOrEqual(0); + expect(blockerIdx).toBeLessThan(rulesIdx); + expect(blockerIdx).toBeLessThan(modulesIdx); }); it("should show ✅ for connected MCP server", () => { @@ -310,5 +356,6 @@ describe("formatGuidance — MCP Servers section", () => { it("should omit MCP section when mcpStatus is empty", () => { const output = formatGuidance(makeGuidance()); expect(output).not.toContain("MCP Servers:"); + expect(output).not.toContain("🛑 MISSING PREREQUISITES"); }); }); diff --git a/tests/markdown-renderer.test.ts b/tests/markdown-renderer.test.ts index f6e5216..8258b90 100644 --- a/tests/markdown-renderer.test.ts +++ b/tests/markdown-renderer.test.ts @@ -57,6 +57,43 @@ describe("renderMarkdown", () => { expect(out).toContain("const x = 1"); }); + it("strips inline markdown markers from list items (regression: marked-terminal v7 + marked v15)", () => { + // Regression: marked-terminal's stock `text` renderer reads the + // raw `token.text` instead of recursing into inline tokens. For + // tight list items (where the body is a `text`-type block token, + // not a `paragraph`), this leaks `**bold**` / `*em*` / `` `code` `` + // markers verbatim to the terminal. The user-visible symptom is + // skill greetings, /help output, and any LLM response containing + // a bulleted list rendering with literal asterisks on screen + // despite the user opting in to markdown. + // + // We patch the `text` renderer in markdown-renderer.ts; this test + // anchors the fix so it can't silently regress. Asserting on the + // ABSENCE of the markdown markers is the only check that catches + // a no-op renderer (the earlier "contains 'bold'" assertion did + // not — the raw `**bold**` also contains the substring "bold"). + const md = [ + "* **bold item** rest of line", + "* normal item", + "", + "1. **numbered bold** with `inline code`", + "2. plain", + ].join("\n"); + const out = renderMarkdown(md); + + // Inline formatters must have consumed their markers. + expect(out).not.toContain("**bold item**"); + expect(out).not.toContain("**numbered bold**"); + expect(out).not.toContain("`inline code`"); + + // The semantic content survives. + expect(out).toContain("bold item"); + expect(out).toContain("normal item"); + expect(out).toContain("numbered bold"); + expect(out).toContain("inline code"); + expect(out).toContain("plain"); + }); + it("trims trailing newlines", () => { const out = renderMarkdown("hello\n\n"); expect(out.endsWith("\n")).toBe(false); @@ -65,6 +102,61 @@ describe("renderMarkdown", () => { it("handles empty input", () => { expect(renderMarkdown("")).toBe(""); }); + + it("renders headings without a literal ### prefix (showSectionPrefix=false)", () => { + // Regression: marked-terminal defaults `showSectionPrefix` to true, + // which keeps the literal `### ` markers in front of every heading + // ("### What I can do:") and makes users assume markdown rendering + // is broken even when bold/colour ANSI is correctly applied. We + // pin the option to `false` in markdown-renderer.ts; if a future + // edit flips it back, this test catches it. + const md = ["# top", "", "## sub", "", "### deep", ""].join("\n"); + const out = renderMarkdown(md); + + // The heading TEXT must survive… + expect(out).toContain("top"); + expect(out).toContain("sub"); + expect(out).toContain("deep"); + + // …but the markdown markers themselves must be gone. + expect(out).not.toContain("# top"); + expect(out).not.toContain("## sub"); + expect(out).not.toContain("### deep"); + }); + + it("renders GFM tables with header and body cells", () => { + // Regression: the profile-apply preview emits a markdown table so + // the limit before/after grid is readable. marked-terminal's + // table renderer must produce output that contains all header and + // body cell text — if a future marked-terminal upgrade silently + // breaks the table extension, this anchors the fix. + const md = [ + "| Limit | Before | After |", + "|---|---|---|", + "| cpu | 1000ms | 2000ms |", + "| heap | 16MB | 32MB |", + ].join("\n"); + const out = renderMarkdown(md); + + // Header cells survive + expect(out).toContain("Limit"); + expect(out).toContain("Before"); + expect(out).toContain("After"); + + // Body cells survive with their units + expect(out).toContain("cpu"); + expect(out).toContain("1000ms"); + expect(out).toContain("2000ms"); + expect(out).toContain("heap"); + expect(out).toContain("16MB"); + expect(out).toContain("32MB"); + + // The raw pipe-delimited header row must NOT leak through — + // marked-terminal should have converted it into a box-drawing + // table. If we see "| Limit | Before |" verbatim, the table + // extension is broken. + expect(out).not.toContain("| Limit | Before | After |"); + }); }); describe("looksLikeMarkdown", () => { diff --git a/tests/pattern-integrity.test.ts b/tests/pattern-integrity.test.ts index 8fb9fb5..15d15da 100644 --- a/tests/pattern-integrity.test.ts +++ b/tests/pattern-integrity.test.ts @@ -182,5 +182,333 @@ describe("pattern-integrity", () => { ).toBe(true); }); } + + // ── SDK built-in tools we rely on ────────────────────────────── + // Some tools (like the "skill" tool that materialises / + // invocations into the conversation) are owned by the Copilot SDK, + // not defined via defineTool(). They still need to appear in BOTH + // ALLOWED_TOOLS and availableTools — otherwise the SDK reports + // "Disabled tools: ... skill ..." in session.info and the model + // can never call them. This caught a real regression where typing + // /kql-expert loaded the skill metadata but never injected the + // SKILL.md body, because the SDK's "skill" tool was gated out. + const SDK_BUILTIN_TOOLS = ["skill"]; + for (const toolName of SDK_BUILTIN_TOOLS) { + it(`SDK built-in "${toolName}" is in ALLOWED_TOOLS`, () => { + expect( + ALLOWED_TOOLS.has(toolName), + `SDK built-in tool "${toolName}" is missing from ALLOWED_TOOLS. ` + + `onPreToolUse will reject it. Add it to src/agent/tool-gating.ts.`, + ).toBe(true); + }); + + it(`SDK built-in "${toolName}" is in availableTools`, () => { + const availableToolsMatch = indexSource.match( + /availableTools:\s*\[([\s\S]*?)\]/, + ); + expect( + availableToolsMatch, + "Could not find availableTools array in index.ts", + ).toBeTruthy(); + const availableList = availableToolsMatch![1]; + expect( + availableList.includes(`"${toolName}"`), + `SDK built-in tool "${toolName}" is missing from availableTools. ` + + `The SDK will disable it (visible as "Disabled tools: ... ${toolName} ..." ` + + `in session.info) and the model cannot invoke it. ` + + `Add "${toolName}" to the availableTools array in buildSessionConfig().`, + ).toBe(true); + }); + } + }); + + describe("skill hot-reload wiring", () => { + // These tests guard the "no process restart needed" fix. + // + // The SDK's `ensureSkillsLoaded()` is one-shot per session, so any + // skill written mid-session (via `/save-skill`, the `generate_skill` + // tool, or `$EDITOR` on `~/.hyperagent/skills//SKILL.md`) is + // invisible to the model until the next process start. We close + // the gap by: + // 1. Auto-calling `session.rpc.skills.reload()` immediately after + // `generate_skill` writes a SKILL.md. + // 2. Exposing a manual `/skills reload` subcommand for external + // editor workflows. + // + // Regressing either of these silently puts us back to "user has to + // restart the agent to invoke a skill they just wrote" — exactly + // the footgun the fix removed. Keep both assertions. + const indexSource = readFileSync( + join(ROOT, "src", "agent", "index.ts"), + "utf-8", + ); + const slashSource = readFileSync( + join(ROOT, "src", "agent", "slash-commands.ts"), + "utf-8", + ); + + it("generate_skill tool calls skills.reload() after writeUserSkill", () => { + // Anchor on the exact API path we depend on — a refactor that + // accidentally drops the reload call would otherwise sneak by. + expect( + indexSource.includes("activeSession.rpc.skills.reload()"), + "generate_skill must call `state.activeSession.rpc.skills.reload()` " + + "after `writeUserSkill()` so the new skill is invocable on the next " + + "turn. Without it, the user has to restart the agent — see " + + "src/agent/index.ts generateSkillTool.", + ).toBe(true); + }); + + it("/skills reload subcommand exists and calls skills.reload()", () => { + // Two-step check: the subcommand handler and the RPC call must + // both be present in the /skills case. + expect( + slashSource.includes('sub === "reload"'), + "Slash-command handler for `/skills reload` is missing. " + + "Without it, users editing skills in $EDITOR have no way to " + + "refresh the SDK's skill registry short of restarting the agent.", + ).toBe(true); + expect( + slashSource.includes("activeSession.rpc.skills.reload()"), + "`/skills reload` subcommand must call " + + "`state.activeSession.rpc.skills.reload()` — it's the public " + + "SDK API that clears the skill cache and re-scans skillDirectories.", + ).toBe(true); + }); + + it("REPL `/skills ` rewrite gates on validateSkillName, not a hardcoded set", () => { + // PR #151 review caught a regression: the rewrite used a + // `KNOWN_SKILLS_SUBS = new Set(["info","edit","delete","list"])` + // local set that omitted `reload`, so typing `/skills reload` + // got rewritten to `/reload` and broke the hot-reload command. + // The fix is to delegate to `validateSkillName()` which uses + // the canonical `RESERVED_SKILL_NAMES` set in skill-writer.ts + // (single source of truth — adding a new subcommand only requires + // updating one place). This test pins the new mechanism so a + // future "quick fix" doesn't reintroduce a parallel hardcoded + // list that can drift again. + expect( + indexSource.includes( + "validateSkillName(skillsParts[1].toLowerCase()) === null", + ), + "The /skills rewrite must gate on `validateSkillName(...) === null` " + + "so reserved subcommands (info|edit|delete|list|reload) and any " + + "future additions are automatically excluded via RESERVED_SKILL_NAMES.", + ).toBe(true); + expect( + !indexSource.includes("KNOWN_SKILLS_SUBS"), + "The hardcoded `KNOWN_SKILLS_SUBS` set was the source of the " + + "`/skills reload` regression — it must stay deleted. Use " + + "validateSkillName() / RESERVED_SKILL_NAMES instead.", + ).toBe(true); + }); + }); + + describe("slash-command skill detection (no path traversal)", () => { + // PR #151 review found that the default-case skill-detection used + // `existsSync(join(skillsDir, skillName, "SKILL.md"))` with `skillName` + // taken directly from `cmd.slice(1)` — unsanitised user input. + // A literal `/../etc` would resolve outside `skillsDir`, turning + // the "is this a skill?" check into an arbitrary filesystem probe. + // + // Fix: route the system-skill side of the check through + // `systemSkillExists()` which calls `validateSkillName()` first — + // rejecting empty / oversized / kebab-case-violating / path-traversal + // / reserved names BEFORE any `join()` touches disk. + const slashSource = readFileSync( + join(ROOT, "src", "agent", "slash-commands.ts"), + "utf-8", + ); + + it("default case uses systemSkillExists, not a raw existsSync(join(skillsDir, ...))", () => { + // The validated helper is the canonical entry-point. + expect( + slashSource.includes("systemSkillExists(skillName, skillsDir)"), + "Slash-command default case must call " + + "`systemSkillExists(skillName, skillsDir)` — it validates the " + + "name before any path join, preventing /../etc-style traversal.", + ).toBe(true); + + // The raw pattern must NOT come back in the same file. Grep on + // the exact arg shape (`skillName, "SKILL.md"`) so unrelated + // existsSync calls elsewhere in slash-commands.ts (there are + // several legitimate ones) don't trip this guard. + expect( + slashSource.includes( + 'existsSync(join(skillsDir, skillName, "SKILL.md"))', + ), + 'Found a raw `existsSync(join(skillsDir, skillName, "SKILL.md"))` ' + + "in slash-commands.ts — this is the exact unsafe pattern PR #151 " + + "review flagged. Replace it with `systemSkillExists()`.", + ).toBe(false); + }); + }); + + describe("markdown UX (no toggle-trap, no raw `**` rendering)", () => { + // These tests guard against three previously-shipped UX bugs: + // + // Bug 1 — `/markdown` was a destructive toggle. Users running it + // to inspect state inadvertently flipped state, then reported + // "markdown is OFF when it should be ON". Fix: bare invocation + // is a pure status query; mutation requires `on|off|toggle`. + // + // Bug 2 — Two callsites printed literal `**Configuration:**` via + // plain `console.log`, so the asterisks ended up on screen + // instead of bold formatting. Fix: use `C.label()` / ANSI bold. + // + // Bug 3 — `processMessage()` gated the rendered output of the + // *assistant's own response* on `looksLikeMarkdown()`, producing + // inconsistent results (some turns rendered, others raw) for users + // who had explicitly opted in. Fix: when `markdownEnabled` is + // true the assistant response always goes through `renderMarkdown`. + const indexSource = readFileSync( + join(ROOT, "src", "agent", "index.ts"), + "utf-8", + ); + const slashSource = readFileSync( + join(ROOT, "src", "agent", "slash-commands.ts"), + "utf-8", + ); + + it("/markdown supports explicit subcommands (on/off/toggle/status)", () => { + // Anchor on each subcommand literal — a refactor that drops one + // would silently regress the "inspect without mutating" contract. + for (const verb of ['"on"', '"off"', '"toggle"', '"status"']) { + expect( + slashSource.includes(`sub === ${verb}`), + `/markdown handler must accept the ${verb} subcommand. ` + + `Bare invocation is the status query; on/off/toggle are the ` + + `explicit mutations. Dropping any of these reverts the ` + + `"toggle-trap" UX bug — see src/agent/slash-commands.ts.`, + ).toBe(true); + } + }); + + it("no callsite prints literal `**Configuration:**` via console.log", () => { + // Plain console.log does NOT pass strings through the markdown + // renderer, so `**foo**` would print raw asterisks. Callers must + // use C.label() (ANSI bold helper) or renderMarkdown() instead. + const bannerRe = + /console\.(?:log|error)\([^)]*\*\*[^)]*Configuration[^)]*\*\*/; + expect( + bannerRe.test(indexSource), + "src/agent/index.ts startup banner prints literal `**Configuration:**`. " + + "Use `${bold}Configuration:${reset}` (ANSI) so users opted-in to " + + "markdown don't see raw asterisks above the rendered table.", + ).toBe(false); + expect( + bannerRe.test(slashSource), + "src/agent/slash-commands.ts /config handler prints literal " + + '`**Configuration:**`. Use `C.label("⚙️ Configuration:")` to match ' + + "the non-markdown branch and avoid raw asterisks on screen.", + ).toBe(false); + }); + + it("processMessage renders assistant content unconditionally when markdown is on", () => { + // The pre-fix code wrapped the renderMarkdown call in a + // `looksLikeMarkdown(state.streamedText)` gate, so turns whose + // text didn't trip the heuristic landed on screen raw — even + // though the user explicitly opted in via /markdown. The fix is + // to drop that inner gate for the streamed-text branch. + // + // We anchor on a comment phrase that lives next to the fix. The + // assertion is purposely loose (regex-free) so legitimate edits + // to the comment don't break the test — but a regression that + // re-introduces the inner gate would also have to re-introduce + // the gating call, which is what the second assertion checks. + expect( + indexSource.includes("user has\n // explicitly opted in") || + indexSource.includes("explicitly opted in via the default-on flag"), + "processMessage() must document why it renders unconditionally " + + "when state.markdownEnabled is true — the comment is the only " + + "thing keeping a future refactor from re-adding a looksLikeMarkdown " + + "gate that would re-introduce the inconsistent-rendering bug.", + ).toBe(true); + // Hard structural check: the streamedText branch must NOT gate on + // looksLikeMarkdown. We grep for the previous pattern; if it + // returns, the regression is back. + expect( + indexSource.includes("if (looksLikeMarkdown(state.streamedText))"), + "processMessage() reintroduced a `looksLikeMarkdown(state.streamedText)` " + + "gate on the buffered assistant output. When the user has opted in to " + + "markdown rendering, that gate produces inconsistent output (some " + + "turns rendered, others raw). Render unconditionally.", + ).toBe(false); + }); + }); + + describe("MCP setup shortcut surfacing", () => { + // Background: when a skill declares `requires-mcp: ` and the + // server isn't yet configured, the LLM is supposed to tell the user + // about the specific `--mcp-setup-` shortcut. In practice + // the model has been observed to (a) hallucinate generic flags that + // don't exist (e.g. `--mcp foo=bar`) or (b) bury the recommendation + // under config.json snippets. formatGuidance() is what feeds the + // model — if we hold its output to a strict shape, the model's + // surfaced advice stays accurate. + + const resolverSource = readFileSync( + join(ROOT, "src", "agent", "approach-resolver.ts"), + "utf-8", + ); + const cliSource = readFileSync( + join(ROOT, "src", "agent", "cli-parser.ts"), + "utf-8", + ); + + // The names in MCP_SETUP_COMMANDS must correspond to real CLI flags. + // If you add a new shortcut, this list must grow in lockstep. + const SUPPORTED_SHORTCUTS = [ + ["fabric-rti-mcp", "--mcp-setup-fabric-rti"], + ["everything", "--mcp-setup-everything"], + ["github", "--mcp-setup-github"], + ["filesystem", "--mcp-setup-filesystem"], + ["workiq", "--mcp-setup-workiq"], + ] as const; + + it("MCP_SETUP_COMMANDS lists every supported server", () => { + for (const [name, flag] of SUPPORTED_SHORTCUTS) { + // Match a key-value line like `name: "--mcp-setup-...",` or + // `"fabric-rti-mcp": "--mcp-setup-fabric-rti",` — quoting on + // bare keys is optional in TS object literals. + const keyRe = new RegExp( + `["']?${name.replace(/[.*+?^${}()|[\\]\\\\]/g, "\\\\$&")}["']?\\s*:\\s*["']${flag}["']`, + ); + expect( + keyRe.test(resolverSource), + `MCP_SETUP_COMMANDS must map "${name}" → "${flag}" so ` + + `formatGuidance() can recommend the specific shortcut. ` + + "See src/agent/approach-resolver.ts.", + ).toBe(true); + } + }); + + it("every MCP_SETUP_COMMANDS flag has a matching CLI case", () => { + // Belt-and-braces: a shortcut listed in the resolver but missing + // from the parser would have us recommend a flag that prints + // "unknown option" — exactly the failure mode we're guarding. + for (const [, flag] of SUPPORTED_SHORTCUTS) { + expect( + cliSource.includes(`case "${flag}"`), + `cli-parser.ts must handle "${flag}" — MCP_SETUP_COMMANDS in ` + + "approach-resolver.ts references it. Either remove the " + + "mapping or wire up the CLI case.", + ).toBe(true); + } + }); + + it("formatGuidance does NOT synthesise a fake `--mcp-setup-${name}` flag", () => { + // The previous fallback was `--mcp-setup-${s.name}` — that would + // emit, e.g., `--mcp-setup-made-up-mcp` for unsupported servers, + // which the CLI parser would reject. Honest guidance must point + // users at the config file instead. + expect( + resolverSource.includes("`--mcp-setup-${s.name}`"), + "formatGuidance() reintroduced a synthesised " + + "`--mcp-setup-${name}` fallback. That flag does not exist for " + + "arbitrary names; recommend ~/.hyperagent/config.json instead. " + + "See src/agent/approach-resolver.ts.", + ).toBe(false); + }); }); }); diff --git a/tests/skill-writer.test.ts b/tests/skill-writer.test.ts index 6cf11ef..387139e 100644 --- a/tests/skill-writer.test.ts +++ b/tests/skill-writer.test.ts @@ -86,11 +86,12 @@ describe("validateSkillName", () => { }); it("rejects reserved /skills subcommand names", () => { - // These would shadow `/skills info|edit|delete|list` if accepted. + // These would shadow `/skills info|edit|delete|list|reload` if accepted. expect(writer.validateSkillName("info")).toMatch(/reserved/i); expect(writer.validateSkillName("edit")).toMatch(/reserved/i); expect(writer.validateSkillName("delete")).toMatch(/reserved/i); expect(writer.validateSkillName("list")).toMatch(/reserved/i); + expect(writer.validateSkillName("reload")).toMatch(/reserved/i); }); }); @@ -364,3 +365,45 @@ describe("writeUserSkill / listUserSkills / readUserSkill / deleteUserSkill", () ).toThrow(/exceeds maximum size/); }); }); + +// ── systemSkillExists ──────────────────────────────────────────────── + +describe("systemSkillExists", () => { + it("returns true when //SKILL.md exists", () => { + const sysDir = mkdtempSync(join(tmpdir(), "skill-writer-sys-")); + try { + const skillDir = join(sysDir, "kql-expert"); + mkdirSync(skillDir, { recursive: true }); + writeFileSync(join(skillDir, "SKILL.md"), "stub", "utf-8"); + expect(writer.systemSkillExists("kql-expert", sysDir)).toBe(true); + } finally { + rmSync(sysDir, { recursive: true, force: true }); + } + }); + + it("returns false when the skill directory has no SKILL.md", () => { + const sysDir = mkdtempSync(join(tmpdir(), "skill-writer-sys-")); + try { + mkdirSync(join(sysDir, "kql-expert"), { recursive: true }); + // Empty dir — no SKILL.md + expect(writer.systemSkillExists("kql-expert", sysDir)).toBe(false); + } finally { + rmSync(sysDir, { recursive: true, force: true }); + } + }); + + it("returns false when the systemSkillsDir itself is missing", () => { + expect( + writer.systemSkillExists( + "anything", + join(tmpdir(), "definitely-not-here-" + Date.now()), + ), + ).toBe(false); + }); + + it("returns false for an invalid skill name (no path traversal)", () => { + // Validation runs first so a malicious name can't escape the dir. + expect(writer.systemSkillExists("../etc", "/usr")).toBe(false); + expect(writer.systemSkillExists("Has Spaces", "/usr")).toBe(false); + }); +});