Skip to content

Commit 5c803b8

Browse files
author
heyang.why
committed
fix(mcp): auto-reconnect on transport errors (#25287)
1 parent 9f708e7 commit 5c803b8

11 files changed

Lines changed: 951 additions & 33 deletions

.github/workflows/test.yml

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,34 @@ jobs:
8888
retention-days: 7
8989
path: packages/*/.artifacts/unit/junit.xml
9090

91+
mcp-probe:
92+
name: mcp transport probe (${{ matrix.settings.name }})
93+
strategy:
94+
fail-fast: false
95+
matrix:
96+
settings:
97+
- name: linux
98+
host: blacksmith-4vcpu-ubuntu-2404
99+
- name: windows
100+
host: blacksmith-4vcpu-windows-2025
101+
runs-on: ${{ matrix.settings.host }}
102+
defaults:
103+
run:
104+
shell: bash
105+
steps:
106+
- name: Checkout repository
107+
uses: actions/checkout@v4
108+
with:
109+
token: ${{ secrets.GITHUB_TOKEN }}
110+
111+
- name: Setup Bun
112+
uses: ./.github/actions/setup-bun
113+
114+
- name: Run MCP transport probe
115+
working-directory: packages/opencode
116+
run: bun run test:mcp-probe
117+
timeout-minutes: 2
118+
91119
e2e:
92120
name: e2e (${{ matrix.settings.name }})
93121
strategy:

packages/opencode/package.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
"scripts": {
99
"typecheck": "tsgo --noEmit",
1010
"test": "bun test --timeout 30000",
11+
"test:mcp-probe": "bun run test/mcp/transport-error-probe.mjs",
1112
"test:ci": "mkdir -p .artifacts/unit && bun test --timeout 30000 --reporter=junit --reporter-outfile=.artifacts/unit/junit.xml",
1213
"build": "bun run script/build.ts",
1314
"fix-node-pty": "bun run script/fix-node-pty.ts",
@@ -182,4 +183,4 @@
182183
"overrides": {
183184
"drizzle-orm": "catalog:"
184185
}
185-
}
186+
}

packages/opencode/src/mcp/index.ts

Lines changed: 72 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import { dynamicTool, type Tool, jsonSchema, type JSONSchema7 } from "ai"
22
import { Client } from "@modelcontextprotocol/sdk/client/index.js"
33
import { StreamableHTTPClientTransport } from "@modelcontextprotocol/sdk/client/streamableHttp.js"
4+
import { isTransportError } from "./transport-error"
45
import { SSEClientTransport } from "@modelcontextprotocol/sdk/client/sse.js"
56
import { StdioClientTransport } from "@modelcontextprotocol/sdk/client/stdio.js"
67
import { UnauthorizedError } from "@modelcontextprotocol/sdk/client/auth.js"
@@ -119,37 +120,6 @@ function remoteURL(key: string, value: string) {
119120
log.warn("invalid remote mcp url", { key })
120121
}
121122

122-
// Convert MCP tool definition to AI SDK Tool type
123-
function convertMcpTool(mcpTool: MCPToolDef, client: MCPClient, timeout?: number): Tool {
124-
const inputSchema = mcpTool.inputSchema
125-
126-
// Spread first, then override type to ensure it's always "object"
127-
const schema: JSONSchema7 = {
128-
...(inputSchema as JSONSchema7),
129-
type: "object",
130-
properties: (inputSchema.properties ?? {}) as JSONSchema7["properties"],
131-
additionalProperties: false,
132-
}
133-
134-
return dynamicTool({
135-
description: mcpTool.description ?? "",
136-
inputSchema: jsonSchema(schema),
137-
execute: async (args: unknown) => {
138-
return client.callTool(
139-
{
140-
name: mcpTool.name,
141-
arguments: (args || {}) as Record<string, unknown>,
142-
},
143-
CallToolResultSchema,
144-
{
145-
resetTimeoutOnProgress: true,
146-
timeout,
147-
},
148-
)
149-
},
150-
})
151-
}
152-
153123
function defs(key: string, client: MCPClient, timeout?: number) {
154124
return Effect.tryPromise({
155125
try: () => withTimeout(client.listTools(), timeout ?? DEFAULT_TIMEOUT),
@@ -243,6 +213,8 @@ export const layer = Layer.effect(
243213
const spawner = yield* ChildProcessSpawner.ChildProcessSpawner
244214
const auth = yield* McpAuth.Service
245215
const bus = yield* Bus.Service
216+
const layerBridge = yield* EffectBridge.make()
217+
const reconnecting = new Map<string, Promise<boolean>>()
246218

247219
type Transport = StdioClientTransport | StreamableHTTPClientTransport | SSEClientTransport
248220

@@ -635,6 +607,69 @@ export const layer = Layer.effect(
635607
const config = cfg.mcp ?? {}
636608
const defaultTimeout = cfg.experimental?.mcp_timeout
637609

610+
// Single-flight reconnect: concurrent tool calls for the same MCP name
611+
// share one in-flight Promise instead of each triggering a new connect.
612+
// The entry is removed on both success and failure.
613+
const reconnectClient = (name: string): Promise<boolean> => {
614+
const existing = reconnecting.get(name)
615+
if (existing) return existing
616+
const p = layerBridge
617+
.promise(getMcpConfig(name))
618+
.then((mcp) => {
619+
if (!mcp) return false
620+
return layerBridge
621+
.promise(createAndStore(name, { ...mcp, enabled: true }))
622+
.then((status) => status.status === "connected")
623+
})
624+
.catch((err) => {
625+
log.error("mcp reconnect failed", { name, error: err instanceof Error ? err.message : String(err) })
626+
return false
627+
})
628+
.finally(() => {
629+
reconnecting.delete(name)
630+
})
631+
reconnecting.set(name, p)
632+
return p
633+
}
634+
635+
// Wraps an MCP tool as an AI SDK dynamicTool. The key piece is the
636+
// catch branch in execute: on a transport error, call reconnectClient
637+
// and retry once with the fresh client. Non-transport errors and
638+
// failed reconnects are rethrown as-is so business errors stay visible.
639+
const makeTool = (clientName: string, mcpTool: MCPToolDef, client: MCPClient, timeout?: number): Tool => {
640+
const schema: JSONSchema7 = {
641+
...(mcpTool.inputSchema as JSONSchema7),
642+
type: "object",
643+
properties: (mcpTool.inputSchema.properties ?? {}) as JSONSchema7["properties"],
644+
additionalProperties: false,
645+
}
646+
return dynamicTool({
647+
description: mcpTool.description ?? "",
648+
inputSchema: jsonSchema(schema),
649+
execute: (args: unknown) => {
650+
const payload = {
651+
name: mcpTool.name,
652+
arguments: (args || {}) as Record<string, unknown>,
653+
}
654+
const opts = { resetTimeoutOnProgress: true, timeout }
655+
return client.callTool(payload, CallToolResultSchema, opts).catch(async (e) => {
656+
if (!isTransportError(e)) throw e
657+
log.warn("mcp transport error, attempting reconnect", {
658+
clientName,
659+
tool: mcpTool.name,
660+
error: e instanceof Error ? e.message : String(e),
661+
})
662+
const ok = await reconnectClient(clientName)
663+
if (!ok) throw e
664+
const next = await layerBridge.promise(InstanceState.get(state))
665+
const fresh = next.clients[clientName]
666+
if (!fresh || next.status[clientName]?.status !== "connected") throw e
667+
return fresh.callTool(payload, CallToolResultSchema, opts)
668+
})
669+
},
670+
})
671+
}
672+
638673
const connectedClients = Object.entries(s.clients).filter(
639674
([clientName]) => s.status[clientName]?.status === "connected",
640675
)
@@ -654,7 +689,12 @@ export const layer = Layer.effect(
654689

655690
const timeout = entry?.timeout ?? defaultTimeout
656691
for (const mcpTool of listed) {
657-
result[sanitize(clientName) + "_" + sanitize(mcpTool.name)] = convertMcpTool(mcpTool, client, timeout)
692+
result[sanitize(clientName) + "_" + sanitize(mcpTool.name)] = makeTool(
693+
clientName,
694+
mcpTool,
695+
client,
696+
timeout,
697+
)
658698
}
659699
}),
660700
{ concurrency: "unbounded" },
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
import { StreamableHTTPError } from "@modelcontextprotocol/sdk/client/streamableHttp.js"
2+
3+
// Fast-path unit tests live at `test/mcp/transport-error.test.ts`; they feed
4+
// synthetic error objects into `isTransportError`. Whenever this classifier
5+
// changes, also run `bun run test:mcp-probe` (test/mcp/transport-error-probe.mjs)
6+
// to verify those synthetic shapes still match what real servers / sockets
7+
// emit under `bun` and `node`.
8+
9+
const TRANSPORT_ERROR_CODES = new Set([
10+
// Node / Undici
11+
"ECONNRESET",
12+
"ECONNREFUSED",
13+
"ETIMEDOUT",
14+
"EHOSTUNREACH",
15+
"ENOTFOUND",
16+
"EPIPE",
17+
"ECONNABORTED",
18+
"UND_ERR_SOCKET",
19+
"UND_ERR_CLOSED",
20+
// Bun (uses PascalCase identifiers on err.code)
21+
"ConnectionRefused",
22+
"ConnectionReset",
23+
"ConnectionAborted",
24+
"ConnectionClosed",
25+
"Timeout",
26+
"SocketClosed",
27+
"NotConnected",
28+
"FailedToOpenSocket",
29+
])
30+
31+
export function isTransportError(e: unknown): boolean {
32+
if (e instanceof StreamableHTTPError) {
33+
// -1 = SDK protocol-level breakage (unexpected content-type, etc.)
34+
if (e.code === -1) return true
35+
if (typeof e.code !== "number") return false
36+
// 401/403 belong to auth flow, not transport
37+
if (e.code === 401 || e.code === 403) return false
38+
// Anything else 4xx (stale session 404, bad request 400, gone 410, ...) or 5xx counts
39+
return e.code >= 400
40+
}
41+
if (!(e instanceof Error)) return false
42+
const err = e as Error & { code?: string; cause?: { code?: string } }
43+
if (err.cause?.code && TRANSPORT_ERROR_CODES.has(err.cause.code)) return true
44+
if (err.code && TRANSPORT_ERROR_CODES.has(err.code)) return true
45+
if (err.message.includes("fetch failed")) return true
46+
if (err.message.includes("Unable to connect")) return true
47+
return false
48+
}

packages/opencode/test/mcp/headers.test.ts

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,14 @@ void mock.module("@modelcontextprotocol/sdk/client/streamableHttp.js", () => ({
2323
throw new Error("Mock transport cannot connect")
2424
}
2525
},
26+
// Re-export so `src/mcp/transport-error.ts` can still `instanceof`-check.
27+
StreamableHTTPError: class StreamableHTTPError extends Error {
28+
readonly code: number | undefined
29+
constructor(code: number | undefined, message: string | undefined) {
30+
super(message)
31+
this.code = code
32+
}
33+
},
2634
}))
2735

2836
void mock.module("@modelcontextprotocol/sdk/client/sse.js", () => ({

0 commit comments

Comments
 (0)