From e2c3d899829b63c0449d676f00d42ef669e8f903 Mon Sep 17 00:00:00 2001 From: shubh24 Date: Mon, 1 Jun 2026 11:55:03 -0700 Subject: [PATCH] Add ElevenLabs integration: voice agent + browser agent Adds a standalone Next.js example under examples/integrations/elevenlabs showing ElevenLabs as the voice shell driving a Browserbase-backed browser agent. A single controller owns browser state: ElevenLabs handles voice, the Claude Agent SDK plans steps, and execution runs through the Browserbase browse CLI against a shared persistent session rendered in a live iframe. Includes the Browserbase-branded UI, API routes, controller logic, .env.example, and local setup docs. Also lists the integration in the monorepo README. Co-Authored-By: Claude Opus 4.8 (1M context) --- README.md | 1 + examples/integrations/elevenlabs/.env.example | 5 + examples/integrations/elevenlabs/.gitignore | 7 + examples/integrations/elevenlabs/README.md | 57 + .../elevenlabs/app/api/demo/control/route.ts | 22 + .../elevenlabs/app/api/demo/session/route.ts | 15 + .../elevenlabs/app/api/demo/stream/route.ts | 48 + .../elevenlabs/app/demo-client.tsx | 521 ++++++ .../integrations/elevenlabs/app/globals.css | 405 +++++ .../integrations/elevenlabs/app/layout.tsx | 19 + examples/integrations/elevenlabs/app/page.tsx | 5 + .../elevenlabs/lib/demo-controller.ts | 1517 +++++++++++++++++ .../integrations/elevenlabs/lib/demo-types.ts | 58 + .../integrations/elevenlabs/next-env.d.ts | 6 + .../integrations/elevenlabs/next.config.ts | 9 + examples/integrations/elevenlabs/package.json | 26 + .../integrations/elevenlabs/tsconfig.json | 26 + 17 files changed, 2747 insertions(+) create mode 100644 examples/integrations/elevenlabs/.env.example create mode 100644 examples/integrations/elevenlabs/.gitignore create mode 100644 examples/integrations/elevenlabs/README.md create mode 100644 examples/integrations/elevenlabs/app/api/demo/control/route.ts create mode 100644 
examples/integrations/elevenlabs/app/api/demo/session/route.ts create mode 100644 examples/integrations/elevenlabs/app/api/demo/stream/route.ts create mode 100644 examples/integrations/elevenlabs/app/demo-client.tsx create mode 100644 examples/integrations/elevenlabs/app/globals.css create mode 100644 examples/integrations/elevenlabs/app/layout.tsx create mode 100644 examples/integrations/elevenlabs/app/page.tsx create mode 100644 examples/integrations/elevenlabs/lib/demo-controller.ts create mode 100644 examples/integrations/elevenlabs/lib/demo-types.ts create mode 100644 examples/integrations/elevenlabs/next-env.d.ts create mode 100644 examples/integrations/elevenlabs/next.config.ts create mode 100644 examples/integrations/elevenlabs/package.json create mode 100644 examples/integrations/elevenlabs/tsconfig.json diff --git a/README.md b/README.md index d9397ce..f88713d 100644 --- a/README.md +++ b/README.md @@ -29,6 +29,7 @@ integrations/ │ ├── cartesia/ # Cartesia integration │ ├── cloudflare/ # Cloudflare integration │ ├── crewai/ # CrewAI framework integration +│ ├── elevenlabs/ # ElevenLabs voice agent + browser agent │ ├── langchain/ # LangChain framework integration │ ├── logs/ # Logging utilities │ ├── mastra/ # Mastra AI agent integration diff --git a/examples/integrations/elevenlabs/.env.example b/examples/integrations/elevenlabs/.env.example new file mode 100644 index 0000000..ca86958 --- /dev/null +++ b/examples/integrations/elevenlabs/.env.example @@ -0,0 +1,5 @@ +ANTHROPIC_API_KEY= +BROWSERBASE_API_KEY= +BROWSERBASE_PROJECT_ID= +NEXT_PUBLIC_ELEVENLABS_AGENT_ID= +BROWSE_BIN=browse diff --git a/examples/integrations/elevenlabs/.gitignore b/examples/integrations/elevenlabs/.gitignore new file mode 100644 index 0000000..2a182d6 --- /dev/null +++ b/examples/integrations/elevenlabs/.gitignore @@ -0,0 +1,7 @@ +.next +node_modules +.env +.env.local +.env.*.local +.DS_Store +tsconfig.tsbuildinfo diff --git a/examples/integrations/elevenlabs/README.md 
b/examples/integrations/elevenlabs/README.md new file mode 100644 index 0000000..703907d --- /dev/null +++ b/examples/integrations/elevenlabs/README.md @@ -0,0 +1,57 @@ +# ElevenLabs + Browserbase + +Standalone prototype for a shared Browserbase session controlled by a local Claude-based browser controller, with ElevenLabs as the voice shell. + +## Required environment variables + +Create `.env.local` or `.env` with: + +```bash +ANTHROPIC_API_KEY= +BROWSERBASE_API_KEY= +BROWSERBASE_PROJECT_ID= +NEXT_PUBLIC_ELEVENLABS_AGENT_ID= +``` + +Optional: + +```bash +BROWSE_BIN=browse +``` + +## Run locally + +```bash +pnpm install +pnpm dev +``` + +Open: + +```text +http://127.0.0.1:3001 +``` + +The controller expects the Browserbase Browse CLI to be installed and available on your `PATH`. If it is installed somewhere else, point `BROWSE_BIN` at it. + +## ElevenLabs agent setup + +The frontend registers one client tool: + +- `control_demo` + +Suggested tool description for your ElevenLabs agent: + +> Use this tool whenever the user asks you to navigate, click, open, read, create, edit, or continue operating the live browser session. Pass one high-level instruction at a time. + +Suggested system guidance: + +> You are the voice interface for a live Browserbase browser controller. The browser controller owns all navigation, clicking, reading, and page state. Use `control_demo` once for each new browser instruction from the user. After `control_demo` returns `accepted`, `running`, `queued`, or `interrupting`, give at most one short acknowledgement, then wait for controller updates. Never ask "are you there" while the controller is busy. When the controller returns `completed`, answer concisely using the final summary and current page state. When the controller returns `blocked`, ask the clarification once instead of retrying the same tool call. 
/**
 * Handles `POST /api/demo/control`: validates the request body against
 * `bodySchema` and forwards it to `runDemoInstruction`.
 *
 * Status codes:
 * - 200 with the value returned by `runDemoInstruction` on success.
 * - 400 when the body is not valid JSON or does not satisfy `bodySchema`
 *   (a caller mistake, so it must not be reported as a server failure).
 * - 500 when `runDemoInstruction` itself throws.
 *
 * Every failure response keeps the `{ error: string }` shape.
 *
 * @param request Incoming request whose JSON body carries `demoId`,
 *   `instruction` and an optional `interrupt` flag.
 * @returns A JSON response as described above.
 */
export async function POST(request: Request) {
  // Parse the body on its own so malformed JSON is distinguishable from a
  // controller failure further down.
  let payload: unknown;
  try {
    payload = await request.json();
  } catch {
    return NextResponse.json({ error: "Request body must be valid JSON." }, { status: 400 });
  }

  // safeParse reports validation problems as data instead of throwing, which
  // keeps them out of the 500 path below.
  const parsed = bodySchema.safeParse(payload);
  if (!parsed.success) {
    const details = parsed.error.issues
      .map((issue) => `${issue.path.map(String).join(".") || "body"}: ${issue.message}`)
      .join("; ");
    return NextResponse.json({ error: `Invalid request: ${details}` }, { status: 400 });
  }

  try {
    const snapshot = await runDemoInstruction(parsed.data);
    return NextResponse.json(snapshot);
  } catch (error) {
    const message = error instanceof Error ? error.message : "Demo control failed.";
    return NextResponse.json({ error: message }, { status: 500 });
  }
}
/**
 * Handles `GET /api/demo/stream`: a Server-Sent Events feed for one demo.
 *
 * The stream emits a `snapshot` event immediately (from `getDemoSnapshot`),
 * another `snapshot` event for every update delivered by `subscribeToDemo`,
 * and a `ping` event every 15 seconds.
 *
 * Teardown (stop the heartbeat, unsubscribe, close the stream) runs exactly
 * once, whichever happens first: the request signal aborts, the consumer
 * cancels the stream, or an enqueue fails because the stream is no longer
 * writable.
 *
 * @param request Incoming request; `demoId` is read from the query string.
 * @returns A 400 plain-text response when `demoId` is missing, otherwise a
 *   `text/event-stream` response backed by the stream described above.
 */
export async function GET(request: Request) {
  const url = new URL(request.url);
  const demoId = url.searchParams.get("demoId");

  if (!demoId) {
    return new Response("Missing demoId.", { status: 400 });
  }

  const encoder = new TextEncoder();

  // Assigned inside start(); declared here so cancel() can reach it too.
  let teardown: () => void = () => {};

  const stream = new ReadableStream({
    start(controller) {
      let closed = false;
      let unsubscribe: () => void = () => {};
      let heartbeat: ReturnType<typeof setInterval> | undefined;

      teardown = () => {
        if (closed) {
          return;
        }
        closed = true;

        if (heartbeat !== undefined) {
          clearInterval(heartbeat);
        }
        unsubscribe();
        request.signal.removeEventListener("abort", teardown);

        try {
          controller.close();
        } catch {
          // close() throws once the stream has been cancelled or closed;
          // there is nothing left to release in that case.
        }
      };

      const send = (chunk: string) => {
        if (closed) {
          return;
        }
        try {
          controller.enqueue(encoder.encode(chunk));
        } catch {
          // The consumer went away without an abort event; stop producing.
          teardown();
        }
      };

      const sendSnapshot = (snapshot = getDemoSnapshot(demoId)) => {
        send(`event: snapshot\ndata: ${JSON.stringify(snapshot)}\n\n`);
      };

      sendSnapshot();

      unsubscribe = subscribeToDemo(demoId, (snapshot) => {
        sendSnapshot(snapshot);
      });

      heartbeat = setInterval(() => {
        send("event: ping\ndata: {}\n\n");
      }, 15000);

      // An already-aborted signal never dispatches "abort" again, so handle
      // that case eagerly instead of leaking the timer and subscription.
      if (request.signal.aborted) {
        teardown();
        return;
      }
      request.signal.addEventListener("abort", teardown, { once: true });
    },
    cancel() {
      teardown();
    }
  });

  return new Response(stream, {
    headers: {
      "Cache-Control": "no-cache, no-transform",
      Connection: "keep-alive",
      "Content-Type": "text/event-stream"
    }
  });
}
text: string) => { + setTranscript((current) => [ + ...current, + { + id: crypto.randomUUID(), + role, + text, + createdAt: new Date().toISOString() + } + ]); + }, []); + + const { + startSession, + endSession, + sendContextualUpdate + } = useConversation({ + onError: (error: unknown) => { + appendTranscriptLine("system", formatConversationError(error)); + }, + onConnect: (details: unknown) => { + const conversationId = + details && typeof details === "object" && "conversationId" in details + ? String(details.conversationId) + : "connected"; + appendTranscriptLine("system", `Voice connected: ${conversationId}`); + }, + onDisconnect: (details: unknown) => { + const reason = + details && typeof details === "object" && "reason" in details && typeof details.reason === "string" + ? details.reason + : "disconnected"; + appendTranscriptLine("system", `Voice session ended: ${reason}`); + }, + onStatusChange: (details: unknown) => { + const status = + details && typeof details === "object" && "status" in details && typeof details.status === "string" + ? details.status + : null; + + if (!status || status === "connecting" || status === "connected") { + return; + } + + appendTranscriptLine("system", `Voice status: ${status}`); + }, + onDebug: (info: unknown) => { + console.info("[voice-debug]", info); + }, + onMessage: (message: unknown) => { + const normalized = normalizeConversationMessage(message); + if (!normalized) { + return; + } + + setTranscript((current) => [...current, normalized]); + } + }); + + const { status: voiceStatus, message: voiceStatusMessage } = useConversationStatus(); + const hasVoiceAgent = Boolean(process.env.NEXT_PUBLIC_ELEVENLABS_AGENT_ID); + const voiceConnected = voiceStatus === "connected"; + const voiceSessionActive = voiceStatus === "connecting" || voiceConnected; + const voiceStatusHint = + voiceStatusMessage === "Permission denied" + ? "Allow microphone access for this site. 
If the in-app browser does not surface a mic prompt, open the demo in Chrome and grant mic access there." + : null; + + const refreshSession = useCallback(async () => { + const response = await fetch(`/api/demo/session?demoId=${encodeURIComponent(demoId)}`); + if (!response.ok) { + return; + } + + const nextSession = (await response.json()) as DemoSessionSnapshot; + setSession(nextSession); + }, [demoId]); + + useEffect(() => { + void refreshSession(); + }, [refreshSession]); + + useEffect(() => { + const source = new EventSource(`/api/demo/stream?demoId=${encodeURIComponent(demoId)}`); + + const onSnapshot = (event: Event) => { + const messageEvent = event as MessageEvent; + const nextSession = JSON.parse(messageEvent.data) as DemoSessionSnapshot; + setSession(nextSession); + }; + + source.addEventListener("snapshot", onSnapshot); + source.onerror = () => { + void refreshSession(); + }; + + return () => { + source.removeEventListener("snapshot", onSnapshot); + source.close(); + }; + }, [demoId, refreshSession]); + + useEffect(() => { + if (!voiceConnected || !sendContextualUpdate) { + return; + } + + const context = buildVoiceContext(session); + if (!context || lastVoiceContextRef.current === context.key) { + return; + } + + lastVoiceContextRef.current = context.key; + void sendContextualUpdate(context.message); + }, [sendContextualUpdate, session, voiceConnected]); + + const startControllerRun = useCallback( + async (input: DemoControlInput) => { + setToolBusy(true); + setUiError(null); + + try { + const response = await fetch("/api/demo/control", { + method: "POST", + headers: { + "Content-Type": "application/json" + }, + body: JSON.stringify(input) + }); + + const payload = (await response.json()) as DemoSessionSnapshot | { error?: string }; + if (!response.ok) { + throw new Error("error" in payload ? payload.error ?? "Controller request failed." 
: "Controller request failed."); + } + + const nextSession = payload as DemoSessionSnapshot; + setSession(nextSession); + return nextSession; + } catch (error) { + const message = error instanceof Error ? error.message : "Controller request failed."; + setUiError(message); + throw error; + } finally { + setToolBusy(false); + startTransition(() => { + void refreshSession(); + }); + } + }, + [refreshSession] + ); + + useConversationClientTool( + "control_demo", + async (rawInput: { instruction?: string; interrupt?: boolean; goal?: string }) => { + const instruction = rawInput.instruction?.trim() || rawInput.goal?.trim(); + if (!instruction) { + throw new Error("Missing instruction for control_demo."); + } + + const snapshot = await startControllerRun({ + demoId, + instruction, + interrupt: rawInput.interrupt + }); + + return JSON.stringify({ + ok: true, + accepted: true, + runId: snapshot.activeRunId, + controllerState: snapshot.status, + controlOutcome: snapshot.lastControlOutcome, + controlMessage: snapshot.lastControlMessage, + queuedInstructionCount: snapshot.queuedInstructionCount, + voiceGuidance: buildToolVoiceGuidance(snapshot), + speakableUpdate: buildToolSpeakableUpdate(snapshot), + currentStep: snapshot.currentStep, + currentUrl: snapshot.currentUrl, + pageTitle: snapshot.pageTitle, + liveViewUrl: snapshot.liveViewUrl + }); + } + ); + + const handleVoiceStart = async () => { + const agentId = process.env.NEXT_PUBLIC_ELEVENLABS_AGENT_ID; + if (!agentId) { + setUiError("NEXT_PUBLIC_ELEVENLABS_AGENT_ID is missing."); + return; + } + + setUiError(null); + + try { + await startSession({ + agentId, + connectionType: "websocket", + useWakeLock: false + }); + } catch (error) { + const message = error instanceof Error ? error.message : "Voice session failed to start."; + setUiError(message); + } + }; + + const conversation = mergeConversation(session.events, transcript); + const latestConversation = [...conversation].reverse(); + + return ( +
+
+
+
Browserbase and ElevenLabs
+

Give your voice agent access to the whole web.

+

+ ElevenLabs handles the conversation. Browserbase keeps the live browser session attached to the controller + underneath it, so browsing, retries, and voice updates stay aligned. +

+

One controller owns the browser state. The voice layer stays in sync with it.

+
+ +
+
+
+ Voice session + {voiceStatus} +
+
+ + +
+
+

+ Start voice, then ask the agent to open, click, read, or continue working in the shared Browserbase + session. +

+ {session.busy ?

The controller is working on the latest request.

: null} + {toolBusy && !session.busy ?

Sending the latest instruction to the controller.

: null} + {voiceStatusMessage ?

{voiceStatusMessage}

: null} + {voiceStatusHint ?

{voiceStatusHint}

: null} + {uiError ?

{uiError}

: null} +
+ +
+ +
+
+
+
+ Live Browserbase session + {session.pageTitle ?? "Waiting for the first browser instruction"} +
+
+ {session.busy ? "Running" : "Idle"} +
+
+
+ {session.currentUrl ?? "No page yet"} +
+ + {session.liveViewUrl ? ( +