@pinecall/voice-core
v0.2.6
Published
Framework-agnostic WebRTC voice session client for Pinecall agents
Readme
Table of Contents
- Install
- Quick Start
- API Reference
- TypeScript Types
Install
npm install @pinecall/voice-core
Zero runtime dependencies. Browser-only (requires RTCPeerConnection and getUserMedia).
Quick Start
import { VoiceSession } from "@pinecall/voice-core";
const session = new VoiceSession({ agent: "mara" });
// React-style: subscribe to all state changes
session.subscribe(() => {
const { status, phase, messages } = session.getState();
console.log(status, phase, messages);
});
// Or event-style: listen to specific events
session.addEventListener("message", (e) => {
console.log("New message:", e.detail.message);
});
session.addEventListener("event", (e) => {
// Raw DataChannel event from the server
console.log("Raw event:", e.detail);
});
await session.connect();
// Later...
session.disconnect();
API Reference
new VoiceSession(options)
Creates a new voice session instance. Does not connect automatically.
interface VoiceSessionOptions {
/** Agent ID to connect to */
agent: string;
/**
* Pinecall API base URL for token exchange.
* Default: "https://voice.pinecall.io"
* Only override for self-hosted deployments.
*/
server?: string;
/**
* Session config overrides sent in the WebRTC offer body.
* Use this for per-session voice, STT, language, turn detection, and greeting.
* Format follows Pinecall's shortcut syntax (resolved server-side).
*
* @example
* ```ts
* config: {
* voice: "elevenlabs:EXAVITQu4vr4xnSDxMaL",
* stt: { provider: "deepgram", model: "nova-3", language: "es" },
* language: "es",
* turnDetection: "smart_turn",
* greeting: "¡Hola! ¿En qué puedo ayudarte?",
* }
* ```
*/
config?: Record<string, unknown>;
/** Metadata passed to the agent (available in call.metadata server-side) */
metadata?: Record<string, unknown>;
}
Methods
session.connect(): Promise<void>
Initiates the WebRTC connection. Full flow:
- Fetches a short-lived token from GET /webrtc/token?agent_id=<agent>
- Fetches ICE servers from GET /webrtc/ice-servers (falls back to Google STUN)
- Requests microphone access (getUserMedia)
- Creates RTCPeerConnection, adds mic track, creates DataChannel
- Generates SDP offer, gathers ICE candidates
- Sends offer to POST /webrtc/offer with token
- Sets remote SDP answer → connection established
State transitions: idle → connecting → connected (or error).
await session.connect();
session.disconnect(): void
Closes the WebRTC connection, stops the microphone, clears timers. State returns to idle. Messages are preserved.
session.disconnect();
session.toggleMute(): void
Toggles the microphone. When muted, the audio track is disabled and a { action: "mute" } message is sent to the server via DataChannel so it stops processing audio server-side too.
session.toggleMute();
session.setMuted(muted: boolean): void
Explicit mute/unmute control.
session.setMuted(true); // mute
session.setMuted(false); // unmute
session.getState(): Readonly<VoiceSessionState>
Returns the current state snapshot. The reference is stable — it only changes when state mutates (safe for useSyncExternalStore).
const { status, phase, messages, isMuted, duration } = session.getState();
session.subscribe(listener): () => void
Subscribes to all state changes. Returns an unsubscribe function. Designed for React's useSyncExternalStore.
const unsub = session.subscribe(() => {
console.log(session.getState());
});
// Later:
unsub();
session.destroy(): void
Disconnects, clears all subscribers, and makes the instance unusable. Call this on component unmount.
session.destroy();
session.configure(config): void
Sends a mid-call configuration update via DataChannel. Use this for live language/voice/STT switching during an active call. The server will hot-swap providers without disconnecting.
// Switch to Spanish mid-call
session.configure({
voice: "elevenlabs:h2cd3gvcqTp3m65Dysk7",
stt: { provider: "deepgram", model: "nova-3", language: "es" },
language: "es",
});
session.updateOptions(patch): void
Updates session options before the next connect() call. Has no effect on an already-connected session — use configure() for that.
// Pre-connect: user selected Spanish in the UI
session.updateOptions({
config: {
voice: "elevenlabs:h2cd3gvcqTp3m65Dysk7",
language: "es",
greeting: "¡Hola!",
},
});
await session.connect(); // will use the Spanish config
State
interface VoiceSessionState {
/** Connection status */
status: "idle" | "connecting" | "connected" | "error";
/** Error message (when status is "error") */
error: string | null;
/** Whether the microphone is muted */
isMuted: boolean;
/** Current call phase — what the conversation is doing right now */
phase: "idle" | "listening" | "speaking" | "pause" | "thinking";
/** Whether the user is currently speaking (VAD/STT active) */
userSpeaking: boolean;
/** Whether the agent is currently speaking (TTS playing) */
agentSpeaking: boolean;
/** Call duration in seconds (updates every second) */
duration: number;
/** Full conversation transcript — user + bot messages */
messages: TranscriptMessage[];
/** Idle warning countdown — seconds until timeout (null if no warning active) */
idleWarning: number | null;
}
Call Phases
| Phase | Meaning | Triggered by |
|-------|---------|-------------|
| idle | Not in a call | Initial state, after disconnect |
| listening | Mic is hot, waiting for speech | Connection established, after bot finishes, after turn.resumed |
| speaking | Agent is speaking (TTS playing) | First bot.word event |
| thinking | Processing user input, waiting for LLM | user.message (STT final), turn.end |
| pause | Turn detection pause — user may still be talking | turn.pause (brief silence detected) |
Transcript Messages
interface TranscriptMessage {
/** Unique ID (timestamp-based) */
id: number;
/** Who said it */
role: "user" | "bot";
/** The text content */
text: string;
/** User only: STT is still processing (partial result) */
isInterim?: boolean;
/** Bot only: TTS is currently playing this message */
speaking?: boolean;
/** Bot only: user interrupted before the message finished */
interrupted?: boolean;
/** Bot only: server-assigned ID for word-by-word tracking */
messageId?: string;
}
Message lifecycle — User:
- user.speaking → creates message with isInterim: true; text updates as STT refines
- user.message → sets isInterim: false with final text
Message lifecycle — Bot:
- bot.speaking → creates empty message with speaking: true
- bot.word (×N) → text builds word-by-word as TTS plays each word
- bot.finished → sets speaking: false, optionally replaces text with final version
- bot.interrupted → sets speaking: false, interrupted: true (user barged in)
Events (EventTarget)
VoiceSession extends EventTarget. You can listen to typed custom events:
"status" — Connection status changed
session.addEventListener("status", (e: CustomEvent) => {
console.log(e.detail.status); // "idle" | "connecting" | "connected" | "error"
});
"phase" — Call phase changed
session.addEventListener("phase", (e: CustomEvent) => {
console.log(e.detail.phase); // "idle" | "listening" | "speaking" | "pause" | "thinking"
});
"message" — Transcript message added or updated
Fires when a new message is added or an existing one is updated (partial STT, word-by-word bot text).
session.addEventListener("message", (e: CustomEvent) => {
const msg = e.detail.message; // TranscriptMessage
if (msg.role === "user" && !msg.isInterim) {
console.log("User said:", msg.text);
}
if (msg.role === "bot" && !msg.speaking) {
console.log("Bot finished saying:", msg.text);
}
});
"error" — An error occurred
session.addEventListener("error", (e: CustomEvent) => {
console.error("Voice error:", e.detail.error);
});
"change" — Any state change
Fires on every state mutation. The full state is in e.detail.state.
session.addEventListener("change", (e: CustomEvent) => {
const state = e.detail.state; // VoiceSessionState
});
"event" — Raw DataChannel event
This is the power-user event. Every JSON message from the server's DataChannel is forwarded as-is. Use this to access events that the state machine doesn't expose — like tool calls, function results, audio metrics, or custom events your agent emits.
session.addEventListener("event", (e: CustomEvent) => {
const raw = e.detail; // any — the raw JSON from the server
console.log(raw.event, raw);
});
DataChannel Protocol
The WebRTC DataChannel ("events", ordered) carries JSON messages between client and server. The client sends keepalive pings and control commands (mute/unmute, configure, inject_text, set_context). The server sends the following events:
Speech Detection (STT)
| Event | Fields | Description |
|-------|--------|-------------|
| speech.started | — | User started physically speaking (VAD detected voice) |
| speech.ended | — | User stopped speaking (VAD silence) |
| user.speaking | text | STT partial/interim result — text may change |
| user.message | text | STT final result — text is locked, turn is over |
Turn Detection
| Event | Fields | Description |
|-------|--------|-------------|
| turn.pause | — | Brief silence detected — user might still be talking |
| turn.end | — | Silence confirmed — user's turn is over, LLM starts |
| turn.resumed | — | User started speaking again during the pause |
Bot Speech (TTS)
| Event | Fields | Description |
|-------|--------|-------------|
| bot.speaking | message_id, text | TTS generation started. text has the full response, but the transcript message intentionally starts empty and builds word-by-word. |
| bot.word | message_id, word, word_index | A single word was spoken by TTS. Arrives in real-time as audio plays. |
| bot.finished | message_id, text | TTS completed normally. text has the final complete response. |
| bot.interrupted | message_id | User barged in — TTS was cut short. |
Audio Metrics
| Event | Fields | Description |
|-------|--------|-------------|
| audio.metrics | source, is_speech, level | Server-side audio analysis. source is "user" or "bot". |
LLM / Tool Events (via "event" listener)
These events are not processed by the state machine but are forwarded through the "event" listener. They come from the Pinecall pipeline's LLM handler:
| Event | Fields | Description |
|-------|--------|-------------|
| llm.thinking | — | LLM started generating a response |
| llm.tool_call | tool_name, arguments, call_id | LLM requested a tool/function call |
| llm.tool_result | call_id, result | Tool execution result sent back to LLM |
| llm.response | text, finish_reason | LLM finished generating (text may be empty if tool-only) |
| llm.error | error | LLM error occurred |
Session Limits
| Event | Fields | Description |
|-------|--------|-------------|
| session.idle_warning | remaining_seconds | User hasn't spoken — call will timeout in remaining_seconds. Triggers idleWarning state. |
| session.timeout | reason | Session timed out ("idle_timeout" or "max_duration"). Client auto-disconnects. |
Example — Monitoring tool calls:
session.addEventListener("event", (e) => {
const { event, tool_name, arguments: args, result } = e.detail;
if (event === "llm.tool_call") {
console.log(`Agent calling ${tool_name}(${JSON.stringify(args)})`);
}
if (event === "llm.tool_result") {
console.log(`Tool result:`, result);
}
});
Client → Server Messages
The client sends these through the DataChannel:
| Message | Format | Description |
|---------|--------|-------------|
| Ping | "ping" (string) | Keepalive, sent every 1s |
| Mute | { "action": "mute" } | Stop processing user audio server-side |
| Unmute | { "action": "unmute" } | Resume processing user audio |
| Configure | { "action": "configure", ...config } | Hot-swap voice, STT, language, or turn detection mid-call |
| Inject Text | { "action": "inject_text", "text": "..." } | Send text as if the user spoke it (for tool UI interactions) |
| Set Context | { "action": "set_context", "key": "...", "value": "..." } | Inject/update keyed context in the LLM prompt |
Usage Patterns
Vanilla JavaScript
import { VoiceSession } from "@pinecall/voice-core";
const session = new VoiceSession({ agent: "florencia" });
// UI binding
const btn = document.getElementById("call-btn")!;
const transcript = document.getElementById("transcript")!;
btn.onclick = async () => {
if (session.getState().status === "connected") {
session.disconnect();
btn.textContent = "Start Call";
} else {
await session.connect();
btn.textContent = "End Call";
}
};
session.addEventListener("message", (e) => {
const msg = e.detail.message;
const div = document.createElement("div");
div.className = msg.role;
div.textContent = `${msg.role}: ${msg.text}`;
transcript.appendChild(div);
});
session.addEventListener("phase", (e) => {
document.body.dataset.phase = e.detail.phase;
});
React with useSyncExternalStore
import { useSyncExternalStore, useCallback, useState, useEffect } from "react";
import { VoiceSession } from "@pinecall/voice-core";
function useVoiceSession(agent: string) {
const [session] = useState(() => new VoiceSession({ agent }));
const state = useSyncExternalStore(
useCallback((cb) => session.subscribe(cb), [session]),
() => session.getState(),
);
useEffect(() => () => session.destroy(), [session]);
return { ...state, session };
}
Vue 3 Composable
import { ref, onUnmounted } from "vue";
import { VoiceSession } from "@pinecall/voice-core";
export function useVoiceSession(agent: string) {
const session = new VoiceSession({ agent });
const state = ref(session.getState());
session.subscribe(() => {
state.value = session.getState();
});
onUnmounted(() => session.destroy());
return { state, session };
}
Svelte Store
import { readable } from "svelte/store";
import { VoiceSession } from "@pinecall/voice-core";
export function createVoiceSession(agent: string) {
const session = new VoiceSession({ agent });
const state = readable(session.getState(), (set) => {
return session.subscribe(() => set(session.getState()));
});
return { state, session };
}
WebRTC Connection Flow
TypeScript Types
All types are exported from the package:
import type {
VoiceSessionOptions,
VoiceSessionState,
SessionStatus, // "idle" | "connecting" | "connected" | "error"
CallPhase, // "idle" | "listening" | "speaking" | "pause" | "thinking"
TranscriptMessage,
} from "@pinecall/voice-core";