Skip to content

Commit 23441de

Browse files
authored
Openai localfunction fixes (#1293)
* Fixed readme link. * Fix local function example to only send session update after session create. * Switched Alice and Bob to Dad jokes.
1 parent 7d3b186 commit 23441de

9 files changed

Lines changed: 248 additions & 171 deletions

File tree

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88
**Connect to OpenAI's Realtime WebRTC Endpoint**
99

10-
The [WebRTCOpenAI](https://github.com/sipsorcery-org/sipsorcery/blob/master/examples/WebRTCExamples/WebRTCOpenAI/Program.cs) demonstrates a dotnet only (no native libraries) application that connects to [OpenAI's new WebRTC Realtime](https://platform.openai.com/docs/guides/realtime-webrtc) endpoint. This demo lets you talk in realtime to ChatGPT and receive both a WebRTC audio stream response and a text transcript. Could video avatars be on the way?! A real Max Headroom!
10+
The [WebRTCOpenAI](https://github.com/sipsorcery-org/sipsorcery/blob/master/examples/OpenAIExamples/WebRTCOpenAI/Program.cs) demonstrates a dotnet only (no native libraries) application that connects to [OpenAI's new WebRTC Realtime](https://platform.openai.com/docs/guides/realtime-webrtc) endpoint. This demo lets you talk in realtime to ChatGPT and receive both a WebRTC audio stream response and a text transcript. Could video avatars be on the way?! A real Max Headroom!
1111

1212
![ChatGPT WebRTC Transcript](./img/openai.png)
1313

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
//-----------------------------------------------------------------------------
2+
// Filename: OpenAIDataChannelManager.cs
3+
//
4+
// Description: Helper methods to manage communications with an OpenAI WebRTC
5+
// peer connection
6+
//
7+
// Author(s):
8+
// Aaron Clauson ([email protected])
9+
//
10+
// History:
11+
// 23 Jan 2025 Aaron Clauson Created, Dublin, Ireland.
12+
//
13+
// License:
14+
// MIT.
15+
//-----------------------------------------------------------------------------
16+
17+
using LanguageExt;
18+
using System.Text.Json;
19+
using System.Text;
20+
21+
namespace demo;
22+
23+
public class OpenAIDataChannelManager
24+
{
25+
public static Option<OpenAIServerEventBase> ParseDataChannelMessage(byte[] data)
26+
{
27+
var message = Encoding.UTF8.GetString(data);
28+
29+
//logger.LogDebug($"Data channel message: {message}");
30+
31+
var serverEvent = JsonSerializer.Deserialize<OpenAIServerEventBase>(message, JsonOptions.Default);
32+
33+
if (serverEvent != null)
34+
{
35+
//logger.LogInformation($"Server event ID {serverEvent.EventID} and type {serverEvent.Type}.");
36+
37+
return serverEvent.Type switch
38+
{
39+
OpenAIConversationItemCreated.TypeName => JsonSerializer.Deserialize<OpenAIConversationItemCreated>(message, JsonOptions.Default),
40+
OpenAIInputAudioBufferCommitted.TypeName => JsonSerializer.Deserialize<OpenAIInputAudioBufferCommitted>(message, JsonOptions.Default),
41+
OpenAIInputAudioBufferSpeechStarted.TypeName => JsonSerializer.Deserialize<OpenAIInputAudioBufferSpeechStarted>(message, JsonOptions.Default),
42+
OpenAIInputAudioBufferSpeechStopped.TypeName => JsonSerializer.Deserialize<OpenAIInputAudioBufferSpeechStopped>(message, JsonOptions.Default),
43+
OpenAIOuputAudioBufferAudioStarted.TypeName => JsonSerializer.Deserialize<OpenAIOuputAudioBufferAudioStarted>(message, JsonOptions.Default),
44+
OpenAIOuputAudioBufferAudioStopped.TypeName => JsonSerializer.Deserialize<OpenAIOuputAudioBufferAudioStopped>(message, JsonOptions.Default),
45+
OpenAIRateLimitsUpdated.TypeName => JsonSerializer.Deserialize<OpenAIRateLimitsUpdated>(message, JsonOptions.Default),
46+
OpenAIResponseAudioDone.TypeName => JsonSerializer.Deserialize<OpenAIResponseAudioDone>(message, JsonOptions.Default),
47+
OpenAIResponseAudioTranscriptDelta.TypeName => JsonSerializer.Deserialize<OpenAIResponseAudioTranscriptDelta>(message, JsonOptions.Default),
48+
OpenAIResponseAudioTranscriptDone.TypeName => JsonSerializer.Deserialize<OpenAIResponseAudioTranscriptDone>(message, JsonOptions.Default),
49+
OpenAIResponseContentPartAdded.TypeName => JsonSerializer.Deserialize<OpenAIResponseContentPartAdded>(message, JsonOptions.Default),
50+
OpenAIResponseContentPartDone.TypeName => JsonSerializer.Deserialize<OpenAIResponseContentPartDone>(message, JsonOptions.Default),
51+
OpenAIResponseCreated.TypeName => JsonSerializer.Deserialize<OpenAIResponseCreated>(message, JsonOptions.Default),
52+
OpenAIResponseDone.TypeName => JsonSerializer.Deserialize<OpenAIResponseDone>(message, JsonOptions.Default),
53+
OpenAIResponseFunctionCallArgumentsDelta.TypeName => JsonSerializer.Deserialize<OpenAIResponseFunctionCallArgumentsDelta>(message, JsonOptions.Default),
54+
OpenAIResponseFunctionCallArgumentsDone.TypeName => JsonSerializer.Deserialize<OpenAIResponseFunctionCallArgumentsDone>(message, JsonOptions.Default),
55+
OpenAIResponseOutputItemAdded.TypeName => JsonSerializer.Deserialize<OpenAIResponseOutputItemAdded>(message, JsonOptions.Default),
56+
OpenAIResponseOutputItemDone.TypeName => JsonSerializer.Deserialize<OpenAIResponseOutputItemDone>(message, JsonOptions.Default),
57+
OpenAISessionCreated.TypeName => JsonSerializer.Deserialize<OpenAISessionCreated>(message, JsonOptions.Default),
58+
OpenAISessionUpdated.TypeName => JsonSerializer.Deserialize<OpenAISessionUpdated>(message, JsonOptions.Default),
59+
_ => Option<OpenAIServerEventBase>.None
60+
};
61+
62+
}
63+
64+
return Option<OpenAIServerEventBase>.None;
65+
}
66+
}

examples/OpenAIExamples/OpenAI.Realtime/OpenAIRealtimeRestClient.cs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
//-----------------------------------------------------------------------------
22
// Filename: OpenAIRealtimeRestClient.cs
33
//
4-
// Description:
4+
// Description: Used to send requests to the OpenAI Realtime REST server to
5+
// do the initial session set up and the SDP offer/answer exchange.
56
//
67
// Author(s):
78
// Aaron Clauson ([email protected])
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
namespace demo;
2+
3+
public enum OpenAIAudioFormatsEnum
4+
{
5+
pcm16,
6+
g711_ulaw,
7+
g711_alaw
8+
}

examples/OpenAIExamples/OpenAI.Realtime/RealtimeModels/Enums/OpenAIToolChoiceEnum.cs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
public enum OpenAIToolChoiceEnum
44
{
5-
Auto,
6-
None,
7-
Required
5+
auto,
6+
none,
7+
required
88
}

examples/OpenAIExamples/OpenAI.Realtime/RealtimeModels/OpenAISession.cs

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,18 +8,33 @@ namespace demo;
88

99
public class OpenAISession
1010
{
11+
[JsonPropertyName("modalities")]
12+
public string[] Modalities { get; set; } = { "audio", "text" };
13+
14+
[JsonPropertyName("instructions")]
15+
public string? Instructions { get; set; }
16+
1117
[JsonPropertyName("model")]
1218
public string? Model { get; set; }
1319

1420
[JsonPropertyName("voice")]
1521
public OpenAIVoicesEnum? Voice { get; set; }
1622

23+
[JsonPropertyName("input_audio_format")]
24+
public OpenAIAudioFormatsEnum InputAudioFormat { get; set; } = OpenAIAudioFormatsEnum.pcm16;
25+
26+
[JsonPropertyName("output_audio_format")]
27+
public OpenAIAudioFormatsEnum OutputAudioFormat { get; set; } = OpenAIAudioFormatsEnum.pcm16;
28+
1729
[JsonPropertyName("turn_detection")]
1830
public OpenAITurnDetection? TurnDetection { get; set; }
1931

2032
[JsonPropertyName("tools")]
2133
public List<OpenAITool>? Tools { get; set; }
2234

35+
[JsonPropertyName("tool_choice")]
36+
public OpenAIToolChoiceEnum ToolChoice { get; set; } = OpenAIToolChoiceEnum.auto;
37+
2338
public string ToJson()
2439
{
2540
return JsonSerializer.Serialize(this, JsonOptions.Default);

examples/OpenAIExamples/WebRTCOpenAI/Program.cs

Lines changed: 8 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,7 @@ static async Task Main(string[] args)
151151

152152
// NOTE: If you want to trigger the conversation by using the audio from your microphone comment
153153
// out this line.
154-
SendResponseCreate(ctx.Pc.DataChannels.First(), OpenAIVoicesEnum.alloy, "Introduce urself.");
154+
SendResponseCreate(ctx.Pc.DataChannels.First(), OpenAIVoicesEnum.alloy, "Introduce urself. Keep it short.");
155155

156156
return ctx;
157157
})
@@ -276,33 +276,19 @@ private static async Task<RTCPeerConnection> CreatePeerConnection(SemaphoreSlim
276276
/// </summary>
277277
private static void OnDataChannelMessage(RTCDataChannel dc, DataChannelPayloadProtocols protocol, byte[] data)
278278
{
279-
//logger.LogInformation($"Data channel {dc.label}, protocol {protocol} message length {data.Length}.");
279+
logger.LogInformation($"Data channel {dc.label}, protocol {protocol} message length {data.Length}.");
280280

281281
var message = Encoding.UTF8.GetString(data);
282282
var serverEvent = JsonSerializer.Deserialize<OpenAIServerEventBase>(message, JsonOptions.Default);
283283

284-
if (serverEvent != null)
284+
var serverEventModel = OpenAIDataChannelManager.ParseDataChannelMessage(data);
285+
serverEventModel.IfSome(e =>
285286
{
286-
//logger.LogInformation($"Server event ID {serverEvent.EventID} and type {serverEvent.Type}.");
287-
288-
Option<OpenAIServerEventBase> serverEventModel = serverEvent.Type switch
287+
if (e is OpenAIResponseAudioTranscriptDone done)
289288
{
290-
"response.audio_transcript.delta" => JsonSerializer.Deserialize<OpenAIResponseAudioTranscriptDelta>(message, JsonOptions.Default),
291-
"response.audio_transcript.done" => JsonSerializer.Deserialize<OpenAIResponseAudioTranscriptDone>(message, JsonOptions.Default),
292-
_ => Option<OpenAIServerEventBase>.None
293-
};
289+
logger.LogInformation($"Transcript done: {done.Transcript}");
290+
}
291+
});
294292

295-
serverEventModel.IfSome(e =>
296-
{
297-
if (e is OpenAIResponseAudioTranscriptDone done)
298-
{
299-
logger.LogInformation($"Transcript done: {done.Transcript}");
300-
}
301-
});
302-
}
303-
else
304-
{
305-
logger.LogWarning($"Failed to parse server event for: {message}");
306-
}
307293
}
308294
}

examples/OpenAIExamples/WebRTCOpenAIAliceAndBob/Program.cs

Lines changed: 76 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -22,14 +22,11 @@
2222
using System.Threading.Tasks;
2323
using System.Windows.Forms;
2424
using Microsoft.Extensions.Logging;
25-
using Microsoft.Extensions.Logging.Abstractions;
2625
using Serilog;
27-
using Serilog.Extensions.Logging;
2826
using SIPSorcery.Media;
2927
using SIPSorcery.Net;
3028
using AudioScope;
3129
using System.Numerics;
32-
using System.Text.Json;
3330
using System.Text;
3431
using SIPSorceryMedia.Windows;
3532
using LanguageExt;
@@ -70,6 +67,8 @@ class Program
7067
private const string OPENAI_REALTIME_BASE_URL = "https://api.openai.com/v1/realtime";
7168
private const string OPENAI_MODEL = "gpt-4o-realtime-preview-2024-12-17";
7269
private const string OPENAI_DATACHANNEL_NAME = "oai-events";
70+
private const string ALICE_CALL_LABEL = "Alice";
71+
private const string BOB_CALL_LABEL = "Bob";
7372

7473
private static Microsoft.Extensions.Logging.ILogger logger = LoggerFactory.Create(builder =>
7574
builder.AddSerilog(new LoggerConfiguration()
@@ -121,15 +120,15 @@ static async Task Main(string[] args)
121120

122121
// Initialise two peer connection contexts for Alice & Bob.
123122
var aliceInitialContext = new InitPcContext(
124-
"Alice",
123+
ALICE_CALL_LABEL,
125124
string.Empty,
126125
string.Empty,
127126
string.Empty,
128127
OpenAIVoicesEnum.shimmer,
129128
1);
130129

131130
var bobInitialContext = new InitPcContext(
132-
"Bob",
131+
BOB_CALL_LABEL,
133132
string.Empty,
134133
string.Empty,
135134
string.Empty,
@@ -181,10 +180,10 @@ from bobCtx in bobCallTask.Result
181180
// Trigger the conversation by getting Alice to say something witty.
182181
var aliceDataChannel = aliceConncectedCtx.Pc.DataChannels.Where(x => x.label == OPENAI_DATACHANNEL_NAME).Single();
183182

184-
if (aliceDataChannel != null)
185-
{
186-
SendResponseCreate(aliceDataChannel, OpenAIVoicesEnum.shimmer, "Only talk in cheesy puns. Keep it short once you'vegot you pun in. To start the conversation please repeat repeat this phrase in your corniest accent: 'You're a few tinnies short of a six-pack.'");
187-
}
183+
//if (aliceDataChannel != null)
184+
//{
185+
// SendResponseCreate(aliceDataChannel, OpenAIVoicesEnum.shimmer, "Only talk in cheesy puns. Keep it short once you'vegot you pun in. To start the conversation please repeat repeat this phrase in your corniest accent: 'You're a few tinnies short of a six-pack.'");
186+
//}
188187

189188
logger.LogInformation($"ctrl-c to exit..");
190189

@@ -208,27 +207,6 @@ from bobCtx in bobCallTask.Result
208207
_audioScopeForm?.Invoke(() => _audioScopeForm.Close());
209208
}
210209

211-
/// <summary>
212-
/// Sends a response create message to the OpenAI data channel to trigger the conversation.
213-
/// </summary>
214-
private static void SendResponseCreate(RTCDataChannel dc, OpenAIVoicesEnum voice, string message)
215-
{
216-
var responseCreate = new OpenAIResponseCreate
217-
{
218-
EventID = Guid.NewGuid().ToString(),
219-
Response = new OpenAIResponseCreateResponse
220-
{
221-
Instructions = message,
222-
Voice = voice.ToString()
223-
}
224-
};
225-
226-
logger.LogInformation($"Sending initial response create to first call data channel {dc.label}.");
227-
logger.LogDebug(responseCreate.ToJson());
228-
229-
dc.send(responseCreate.ToJson());
230-
}
231-
232210
/// <summary>
233211
/// Initiates the creation and media session wiring for a local peer connection.
234212
/// </summary>
@@ -389,30 +367,80 @@ private static void OnDataChannelMessage(RTCDataChannel dc, DataChannelPayloadPr
389367

390368
//logger.LogDebug(message);
391369

392-
var serverEvent = JsonSerializer.Deserialize<OpenAIServerEventBase>(message, JsonOptions.Default);
370+
var serverEventModel = OpenAIDataChannelManager.ParseDataChannelMessage(data);
393371

394-
if (serverEvent != null)
372+
serverEventModel.IfSome(e =>
395373
{
396-
//logger.LogInformation($"Server event ID {serverEvent.EventID} and type {serverEvent.Type}.");
397-
398-
Option<OpenAIServerEventBase> serverEventModel = serverEvent.Type switch
374+
switch (e)
399375
{
400-
"response.audio_transcript.delta" => JsonSerializer.Deserialize<OpenAIResponseAudioTranscriptDelta>(message, JsonOptions.Default),
401-
"response.audio_transcript.done" => JsonSerializer.Deserialize<OpenAIResponseAudioTranscriptDone>(message, JsonOptions.Default),
402-
_ => Option<OpenAIServerEventBase>.None
403-
};
376+
case OpenAISessionCreated sessionCreated:
377+
//logger.LogInformation($"Session created: {sessionCreated.ToJson()}");
378+
OnSessionCreated(dc);
379+
if (callLabel == ALICE_CALL_LABEL)
380+
{
381+
SendResponseCreate(dc, OpenAIVoicesEnum.alloy, "Tell me your first Dad joke.");
382+
}
383+
break;
404384

405-
serverEventModel.IfSome(e =>
406-
{
407-
if (e is OpenAIResponseAudioTranscriptDone done)
408-
{
409-
logger.LogInformation($"Transcript done {callLabel}: {done.Transcript}");
410-
}
411-
});
412-
}
413-
else
385+
case OpenAISessionUpdated sessionUpdated:
386+
logger.LogInformation($"Session updated: {sessionUpdated.ToJson()}");
387+
break;
388+
389+
case OpenAIResponseAudioTranscriptDone transcriptionDone:
390+
logger.LogInformation($"Transcript done: {transcriptionDone.Transcript}");
391+
break;
392+
393+
default:
394+
//logger.LogInformation($"Data Channel {e.Type} message received.");
395+
break;
396+
}
397+
});
398+
399+
if (serverEventModel.IsNone)
414400
{
415401
logger.LogWarning($"Failed to parse server event for: {message}");
416402
}
417403
}
404+
405+
/// <summary>
406+
/// Sends a session update message that sets the model and the Dad joke bot instructions for the session.
407+
/// </summary>
408+
private static void OnSessionCreated(RTCDataChannel dc)
409+
{
410+
var sessionUpdate = new OpenAISessionUpdate
411+
{
412+
EventID = Guid.NewGuid().ToString(),
413+
Session = new OpenAISession
414+
{
415+
Model = OPENAI_MODEL,
416+
Instructions = "You are a joke bot. Tell a Dad joke every chance you get.",
417+
}
418+
};
419+
420+
logger.LogInformation($"Sending OpenAI session update to data channel {dc.label}.");
421+
logger.LogDebug(sessionUpdate.ToJson());
422+
423+
dc.send(sessionUpdate.ToJson());
424+
}
425+
426+
/// <summary>
427+
/// Sends a response create message to the OpenAI data channel to trigger the conversation.
428+
/// </summary>
429+
private static void SendResponseCreate(RTCDataChannel dc, OpenAIVoicesEnum voice, string message)
430+
{
431+
var responseCreate = new OpenAIResponseCreate
432+
{
433+
EventID = Guid.NewGuid().ToString(),
434+
Response = new OpenAIResponseCreateResponse
435+
{
436+
Instructions = message,
437+
Voice = voice.ToString()
438+
}
439+
};
440+
441+
logger.LogInformation($"Sending initial response create to first call data channel {dc.label}.");
442+
logger.LogDebug(responseCreate.ToJson());
443+
444+
dc.send(responseCreate.ToJson());
445+
}
418446
}

0 commit comments

Comments
 (0)