Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@ SPEECH_KEY=
SPEECH_REGION=
# Fish TTS
FISH_API_KEY=
# 60dB Text to Speech
SIXTYDB_API_KEY=
# Optional, defaults to https://api.60db.ai
SIXTYDB_API_URL=
# Deepgram Speech to Text
DG_API_KEY=
# Huggingface
Expand Down
2 changes: 1 addition & 1 deletion backend/app/api/endpoints/live.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,7 @@ async def websocket_endpoint(websocket: WebSocket):
language_code, title=title, subtitle=subtitle, trait=trait
)

if tts_model == "AZURE":
if tts_model in ("AZURE", "SIXTYDB"):
content += get_language_spoken_prompt_prefix(language_code)
else:
content += get_user_native_language_prompt_prefix(language_code)
Expand Down
6 changes: 6 additions & 0 deletions backend/app/core/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,12 @@ class Settings(BaseSettings):
SPEECH_KEY: str = Field(default_factory=lambda: os.getenv("SPEECH_KEY", ""))
SPEECH_REGION: str = Field(default_factory=lambda: os.getenv("SPEECH_REGION", ""))
FISH_API_KEY: str = Field(default_factory=lambda: os.getenv("FISH_API_KEY", ""))
SIXTYDB_API_KEY: str = Field(
default_factory=lambda: os.getenv("SIXTYDB_API_KEY", "")
)
SIXTYDB_API_URL: str = Field(
default_factory=lambda: os.getenv("SIXTYDB_API_URL", "https://api.60db.ai")
)

# Analytics
HF_ACCESS_TOKEN: str = Field(
Expand Down
80 changes: 80 additions & 0 deletions backend/app/services/tts.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import emoji
import nltk
import ormsgpack
import requests
import websockets
from app.celery.tasks import emotion_detection
from app.core.config import settings
Expand Down Expand Up @@ -323,6 +324,85 @@ def fish_tts(
)


def sixtydb_tts(
    sentence: str,
    is_start: bool,
    task_id_queue: asyncio.Queue,
    tts_code: str,
    device: str,
    bytes_queue: asyncio.Queue,
    user: dict,
    personality_translation: dict,
):
    """Synthesize one sentence with the 60dB HTTP API and enqueue the audio.

    The call is blocking: it POSTs the sentence to
    ``{settings.SIXTYDB_API_URL}/tts-synthesize``, decodes the base64 WAV
    payload, converts it to 16 kHz / 16-bit / mono PCM and pushes the result
    onto ``bytes_queue``.

    Args:
        sentence: Text to speak. Surrounding whitespace is stripped before it
            is sent to the API; the unstripped text is attached to the
            enqueued "response" messages.
        is_start: When true, a "START" info message is enqueued before the
            audio.
        task_id_queue: Queue that receives the emotion-detection task id
            (only on the ``"web"`` path).
        tts_code: Value sent to the API as ``voice_id``.
        device: ``"web"`` sends the whole utterance as one base64 string; any
            other value sends raw PCM in 1024-byte chunks.
        bytes_queue: Queue the audio and control messages are pushed onto.
        user: User record; ``user["most_recent_chat_group_id"]`` is read for
            the emotion-detection task.
        personality_translation: Passed through to the emotion-detection task.

    Returns:
        None. If the JSON payload reports failure or carries no audio, the
        error is printed and nothing is enqueued.

    Raises:
        requests.HTTPError: If the API answers with an error status
            (``raise_for_status``). Other ``requests`` errors and JSON
            decoding errors propagate unchanged.
    """

    task_id = create_emotion_detection_task(
        f"{sentence}",
        user,
        personality_translation,
        "assistant",
        user["most_recent_chat_group_id"],
    )

    # POST /tts-synthesize -> JSON { success, audio_base64, sample_rate, ... }
    # We request wav so pydub can read the header and resample reliably.
    response = requests.post(
        f"{settings.SIXTYDB_API_URL}/tts-synthesize",
        headers={
            "Authorization": f"Bearer {settings.SIXTYDB_API_KEY}",
            "Content-Type": "application/json",
        },
        json={
            "text": sentence.strip(),
            "voice_id": tts_code,
            "output_format": "wav",
            "speed": 1,
            "stability": 50,
            "similarity": 75,
            "enhance": True,
        },
        timeout=30,
    )
    response.raise_for_status()
    result = response.json()

    if not result.get("success") or not result.get("audio_base64"):
        print("60dB TTS error:", result.get("message"))
        return

    audio_bytes = base64.b64decode(result["audio_base64"])

    # Normalise to 16kHz, 16-bit, mono PCM. The sample width is forced to
    # 2 bytes explicitly: the WAV header decides the decoded width, so without
    # this an 8/24/32-bit response would be forwarded as-is and played back
    # as noise by a consumer expecting 16-bit samples.
    audio_segment = AudioSegment.from_file(BytesIO(audio_bytes), format="wav")
    audio_segment = (
        audio_segment.set_channels(1).set_frame_rate(16000).set_sample_width(2)
    )
    audio_data = audio_segment.raw_data

    if is_start:
        enqueue_bytes(bytes_queue, device, "info", None, "START", "START")

    if device == "web":
        # Web clients get the whole utterance in a single base64 message.
        base64_audio = base64.b64encode(audio_data).decode("utf-8")
        enqueue_bytes(
            bytes_queue, device, "response", base64_audio, sentence, None, task_id
        )
        enqueue_task(task_id_queue, task_id)
    else:
        # NOTE(review): task_id is not pushed onto task_id_queue on this
        # path, unlike the web path -- confirm that is intended.
        chunk_size = 1024  # Adjust this value based on your needs
        for i in range(0, len(audio_data), chunk_size):
            chunk = audio_data[i : i + chunk_size]
            # NOTE(review): i is a byte offset that advances by 1024, so this
            # only fires at offsets divisible by both 1024 and 500 (0,
            # 128000, ...). Confirm this is the intended gap cadence.
            if i % 500 == 0:
                enqueue_bytes(bytes_queue, device, "gap", None, None, None, None)

            enqueue_bytes(
                bytes_queue, device, "response", chunk, sentence, None, task_id
            )


async def stream_audio(audio_stream, bytes_queue: asyncio.Queue, user, device):
"""
Stream audio data
Expand Down
121 changes: 58 additions & 63 deletions backend/app/utils/ws_conv_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
check_task_result,
create_emotion_detection_task,
fish_tts,
sixtydb_tts,
text_to_speech_stream,
)
from app.utils.enqueue import enqueue_bytes, enqueue_task
Expand Down Expand Up @@ -304,6 +305,39 @@ async def interrupt(self, websocket, response_task_azure, bytes_queue):
# Reset the event after interrupting
self.interrupt_event.clear()

def _dispatch_tts(
    self,
    sentence,
    is_start,
    task_id_queue,
    voice_id,
    bytes_queue,
    user,
    personality_translation,
):
    """Send one sentence to the TTS engine chosen by ``self.tts_model``.

    ``"AZURE"`` selects ``azure_tts`` and ``"SIXTYDB"`` selects
    ``sixtydb_tts``; every other value falls back to ``fish_tts``. Each
    engine is invoked with the same positional argument list, with
    ``self.device`` inserted after ``voice_id``.
    """
    selected = self.tts_model
    synthesize = (
        azure_tts
        if selected == "AZURE"
        else sixtydb_tts if selected == "SIXTYDB" else fish_tts
    )

    # Positional on purpose: the engines are called by position, not by
    # parameter name.
    call_args = (
        sentence,
        is_start,
        task_id_queue,
        voice_id,
        self.device,
        bytes_queue,
        user,
        personality_translation,
    )
    synthesize(*call_args)

def chat_completion(
self,
messages: list,
Expand Down Expand Up @@ -341,28 +375,15 @@ def chat_completion(

if len(sentences) > 1:
for sentence in sentences[:-1]:
if self.tts_model == "AZURE":
azure_tts(
sentence,
is_start,
task_id_queue,
voice_id,
self.device,
bytes_queue,
user,
personality_translation,
)
else:
fish_tts(
sentence,
is_start,
task_id_queue,
voice_id,
self.device,
bytes_queue,
user,
personality_translation,
)
self._dispatch_tts(
sentence,
is_start,
task_id_queue,
voice_id,
bytes_queue,
user,
personality_translation,
)

if is_start == True:
is_start = False
Expand All @@ -372,55 +393,29 @@ def chat_completion(
if accumulated_text:
accumulated_text_ = "".join(accumulated_text)
print("Sentence:", accumulated_text_)
if self.tts_model == "AZURE":
azure_tts(
accumulated_text_,
is_start,
task_id_queue,
voice_id,
self.device,
bytes_queue,
user,
personality_translation,
)
else:
fish_tts(
accumulated_text_,
is_start,
task_id_queue,
voice_id,
self.device,
bytes_queue,
user,
personality_translation,
)

except Exception as e:
print("Error in chat_completion:", e)
response_text = get_error_prompt_prefix(self.language_code)
if self.tts_model == "AZURE":
azure_tts(
response_text,
self._dispatch_tts(
accumulated_text_,
is_start,
task_id_queue,
voice_id,
self.device,
bytes_queue,
user,
personality_translation,
)
else:
fish_tts(
response_text,
is_start,
task_id_queue,
voice_id,
self.device,
bytes_queue,
user,
personality_translation,
)

except Exception as e:
print("Error in chat_completion:", e)
response_text = get_error_prompt_prefix(self.language_code)
self._dispatch_tts(
response_text,
is_start,
task_id_queue,
voice_id,
bytes_queue,
user,
personality_translation,
)

enqueue_bytes(bytes_queue, self.device, "info", None, "END", "END")

return response_text
Expand Down
Loading