diff --git a/.env.example b/.env.example index 3c3c2ad..83feb9c 100644 --- a/.env.example +++ b/.env.example @@ -10,6 +10,10 @@ SPEECH_KEY= SPEECH_REGION= # Fish TTS FISH_API_KEY= +# 60dB Text to Speech +SIXTYDB_API_KEY= +# Optional, defaults to https://api.60db.ai +SIXTYDB_API_URL= # Deepgram Speech to Text DG_API_KEY= # Huggingface diff --git a/backend/app/api/endpoints/live.py b/backend/app/api/endpoints/live.py index ef931ed..db66c85 100644 --- a/backend/app/api/endpoints/live.py +++ b/backend/app/api/endpoints/live.py @@ -206,7 +206,7 @@ async def websocket_endpoint(websocket: WebSocket): language_code, title=title, subtitle=subtitle, trait=trait ) - if tts_model == "AZURE": + if tts_model in ("AZURE", "SIXTYDB"): content += get_language_spoken_prompt_prefix(language_code) else: content += get_user_native_language_prompt_prefix(language_code) diff --git a/backend/app/core/config.py b/backend/app/core/config.py index 531aa69..2c3df3b 100644 --- a/backend/app/core/config.py +++ b/backend/app/core/config.py @@ -67,6 +67,12 @@ class Settings(BaseSettings): SPEECH_KEY: str = Field(default_factory=lambda: os.getenv("SPEECH_KEY", "")) SPEECH_REGION: str = Field(default_factory=lambda: os.getenv("SPEECH_REGION", "")) FISH_API_KEY: str = Field(default_factory=lambda: os.getenv("FISH_API_KEY", "")) + SIXTYDB_API_KEY: str = Field( + default_factory=lambda: os.getenv("SIXTYDB_API_KEY", "") + ) + SIXTYDB_API_URL: str = Field( + default_factory=lambda: os.getenv("SIXTYDB_API_URL", "https://api.60db.ai") + ) # Analytics HF_ACCESS_TOKEN: str = Field( diff --git a/backend/app/services/tts.py b/backend/app/services/tts.py index 5987874..257d42d 100644 --- a/backend/app/services/tts.py +++ b/backend/app/services/tts.py @@ -11,6 +11,7 @@ import emoji import nltk import ormsgpack +import requests import websockets from app.celery.tasks import emotion_detection from app.core.config import settings @@ -323,6 +324,85 @@ def fish_tts( ) +def sixtydb_tts( + sentence: str, + is_start: bool, + task_id_queue: asyncio.Queue, + tts_code: str, + device: str, + bytes_queue: asyncio.Queue, + user: dict, + personality_translation: dict, +): + """60dB (api.60db.ai) text-to-speech. + + Mirrors azure_tts / fish_tts: a blocking, per-sentence synthesis that + resamples to the device format (16kHz, 16-bit, mono PCM) and chunks the + audio onto bytes_queue with the same START / gap / response control flow. + """ + + task_id = create_emotion_detection_task( + f"{sentence}", + user, + personality_translation, + "assistant", + user["most_recent_chat_group_id"], + ) + + # POST /tts-synthesize -> JSON { success, audio_base64, sample_rate, ... } + # We request wav so pydub can read the header and resample reliably. + response = requests.post( + f"{settings.SIXTYDB_API_URL}/tts-synthesize", + headers={ + "Authorization": f"Bearer {settings.SIXTYDB_API_KEY}", + "Content-Type": "application/json", + }, + json={ + "text": sentence.strip(), + "voice_id": tts_code, + "output_format": "wav", + "speed": 1, + "stability": 50, + "similarity": 75, + "enhance": True, + }, + timeout=30, + ) + response.raise_for_status() + result = response.json() + + if not result.get("success") or not result.get("audio_base64"): + print("60dB TTS error:", result.get("message")) + return + + audio_bytes = base64.b64decode(result["audio_base64"]) + + # Normalise to 16kHz mono PCM, the format the ESP32 / web client expect. + audio_segment = AudioSegment.from_file(BytesIO(audio_bytes), format="wav") + audio_segment = audio_segment.set_channels(1).set_frame_rate(16000) + audio_data = audio_segment.raw_data + + if is_start: + enqueue_bytes(bytes_queue, device, "info", None, "START", "START") + + if device == "web": + base64_audio = base64.b64encode(audio_data).decode("utf-8") + enqueue_bytes( + bytes_queue, device, "response", base64_audio, sentence, None, task_id + ) + enqueue_task(task_id_queue, task_id) + else: + chunk_size = 1024 # Adjust this value based on your needs + for i in range(0, len(audio_data), chunk_size): + chunk = audio_data[i : i + chunk_size] + if i % 500 == 0: + enqueue_bytes(bytes_queue, device, "gap", None, None, None, None) + + enqueue_bytes( + bytes_queue, device, "response", chunk, sentence, None, task_id + ) + + async def stream_audio(audio_stream, bytes_queue: asyncio.Queue, user, device): """ Stream audio data diff --git a/backend/app/utils/ws_conv_manager.py b/backend/app/utils/ws_conv_manager.py index ccf46a5..317ffdc 100644 --- a/backend/app/utils/ws_conv_manager.py +++ b/backend/app/utils/ws_conv_manager.py @@ -20,6 +20,7 @@ check_task_result, create_emotion_detection_task, fish_tts, + sixtydb_tts, text_to_speech_stream, ) from app.utils.enqueue import enqueue_bytes, enqueue_task @@ -304,6 +305,39 @@ async def interrupt(self, websocket, response_task_azure, bytes_queue): # Reset the event after interrupting self.interrupt_event.clear() + def _dispatch_tts( + self, + sentence, + is_start, + task_id_queue, + voice_id, + bytes_queue, + user, + personality_translation, + ): + """Route a sentence to the TTS engine selected by self.tts_model. + + Engines share the same signature and output contract (16kHz mono PCM + chunked onto bytes_queue), so adding one is just another branch here. + """ + if self.tts_model == "AZURE": + engine = azure_tts + elif self.tts_model == "SIXTYDB": + engine = sixtydb_tts + else: + engine = fish_tts + + engine( + sentence, + is_start, + task_id_queue, + voice_id, + self.device, + bytes_queue, + user, + personality_translation, + ) + def chat_completion( self, messages: list, @@ -341,28 +375,15 @@ def chat_completion( if len(sentences) > 1: for sentence in sentences[:-1]: - if self.tts_model == "AZURE": - azure_tts( - sentence, - is_start, - task_id_queue, - voice_id, - self.device, - bytes_queue, - user, - personality_translation, - ) - else: - fish_tts( - sentence, - is_start, - task_id_queue, - voice_id, - self.device, - bytes_queue, - user, - personality_translation, - ) + self._dispatch_tts( + sentence, + is_start, + task_id_queue, + voice_id, + bytes_queue, + user, + personality_translation, + ) if is_start == True: is_start = False @@ -372,55 +393,29 @@ def chat_completion( if accumulated_text: accumulated_text_ = "".join(accumulated_text) print("Sentence:", accumulated_text_) - if self.tts_model == "AZURE": - azure_tts( - accumulated_text_, - is_start, - task_id_queue, - voice_id, - self.device, - bytes_queue, - user, - personality_translation, - ) - else: - fish_tts( - accumulated_text_, - is_start, - task_id_queue, - voice_id, - self.device, - bytes_queue, - user, - personality_translation, - ) - - except Exception as e: - print("Error in chat_completion:", e) - response_text = get_error_prompt_prefix(self.language_code) - if self.tts_model == "AZURE": - azure_tts( - response_text, + self._dispatch_tts( + accumulated_text_, is_start, task_id_queue, voice_id, - self.device, - bytes_queue, - user, - personality_translation, - ) - else: - fish_tts( - response_text, - is_start, - task_id_queue, - voice_id, - self.device, bytes_queue, user, personality_translation, ) + except Exception as e: + print("Error in chat_completion:", e) + response_text = get_error_prompt_prefix(self.language_code) + self._dispatch_tts( + response_text, + is_start, + task_id_queue, + voice_id, + bytes_queue, + user, + personality_translation, + ) + enqueue_bytes(bytes_queue, self.device, "info", None, "END", "END") return response_text diff --git a/backend/scripts/test_60db_tts.py b/backend/scripts/test_60db_tts.py new file mode 100644 index 0000000..aac86e2 --- /dev/null +++ b/backend/scripts/test_60db_tts.py @@ -0,0 +1,133 @@ +"""Standalone smoke test for the 60dB (api.60db.ai) TTS integration. + +Verifies your API key, lists your voices, and synthesizes one sentence to a +WAV file. Intentionally has NO app imports so it runs without loading torch / +Supabase / the full backend. + +Usage: + cd backend + # SIXTYDB_API_KEY must be set (via .env or the environment) + python scripts/test_60db_tts.py + python scripts/test_60db_tts.py --voice-id --text "Hello there" +""" + +import argparse +import base64 +import os +import sys + +import requests +from dotenv import load_dotenv + +load_dotenv() + +API_KEY = os.getenv("SIXTYDB_API_KEY", "") +API_URL = os.getenv("SIXTYDB_API_URL", "https://api.60db.ai") + + +def get_my_voices(): + """GET /myvoices -> list of voice objects.""" + resp = requests.get( + f"{API_URL}/myvoices", + headers={"Authorization": f"Bearer {API_KEY}"}, + timeout=30, + ) + resp.raise_for_status() + body = resp.json() + if not body.get("success"): + raise RuntimeError(f"/myvoices failed: {body.get('message')}") + return body.get("data", []) + + +def synthesize(text: str, voice_id: str, out_path: str): + """POST /tts-synthesize -> decode audio_base64 -> write wav.""" + payload = { + "text": text, + "output_format": "wav", + "speed": 1, + "stability": 50, + "similarity": 75, + "enhance": True, + } + if voice_id: + payload["voice_id"] = voice_id + + resp = requests.post( + f"{API_URL}/tts-synthesize", + headers={ + "Authorization": f"Bearer {API_KEY}", + "Content-Type": "application/json", + }, + json=payload, + timeout=60, + ) + resp.raise_for_status() + body = resp.json() + if not body.get("success") or not body.get("audio_base64"): + raise RuntimeError(f"/tts-synthesize failed: {body.get('message')}") + + audio = base64.b64decode(body["audio_base64"]) + with open(out_path, "wb") as f: + f.write(audio) + + return body + + +def main(): + parser = argparse.ArgumentParser(description="60dB TTS smoke test") + parser.add_argument( + "--voice-id", + default=None, + help="Voice to use. Defaults to the first voice from /myvoices.", + ) + parser.add_argument( + "--text", + default="Hi! This is a sixty d B text to speech smoke test.", + help="Text to synthesize.", + ) + parser.add_argument( + "--out", + default="sixtydb_test.wav", + help="Output WAV path.", + ) + args = parser.parse_args() + + if not API_KEY: + print("ERROR: SIXTYDB_API_KEY is not set (env or backend/.env).") + sys.exit(1) + + print(f"Base URL: {API_URL}") + + print("\n[1/2] Fetching your voices ...") + voices = get_my_voices() + if not voices: + print(" No voices found on this account.") + for v in voices: + labels = v.get("labels", {}) or {} + print( + f" - {v.get('voice_id')} " + f"name={v.get('name')!r} " + f"model={v.get('model')!r} " + f"lang={labels.get('language')} " + f"gender={labels.get('gender')}" + ) + + voice_id = args.voice_id + if not voice_id and voices: + voice_id = voices[0]["voice_id"] + print(f"\n Using first voice: {voice_id}") + + print(f"\n[2/2] Synthesizing: {args.text!r}") + meta = synthesize(args.text, voice_id, args.out) + print( + f" OK -> {args.out} " + f"(sample_rate={meta.get('sample_rate')}, " + f"duration={meta.get('duration_seconds')}s, " + f"format={meta.get('output_format')})" + ) + print("\nDone. This voice_id is what you'd store in toys.tts_code " + "with toys.tts_model = 'SIXTYDB'.") + + +if __name__ == "__main__": + main()