StarmoonAI · manishEMS47 · Jun 9, 2026
diff --git a/.env.example b/.env.example
@@ -10,6 +10,10 @@ SPEECH_KEY=
 SPEECH_REGION=
 # Fish TTS
 FISH_API_KEY=
+# 60dB Text to Speech
+SIXTYDB_API_KEY=
+# Optional override; remove the SIXTYDB_API_URL line below to use the default https://api.60db.ai
+SIXTYDB_API_URL=
 # Deepgram Speech to Text
 DG_API_KEY=
 # Huggingface

diff --git a/backend/app/api/endpoints/live.py b/backend/app/api/endpoints/live.py
@@ -206,7 +206,7 @@ async def websocket_endpoint(websocket: WebSocket):
             language_code, title=title, subtitle=subtitle, trait=trait
         )
 
-        if tts_model == "AZURE":
+        if tts_model in ("AZURE", "SIXTYDB"):
             content += get_language_spoken_prompt_prefix(language_code)
         else:
             content += get_user_native_language_prompt_prefix(language_code)

diff --git a/backend/app/core/config.py b/backend/app/core/config.py
@@ -67,6 +67,12 @@ class Settings(BaseSettings):
     SPEECH_KEY: str = Field(default_factory=lambda: os.getenv("SPEECH_KEY", ""))
     SPEECH_REGION: str = Field(default_factory=lambda: os.getenv("SPEECH_REGION", ""))
     FISH_API_KEY: str = Field(default_factory=lambda: os.getenv("FISH_API_KEY", ""))
+    SIXTYDB_API_KEY: str = Field(
+        default_factory=lambda: os.getenv("SIXTYDB_API_KEY", "")
+    )
+    SIXTYDB_API_URL: str = Field(
+        default_factory=lambda: os.getenv("SIXTYDB_API_URL", "https://api.60db.ai")
+    )
 
     # Analytics
     HF_ACCESS_TOKEN: str = Field(

diff --git a/backend/app/services/tts.py b/backend/app/services/tts.py
@@ -11,6 +11,7 @@
 import emoji
 import nltk
 import ormsgpack
+import requests
 import websockets
 from app.celery.tasks import emotion_detection
 from app.core.config import settings
@@ -323,6 +324,118 @@ def fish_tts(
             )
 
 
+def sixtydb_tts(
+    sentence: str,
+    is_start: bool,
+    task_id_queue: asyncio.Queue,
+    tts_code: str,
+    device: str,
+    bytes_queue: asyncio.Queue,
+    user: dict,
+    personality_translation: dict,
+):
+    """Synthesize one sentence with 60dB (api.60db.ai) and enqueue the audio.
+
+    Blocking, per-sentence synthesis: the sentence is POSTed to the
+    /tts-synthesize endpoint, the returned WAV is converted to 16kHz,
+    16-bit, mono PCM, and the result is put on bytes_queue.
+
+    Args:
+        sentence: Text to speak; stripped before it is sent to the API.
+        is_start: When true, a "START" info message is enqueued before the
+            audio.
+        task_id_queue: Receives the emotion-detection task id (only when
+            device is "web").
+        tts_code: Sent to the API as "voice_id".
+        device: "web" gets one base64-encoded payload; any other value gets
+            raw PCM in 1024-byte chunks.
+        bytes_queue: Queue that receives the control and audio messages.
+        user: User record; "most_recent_chat_group_id" is read from it.
+        personality_translation: Passed to the emotion-detection task.
+
+    Returns:
+        None. Nothing is enqueued when the API reports a failure or sends
+        no audio.
+
+    Raises:
+        requests.RequestException: If the request fails or the API answers
+            with an HTTP error status.
+    """
+
+    task_id = create_emotion_detection_task(
+        f"{sentence}",
+        user,
+        personality_translation,
+        "assistant",
+        user["most_recent_chat_group_id"],
+    )
+
+    # An empty SIXTYDB_API_URL (the value .env.example ships) would yield a
+    # schemeless URL, so fall back to the public endpoint; the trailing
+    # slash is dropped to avoid "//tts-synthesize".
+    base_url = (settings.SIXTYDB_API_URL or "https://api.60db.ai").rstrip("/")
+
+    # POST /tts-synthesize -> JSON { success, audio_base64, sample_rate, ... }
+    # We request wav so pydub can read the header and resample reliably.
+    response = requests.post(
+        f"{base_url}/tts-synthesize",
+        headers={
+            "Authorization": f"Bearer {settings.SIXTYDB_API_KEY}",
+            "Content-Type": "application/json",
+        },
+        json={
+            "text": sentence.strip(),
+            "voice_id": tts_code,
+            "output_format": "wav",
+            "speed": 1,
+            "stability": 50,
+            "similarity": 75,
+            "enhance": True,
+        },
+        timeout=30,
+    )
+    response.raise_for_status()
+    result = response.json()
+
+    if not result.get("success") or not result.get("audio_base64"):
+        print("60dB TTS error:", result.get("message"))
+        return
+
+    audio_bytes = base64.b64decode(result["audio_base64"])
+
+    # Normalise to 16kHz, 16-bit, mono PCM, the format the ESP32 / web client
+    # expect. The sample width is forced as well: a WAV with a different bit
+    # depth would otherwise reach the client unconverted.
+    audio_segment = AudioSegment.from_file(BytesIO(audio_bytes), format="wav")
+    audio_segment = (
+        audio_segment.set_channels(1).set_frame_rate(16000).set_sample_width(2)
+    )
+    audio_data = audio_segment.raw_data
+
+    if is_start:
+        enqueue_bytes(bytes_queue, device, "info", None, "START", "START")
+
+    if device == "web":
+        base64_audio = base64.b64encode(audio_data).decode("utf-8")
+        enqueue_bytes(
+            bytes_queue, device, "response", base64_audio, sentence, None, task_id
+        )
+        enqueue_task(task_id_queue, task_id)
+    else:
+        chunk_size = 1024  # bytes of PCM per "response" message
+        for i in range(0, len(audio_data), chunk_size):
+            chunk = audio_data[i : i + chunk_size]
+            # NOTE(review): i advances in steps of 1024, so this only holds
+            # at i == 0 and every 128000 bytes after; confirm that is the
+            # intended "gap" cadence.
+            if i % 500 == 0:
+                enqueue_bytes(bytes_queue, device, "gap", None, None, None, None)
+
+            enqueue_bytes(
+                bytes_queue, device, "response", chunk, sentence, None, task_id
+            )
+
+
 async def stream_audio(audio_stream, bytes_queue: asyncio.Queue, user, device):
     """
     Stream audio data

diff --git a/backend/app/utils/ws_conv_manager.py b/backend/app/utils/ws_conv_manager.py
@@ -20,6 +20,7 @@
     check_task_result,
     create_emotion_detection_task,
     fish_tts,
+    sixtydb_tts,
     text_to_speech_stream,
 )
 from app.utils.enqueue import enqueue_bytes, enqueue_task
@@ -304,6 +305,39 @@ async def interrupt(self, websocket, response_task_azure, bytes_queue):
         # Reset the event after interrupting
         self.interrupt_event.clear()
 
+    def _dispatch_tts(
+        self,
+        sentence,
+        is_start,
+        task_id_queue,
+        voice_id,
+        bytes_queue,
+        user,
+        personality_translation,
+    ):
+        """Synthesize one sentence with the engine named by self.tts_model.
+
+        "AZURE" selects azure_tts and "SIXTYDB" selects sixtydb_tts; any other
+        value falls back to fish_tts. The engine receives the arguments given
+        here, with self.device inserted between voice_id and bytes_queue.
+        """
+        # Engines that must be named explicitly; fish_tts is the fallback.
+        engines_by_model = {
+            "AZURE": azure_tts,
+            "SIXTYDB": sixtydb_tts,
+        }
+        synthesize = engines_by_model.get(self.tts_model, fish_tts)
+        synthesize(
+            sentence,
+            is_start,
+            task_id_queue,
+            voice_id,
+            self.device,
+            bytes_queue,
+            user,
+            personality_translation,
+        )
+
     def chat_completion(
         self,
         messages: list,
@@ -341,28 +375,15 @@ def chat_completion(
 
                     if len(sentences) > 1:
                         for sentence in sentences[:-1]:
-                            if self.tts_model == "AZURE":
-                                azure_tts(
-                                    sentence,
-                                    is_start,
-                                    task_id_queue,
-                                    voice_id,
-                                    self.device,
-                                    bytes_queue,
-                                    user,
-                                    personality_translation,
-                                )
-                            else:
-                                fish_tts(
-                                    sentence,
-                                    is_start,
-                                    task_id_queue,
-                                    voice_id,
-                                    self.device,
-                                    bytes_queue,
-                                    user,
-                                    personality_translation,
-                                )
+                            self._dispatch_tts(
+                                sentence,
+                                is_start,
+                                task_id_queue,
+                                voice_id,
+                                bytes_queue,
+                                user,
+                                personality_translation,
+                            )
 
                             if is_start == True:
                                 is_start = False
@@ -372,55 +393,29 @@ def chat_completion(
             if accumulated_text:
                 accumulated_text_ = "".join(accumulated_text)
                 print("Sentence:", accumulated_text_)
-                if self.tts_model == "AZURE":
-                    azure_tts(
-                        accumulated_text_,
-                        is_start,
-                        task_id_queue,
-                        voice_id,
-                        self.device,
-                        bytes_queue,
-                        user,
-                        personality_translation,
-                    )
-                else:
-                    fish_tts(
-                        accumulated_text_,
-                        is_start,
-                        task_id_queue,
-                        voice_id,
-                        self.device,
-                        bytes_queue,
-                        user,
-                        personality_translation,
-                    )
-
-        except Exception as e:
-            print("Error in chat_completion:", e)
-            response_text = get_error_prompt_prefix(self.language_code)
-            if self.tts_model == "AZURE":
-                azure_tts(
-                    response_text,
+                self._dispatch_tts(
+                    accumulated_text_,
                     is_start,
                     task_id_queue,
                     voice_id,
-                    self.device,
-                    bytes_queue,
-                    user,
-                    personality_translation,
-                )
-            else:
-                fish_tts(
-                    response_text,
-                    is_start,
-                    task_id_queue,
-                    voice_id,
-                    self.device,
                     bytes_queue,
                     user,
                     personality_translation,
                 )
 
+        except Exception as e:
+            print("Error in chat_completion:", e)
+            response_text = get_error_prompt_prefix(self.language_code)
+            self._dispatch_tts(
+                response_text,
+                is_start,
+                task_id_queue,
+                voice_id,
+                bytes_queue,
+                user,
+                personality_translation,
+            )
+
         enqueue_bytes(bytes_queue, self.device, "info", None, "END", "END")
 
         return response_text