handy_scripts_CLIAI/openai_stt_cli.py at main · CLIAI/handy_scripts_CLIAI · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
#!/usr/bin/env -S uv run
# /// script
# dependencies = [
#   "numpy>=1.24",
#   "soundfile>=0.12",
#   "sounddevice>=0.4",
#   "prompt_toolkit>=3.0",
#   "openai>=1.0",
# ]
# requires-python = ">=3.11"
# ///
import os
import queue
import tempfile
import time
import argparse
import shutil
import subprocess
import numpy as np
from openai import OpenAI

# Command-line interface. Each entry pairs the option flags with the keyword
# settings handed to ``add_argument``; registration order is the order shown
# in ``--help``.
_CLI_OPTIONS = (
    (('-s', '--silent'),
     {'action': 'store_true', 'help': 'Run in silent mode without prompt'}),
    (('-o', '--output'),
     {'type': str, 'help': 'Specify output file for transcript'}),
    (('-a', '--append'),
     {'type': str, 'help': 'Specify output file to append transcript'}),
    (('-c', '--clipboard'),
     {'action': 'store_true', 'help': 'Copy result to clipboard using xclip'}),
)

parser = argparse.ArgumentParser(description='OpenAI STT/ASR CLI (Voice to Text)')
for _flags, _settings in _CLI_OPTIONS:
    parser.add_argument(*_flags, **_settings)

# Parsed once at module load; the rest of the script reads this global.
args = parser.parse_args()

# Shared API client used for the transcription request.
client = OpenAI()

# Import soundfile defensively: if the import fails with OSError or
# ModuleNotFoundError, ``sf`` is set to None instead of aborting the script.
# Voice.__init__ checks for that sentinel and reports the problem to the user.
# NOTE(review): the OSError case presumably covers a missing native
# libsndfile library -- confirm against the soundfile documentation.
try:
    import soundfile as sf
except (OSError, ModuleNotFoundError):
    sf = None

# Used to show the live, self-refreshing "Recording..." prompt while recording.
from prompt_toolkit.shortcuts import prompt

# from .dump import dump  # noqa: F401


class SoundDeviceError(Exception):
    """Raised when the audio recording back end cannot be set up."""


class Voice:
    """Record audio from the default input device and transcribe it.

    Recording runs until the user presses ENTER. While recording, a simple
    level meter is derived from the RMS of each incoming audio block,
    normalised against the quietest and loudest blocks seen so far.
    """

    # Running extremes of the per-block RMS level; they start as class-level
    # defaults and are shadowed by instance attributes on the first callback.
    max_rms = 0
    min_rms = 1e5
    # Level of the most recent block, scaled into [min_rms, max_rms].
    pct = 0

    # Levels below this fraction are drawn as an empty meter.
    threshold = 0.15

    def __init__(self):
        """Verify that the audio libraries are usable.

        Raises:
            SoundDeviceError: if ``soundfile`` could not be imported at module
                load, or if importing ``sounddevice`` fails for any reason.
        """
        if sf is None:
            print("The soundfile module is not available. Please install it using 'pip install soundfile'.")
            raise SoundDeviceError("The soundfile module is required but not installed.")
        try:
            if not args.silent:
                print("Initializing sound device...")
            # Imported lazily so a failure surfaces as SoundDeviceError here
            # rather than as an import error at module load.
            import sounddevice as sd
            self.sd = sd
        except Exception as e:
            print(f"An error occurred while initializing the sound device: {e}")
            # Chain the original error so its traceback is not lost.
            raise SoundDeviceError("Failed to initialize the sound device.") from e

    def callback(self, indata, frames, time, status):
        """This is called (from a separate thread) for each audio block.

        Updates the level-meter state and queues a copy of the block for
        later writing. ``frames``, ``time`` and ``status`` are accepted to
        match the stream callback signature but are not used; ``time`` shadows
        the ``time`` module only inside this method.
        """
        # assumes indata is a numeric numpy array of samples -- TODO confirm
        rms = np.sqrt(np.mean(indata**2))
        self.max_rms = max(self.max_rms, rms)
        self.min_rms = min(self.min_rms, rms)

        rng = self.max_rms - self.min_rms
        if rng > 0.001:
            self.pct = (rms - self.min_rms) / rng
        else:
            # Not enough spread yet to normalise; show a mid-level reading.
            self.pct = 0.5

        # Copy the block before queueing it so the queued data does not alias
        # the array passed in by the caller.
        self.q.put(indata.copy())

    def get_prompt(self):
        """Return the prompt text: elapsed seconds plus a 10-cell level bar."""
        num = 10
        if np.isnan(self.pct) or self.pct < self.threshold:
            cnt = 0
        else:
            cnt = int(self.pct * num)

        bar = "#" * cnt + "_" * (num - cnt)
        bar = bar[:num]

        dur = time.time() - self.start_time
        return f"Recording, press ENTER when done... {dur:.1f}sec {bar}"

    def raw_record_and_transcribe(self, history, language):
        """Record until ENTER is pressed, then return the transcript text.

        Audio is captured at 16 kHz, one channel, written to a WAV file in a
        private temporary directory, and sent to the ``whisper-1``
        transcription endpoint. The temporary directory and the file inside
        it are removed afterwards, even if the request fails.

        Args:
            history: Currently unused; kept for interface compatibility.
            language: Currently unused; kept for interface compatibility.

        Returns:
            The transcript text, or the literal string "No audio recorded."
            when no audio block was captured.

        KeyboardInterrupt is not handled here; it propagates to the caller.
        """
        self.q = queue.Queue()

        sample_rate = 16000  # 16kHz

        self.start_time = time.time()

        with self.sd.InputStream(samplerate=sample_rate, channels=1, callback=self.callback):
            if not args.silent:
                prompt(self.get_prompt, refresh_interval=0.1)
            else:
                input("")  # silently wait for ENTER in silent mode.

        # Only proceed with transcription if there are audio frames in the queue
        if self.q.empty():
            return "No audio recorded."

        # A private temporary directory avoids the race inherent in
        # tempfile.mktemp() and guarantees the recording is cleaned up.
        with tempfile.TemporaryDirectory() as tmp_dir:
            filename = os.path.join(tmp_dir, "recording.wav")

            with sf.SoundFile(filename, mode="x", samplerate=sample_rate, channels=1) as file:
                while not self.q.empty():
                    file.write(self.q.get())

            with open(filename, "rb") as fh:
                transcript = client.audio.transcriptions.create(model="whisper-1", file=fh)

        return transcript.text


if __name__ == "__main__":
    # Script entry point: record, transcribe, then deliver the transcript to
    # the clipboard and/or a file, or print it to stdout.
    #
    # NOTE(review): the module-level OpenAI() client is constructed before
    # this check runs and may itself fail first when the key is missing --
    # confirm whether this check is still reachable.
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise ValueError("Please set the OPENAI_API_KEY environment variable.")

    # Fail fast: verify xclip exists *before* recording so a finished
    # transcription is not thrown away because the clipboard tool is missing.
    if args.clipboard and shutil.which('xclip') is None:
        print("xclip is not available. Please install xclip or use a different method to copy to clipboard.")
        raise SystemExit(1)

    try:
        transcript = Voice().raw_record_and_transcribe(history="", language="en")
    except KeyboardInterrupt:
        # Ctrl+C during recording is a normal way to abort; exit cleanly.
        print("\nRecording interrupted by user.")
        raise SystemExit(0) from None

    if args.clipboard:
        # Argument list instead of a shell string: no shell is needed here.
        subprocess.run(
            ["xclip", "-selection", "clipboard"],
            input=transcript.strip(),
            text=True,
        )

    if args.output or args.append:
        # Normalise to exactly one trailing newline before writing.
        transcript = transcript.rstrip('\n') + '\n'
        # --append takes precedence over --output when both are given.
        file_mode = 'a' if args.append else 'w'
        output_file = args.append if args.append else args.output
        with open(output_file, file_mode) as f:
            f.write(transcript)
    else:
        print(transcript)