-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathopenai_stt_cli.py
More file actions
executable file
·155 lines (126 loc) · 5.07 KB
/
openai_stt_cli.py
File metadata and controls
executable file
·155 lines (126 loc) · 5.07 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
#!/usr/bin/env -S uv run
# /// script
# dependencies = [
# "numpy>=1.24",
# "soundfile>=0.12",
# "sounddevice>=0.4",
# "prompt_toolkit>=3.0",
# "openai>=1.0",
# ]
# requires-python = ">=3.11"
# ///
import os
import queue
import tempfile
import time
import argparse
import shutil
import subprocess
import numpy as np
from openai import OpenAI
# Command-line interface. Arguments are parsed at module level so that the
# resulting ``args`` namespace is available as a global to the code below.
parser = argparse.ArgumentParser(description='OpenAI STT/ASR CLI (Voice to Text)')
parser.add_argument('-s', '--silent', action='store_true', help='Run in silent mode without prompt')
parser.add_argument('-o', '--output', type=str, help='Specify output file for transcript')
parser.add_argument('-a', '--append', type=str, help='Specify output file to append transcript')
parser.add_argument('-c', '--clipboard', action='store_true', help='Copy result to clipboard using xclip')
args = parser.parse_args()

# Validate the API key *before* building the client: OpenAI() raises its own
# error when no key is configured, which would otherwise pre-empt the clearer
# message below. Doing this after parse_args() keeps --help usable without a key.
if not os.getenv("OPENAI_API_KEY"):
    raise ValueError("Please set the OPENAI_API_KEY environment variable.")

# Shared API client used for the transcription request.
client = OpenAI()
try:
import soundfile as sf
except (OSError, ModuleNotFoundError):
sf = None
from prompt_toolkit.shortcuts import prompt
# from .dump import dump # noqa: F401
class SoundDeviceError(Exception):
    """Raised when the audio modules needed for recording cannot be set up."""
class Voice:
    """Record audio from the default input device and transcribe it.

    Relies on module-level names defined elsewhere in this file: ``args``
    (parsed CLI flags), ``client`` (the OpenAI client), ``sf`` (the
    ``soundfile`` module, or ``None`` when its import failed) and ``prompt``
    (from ``prompt_toolkit``).
    """

    # Loudest / quietest per-block RMS seen so far; used to normalise the meter.
    max_rms = 0
    min_rms = 1e5
    # Level of the most recent block as a fraction of the observed RMS range.
    pct = 0
    # Levels below this fraction are drawn as an empty meter.
    threshold = 0.15

    def __init__(self):
        """Verify ``soundfile`` is available and import ``sounddevice``.

        Raises:
            SoundDeviceError: if ``soundfile`` could not be imported, or if
                importing ``sounddevice`` fails for any reason.
        """
        if sf is None:
            print("The soundfile module is not available. Please install it using 'pip install soundfile'.")
            raise SoundDeviceError("The soundfile module is required but not installed.")
        try:
            if not args.silent:
                print("Initializing sound device...")
            # Imported lazily so a failure here can be reported as a
            # SoundDeviceError instead of breaking module import.
            import sounddevice as sd

            self.sd = sd
        except Exception as e:
            print(f"An error occurred while initializing the sound device: {e}")
            # Chain the original error so its traceback is not lost.
            raise SoundDeviceError("Failed to initialize the sound device.") from e

    def callback(self, indata, frames, time, status):
        """This is called (from a separate thread) for each audio block.

        Updates the running RMS range and the current level ``pct``, then
        queues a copy of the block for later writing to disk.
        """
        rms = np.sqrt(np.mean(indata**2))
        self.max_rms = max(self.max_rms, rms)
        self.min_rms = min(self.min_rms, rms)

        rng = self.max_rms - self.min_rms
        if rng > 0.001:
            self.pct = (rms - self.min_rms) / rng
        else:
            # Range too small to normalise against; show a mid-level reading.
            self.pct = 0.5

        self.q.put(indata.copy())

    def get_prompt(self):
        """Return the status line: elapsed seconds plus a 10-character level bar."""
        num = 10
        if np.isnan(self.pct) or self.pct < self.threshold:
            cnt = 0
        else:
            cnt = int(self.pct * num)

        bar = "#" * cnt + "_" * (num - cnt)
        # Clamp to the bar width in case pct is above 1.
        bar = bar[:num]

        dur = time.time() - self.start_time
        return f"Recording, press ENTER when done... {dur:.1f}sec {bar}"

    def raw_record_and_transcribe(self, history, language):
        """Record until ENTER is pressed, then return the transcript text.

        ``history`` and ``language`` are accepted but not used by this method.
        KeyboardInterrupt is not caught here; it propagates to the caller.

        Returns:
            The transcript text, or ``"No audio recorded."`` when no audio
            blocks were captured.
        """
        self.q = queue.Queue()
        sample_rate = 16000  # 16kHz
        self.start_time = time.time()

        with self.sd.InputStream(samplerate=sample_rate, channels=1, callback=self.callback):
            if not args.silent:
                prompt(self.get_prompt, refresh_interval=0.1)
            else:
                input("")  # silently wait for ENTER in silent mode.

        # Only proceed with transcription if there are audio frames in the queue
        if self.q.empty():
            return "No audio recorded."

        # A private temporary directory avoids the race inherent in
        # tempfile.mktemp() and guarantees the recording is removed afterwards,
        # even if writing the file or the API call fails.
        with tempfile.TemporaryDirectory() as tmp_dir:
            filename = os.path.join(tmp_dir, "recording.wav")
            with sf.SoundFile(filename, mode="x", samplerate=sample_rate, channels=1) as file:
                while not self.q.empty():
                    file.write(self.q.get())

            with open(filename, "rb") as fh:
                transcript = client.audio.transcriptions.create(model="whisper-1", file=fh)

        return transcript.text
if __name__ == "__main__":
    # Script entry point: record and transcribe once, then deliver the
    # transcript to the clipboard, a file, and/or stdout according to ``args``.
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise ValueError("Please set the OPENAI_API_KEY environment variable.")

    try:
        transcript = Voice().raw_record_and_transcribe(history="", language="en")
    except KeyboardInterrupt:
        # Ctrl+C while recording: leave without transcribing.
        print("\nRecording interrupted by user.")
        # SystemExit instead of exit(): exit() is injected by the ``site``
        # module and is not guaranteed to exist in every interpreter setup.
        raise SystemExit(0) from None

    if args.clipboard:
        if shutil.which('xclip'):
            transcript_trimmed = transcript.strip()
            # Argument list without a shell: no shell parsing is needed here.
            subprocess.run(
                ["xclip", "-selection", "clipboard"],
                input=transcript_trimmed,
                text=True,
            )
        else:
            print("xclip is not available. Please install xclip or use a different method to copy to clipboard.")
            raise SystemExit(1)

    if args.output or args.append:
        # Ensure exactly one trailing newline in the written transcript.
        transcript = transcript.rstrip('\n') + '\n'
        # --append takes precedence when both --append and --output are given.
        file_mode = 'a' if args.append else 'w'
        output_file = args.append if args.append else args.output
        with open(output_file, file_mode) as f:
            f.write(transcript)
    else:
        print(transcript)