-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathstt_video_using_speechmatics.sh
More file actions
executable file
·216 lines (194 loc) · 5.63 KB
/
stt_video_using_speechmatics.sh
File metadata and controls
executable file
·216 lines (194 loc) · 5.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
#!/bin/bash
# Name of the transcription backend. It is reused further down for the
# transcript file suffix and in the --status output.
BACKEND="speechmatics"

# The actual transcription is delegated to stt_speechmatics.py, so refuse to
# start when that helper cannot be found on PATH.
command -v stt_speechmatics.py >/dev/null 2>&1 || {
  cat >&2 <<'EOF'
The script stt_speechmatics.py is required to run this program.
It is not currently in your PATH.
Please ensure that it is available.
One way to do this is by cloning the repository
https://github.com/CLIAI/handy_scripts
into a directory in your PATH.
EOF
  exit 1
}
# Parse flags
#
# Flags may appear anywhere on the command line. Everything that is not a
# recognised flag is collected, in order, and becomes the new positional
# parameters ($1 = video file, $2 = max speakers, $3 = language code).
STATUS_MODE=false
JSON_OUTPUT=false
POSITIONAL_ARGS=()

# One-line synopsis, shared by --help and the "no arguments" error path.
usage_line() {
  echo "Usage: $0 [--status] [--json] video_file [max_speakers [language_code]]" >&2
}

# No option takes a value, so a plain walk over "$@" is sufficient.
for arg in "$@"; do
  case "$arg" in
    --status)
      STATUS_MODE=true
      ;;
    --json)
      JSON_OUTPUT=true
      ;;
    -h | --help)
      # Help goes to stderr and terminates the script successfully.
      usage_line
      cat >&2 <<'EOF'

Options:
 --status Check if transcript exists without processing
 --json Output status in JSON format

Arguments:
 video_file: The path to the video file you want to transcribe.
 max_speakers: The maximum number of speakers in the video.
 language_code: Language code (default: en). Examples: de, fr, es, ja
EOF
      exit 0
      ;;
    *)
      POSITIONAL_ARGS+=("$arg")
      ;;
  esac
done
set -- "${POSITIONAL_ARGS[@]}"

# At least the video file is mandatory.
if (($# == 0)); then
  usage_line
  exit 1
fi
# ---------------------------------------------------------------------------
# Input check, derived file names and --status reporting.
#
# Reads:  $1, STATUS_MODE, JSON_OUTPUT, BACKEND
# Sets:   VIDEO, MP3, TXT_NEW, TXT_LEGACY, TXT, TXT_IS_LEGACY
# Exits:  1 if the video is missing (normal mode); 0 after printing a status
#         report (--status mode, whether or not the video exists).
# ---------------------------------------------------------------------------

# Escape a string so it can be placed between double quotes in JSON output.
# Backslash, double quote, newline, carriage return and tab are escaped;
# any other control character is passed through unchanged.
function json_escape() {
  local s=$1
  s=${s//\\/\\\\}
  s=${s//\"/\\\"}
  s=${s//$'\n'/\\n}
  s=${s//$'\r'/\\r}
  s=${s//$'\t'/\\t}
  printf '%s' "$s"
}

# Print the size of file $1 in bytes. Tries the GNU stat syntax first, then
# the BSD syntax, and prints 0 when both fail.
function file_size() {
  stat -c%s "$1" 2>/dev/null || stat -f%z "$1" 2>/dev/null || echo 0
}

VIDEO="$1"

if [ ! -f "$VIDEO" ]; then
  if [ "$STATUS_MODE" != true ]; then
    echo "Video file $VIDEO does not exist." >&2
    exit 1
  fi
  # In status mode a missing input is reported, not treated as an error.
  if [ "$JSON_OUTPUT" = true ]; then
    printf '{"audio_path": "%s", "audio_exists": false, "mp3_path": null, "mp3_exists": false, "transcript_path": null, "transcript_exists": false, "backend": "%s"}\n' \
      "$(json_escape "$VIDEO")" "$(json_escape "$BACKEND")"
  else
    echo "Audio: $VIDEO (not found)"
    echo "MP3: not found"
    echo "Transcript: not found"
  fi
  exit 0
fi

MP3="$VIDEO".mp3
# New naming convention: include backend suffix
TXT_NEW="$MP3"."$BACKEND".txt
# Legacy naming for backward compatibility
TXT_LEGACY="$MP3".txt

# Determine which transcript file to use (prefer new, fallback to legacy).
# When neither exists, TXT points at the new-style name.
if [ -f "$TXT_NEW" ]; then
  TXT="$TXT_NEW"
  TXT_IS_LEGACY=false
elif [ -f "$TXT_LEGACY" ]; then
  TXT="$TXT_LEGACY"
  TXT_IS_LEGACY=true
else
  TXT="$TXT_NEW"
  TXT_IS_LEGACY=false
fi

# Status mode: just report what exists
if [ "$STATUS_MODE" = true ]; then
  MP3_EXISTS=false
  MP3_SIZE=0
  if [ -f "$MP3" ]; then
    MP3_EXISTS=true
    MP3_SIZE=$(file_size "$MP3")
  fi

  # TXT / TXT_IS_LEGACY were resolved above with the same new-then-legacy
  # preference, so the report only has to check whether that file exists.
  TXT_EXISTS=false
  TXT_SIZE=0
  TXT_PATH="$TXT"
  if [ -f "$TXT_PATH" ]; then
    TXT_EXISTS=true
    TXT_SIZE=$(file_size "$TXT_PATH")
  fi

  if [ "$JSON_OUTPUT" = true ]; then
    # Every string value is escaped so unusual file names still yield valid JSON.
    cat <<EOF
{
"audio_path": "$(json_escape "$VIDEO")",
"audio_exists": true,
"mp3_path": "$(json_escape "$MP3")",
"mp3_exists": $MP3_EXISTS,
"mp3_size_bytes": $MP3_SIZE,
"transcript_path": "$(json_escape "$TXT_PATH")",
"transcript_exists": $TXT_EXISTS,
"transcript_size_bytes": $TXT_SIZE,
"transcript_legacy": $TXT_IS_LEGACY,
"backend": "$(json_escape "$BACKEND")"
}
EOF
  else
    echo "Audio: $VIDEO"
    if [ "$MP3_EXISTS" = true ]; then
      # numfmt may be unavailable; fall back to the raw byte count.
      echo "MP3: exists ($MP3) - $(numfmt --to=iec "$MP3_SIZE" 2>/dev/null || echo "$MP3_SIZE bytes")"
    else
      echo "MP3: not found"
    fi
    if [ "$TXT_EXISTS" = true ]; then
      LEGACY_NOTE=""
      if [ "$TXT_IS_LEGACY" = true ]; then
        LEGACY_NOTE=" [legacy naming]"
      fi
      echo "Transcript: exists ($TXT_PATH)$LEGACY_NOTE - $(numfmt --to=iec "$TXT_SIZE" 2>/dev/null || echo "$TXT_SIZE bytes")"
    else
      echo "Transcript: not found"
    fi
  fi
  exit 0
fi
# Normal transcription mode
#
# Resolve the two optional positional arguments. Each one is prompted for
# when it was not given on the command line; an empty answer selects the
# default shown in brackets.

max_speakers="${2:-}"
if [ -z "$max_speakers" ]; then
  # -r keeps backslashes in the typed answer literal.
  read -r -p 'Max speakers [0] (0==any):' max_speakers
  max_speakers="${max_speakers:-0}"
fi
# The value is compared numerically later on; reject anything that is not a
# non-negative integer here instead of letting those comparisons fail.
if ! [[ "$max_speakers" =~ ^[0-9]+$ ]]; then
  echo "max_speakers must be a non-negative integer (0 == any), got: $max_speakers" >&2
  exit 1
fi

LANGUAGE="${3:-}"
if [ -z "$LANGUAGE" ]; then
  read -r -p 'Language code [en]:' LANGUAGE
  LANGUAGE="${LANGUAGE:-en}"
fi
#######################################
# Extract the audio track of a video into an MP3 file using ffmpeg.
# Nothing is done when the target file already exists.
# Arguments:
#   $1 - path of the input video
#   $2 - path of the MP3 file to create
# Outputs:
#   The ffmpeg command line is traced to stderr via `set -x`; notices and
#   errors also go to stderr.
# Returns:
#   0 if the MP3 already existed or ffmpeg succeeded, otherwise ffmpeg's
#   exit status.
#######################################
function extract_mp3() {
  local video="$1"
  local MP3="$2"
  local status=0

  if [ -f "$MP3" ]; then
    echo "File $MP3 already exists." >&2
    return 0
  fi

  set -x
  ffmpeg -i "$video" -vn -ab 128k -ar 44100 -y "$MP3" || status=$?
  # Switch tracing off quietly; the status was captured above because
  # `set +x` itself always succeeds and would otherwise hide a failure.
  { set +x; } 2>/dev/null

  if [ "$status" -ne 0 ]; then
    echo "ffmpeg failed with exit status $status while extracting audio from $video" >&2
    # The target did not exist before this call, so anything present now is
    # a partial file; remove it so a later run does not mistake it for a
    # finished extraction.
    rm -f -- "$MP3"
  fi
  return "$status"
}
#######################################
# Run stt_speechmatics.py on an audio file, choosing the -d / --max-speakers
# flags from the requested speaker count.
# Arguments:
#   $1 - max_speakers (empty is treated as 0):
#          1     -> neither -d nor --max-speakers is passed
#          0     -> -d only
#          other -> -d --max-speakers <value>
#   $2 - language code, passed to -l
#   $3 - path of the audio file to transcribe
#   $4 - path passed to -o
# Outputs:
#   The stt_speechmatics.py command line is traced to stderr via `set -x`.
# Returns:
#   2 if max_speakers is not a non-negative integer, otherwise the exit
#   status of stt_speechmatics.py.
#######################################
function transcribe_with_diarization() {
  local max_speakers="${1:-0}"
  local LANGUAGE="$2"
  local MP3="$3"
  local TXT="$4"
  local status=0
  local -a args=(-l "$LANGUAGE" --operating-point enhanced)

  # Validate first so the numeric tests below cannot fail on bad input.
  if ! [[ "$max_speakers" =~ ^[0-9]+$ ]]; then
    echo "max_speakers must be a non-negative integer, got: $max_speakers" >&2
    return 2
  fi

  if [ "$max_speakers" -ne 1 ]; then
    args+=(-d)
    if [ "$max_speakers" -ne 0 ]; then
      args+=(--max-speakers "$max_speakers")
    fi
  fi

  # Always call stt_speechmatics.py; per the original author's note it is
  # expected to deal with an already existing transcript on its own.
  set -x
  stt_speechmatics.py "${args[@]}" -o "$TXT" "$MP3" || status=$?
  # `set +x` always succeeds, so the status is captured before tracing is
  # switched off (quietly, to keep the trace limited to the command itself).
  { set +x; } 2>/dev/null
  return "$status"
}
# Step 1: make sure an MP3 version of the input exists.
extract_mp3 "$VIDEO" "$MP3"

# Step 2: transcribe. Output always goes to the backend-suffixed file name,
# regardless of which transcript name was selected earlier.
TXT="$TXT_NEW"
transcribe_with_diarization "$max_speakers" "$LANGUAGE" "$MP3" "$TXT"

# Step 3: report where the transcript is, if one is present.
if [[ -f "$TXT" ]]; then
  printf 'Transcript is available at: %s\n' "$TXT" >&2
  # Print the transcript only when it is non-empty and older than the MP3.
  # NOTE(review): the original comment says this timestamp comparison is
  # meant to avoid printing a transcript that stt_speechmatics.py already
  # printed - confirm the heuristic matches that tool's behaviour.
  if [[ -s "$TXT" && "$TXT" -ot "$MP3" ]]; then
    cat "$TXT"
  fi
fi