handy_scripts_CLIAI/stt_video_using_speechmatics.sh at main · CLIAI/handy_scripts_CLIAI · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
#!/bin/bash

# Name of the speech-to-text backend handled by this wrapper.
BACKEND="speechmatics"

# Stop right away unless the shell can resolve the transcription helper.
command -v stt_speechmatics.py > /dev/null 2>&1 || {
  cat >&2 <<'EOF'
The script stt_speechmatics.py is required to run this program.
    It is not currently in your PATH.
    Please ensure that it is available.
    One way to do this is by cloning the repository
    https://github.com/CLIAI/handy_scripts
    into a directory in your PATH.
EOF
  exit 1
}

# Parse flags
# Option flags may appear anywhere on the command line; every other word is
# collected, in order, and becomes the new positional parameter list.
STATUS_MODE=false
JSON_OUTPUT=false
POSITIONAL_ARGS=()

for cli_arg in "$@"; do
  case "$cli_arg" in
    --status) STATUS_MODE=true ;;
    --json) JSON_OUTPUT=true ;;
    -h | --help)
      # Help text goes to stderr; asking for help is not an error (exit 0).
      printf '%s\n' \
        "Usage: $0 [--status] [--json] video_file [max_speakers [language_code]]" \
        "" \
        "Options:" \
        "  --status    Check if transcript exists without processing" \
        "  --json      Output status in JSON format" \
        "" \
        "Arguments:" \
        "  video_file: The path to the video file you want to transcribe." \
        "  max_speakers: The maximum number of speakers in the video." \
        "  language_code: Language code (default: en). Examples: de, fr, es, ja" >&2
      exit 0
      ;;
    *) POSITIONAL_ARGS+=("$cli_arg") ;;
  esac
done

# Replace the positional parameters with the non-flag arguments only.
set -- "${POSITIONAL_ARGS[@]}"

#######################################
# Escape text so it can be placed between double quotes in a JSON document.
# Handles the double quote, the backslash and all control characters, so
# unusual file names cannot break the --json output.
# Arguments: $1 - raw text (e.g. a file path)
# Outputs:   escaped text on stdout, without surrounding quotes
#######################################
function json_escape() {
  local raw="$1" escaped='' ch i
  for (( i = 0; i < ${#raw}; i++ )); do
    ch="${raw:i:1}"
    case "$ch" in
      '"') escaped+='\"' ;;
      '\') escaped+='\\' ;;
      $'\n') escaped+='\n' ;;
      $'\r') escaped+='\r' ;;
      $'\t') escaped+='\t' ;;
      [[:cntrl:]])
        # Remaining control characters have no short escape; emit \uXXXX.
        printf -v ch '\\u%04x' "'$ch"
        escaped+="$ch"
        ;;
      *) escaped+="$ch" ;;
    esac
  done
  printf '%s' "$escaped"
}

# Print the size of file $1 in bytes: GNU stat syntax first, then BSD stat
# syntax, and 0 when neither works.
function file_size_bytes() {
  stat -c%s "$1" 2>/dev/null || stat -f%z "$1" 2>/dev/null || echo 0
}

# Print byte count $1 in IEC units, or "<n> bytes" when numfmt fails or is
# not installed.
function human_size() {
  numfmt --to=iec "$1" 2>/dev/null || echo "$1 bytes"
}

# At least the video file must be given.
if [ $# -eq 0 ]; then
  echo "Usage: $0 [--status] [--json] video_file [max_speakers [language_code]]" >&2
  exit 1
fi

VIDEO="$1"
if [ ! -f "$VIDEO" ]; then
  # Outside status mode a missing input is a hard error.
  if [ "$STATUS_MODE" != true ]; then
    echo "Video file $VIDEO does not exist." >&2
    exit 1
  fi
  # In status mode a missing input is simply reported (exit 0).
  if [ "$JSON_OUTPUT" = true ]; then
    printf '{"audio_path": "%s", "audio_exists": false, "mp3_path": null, "mp3_exists": false, "transcript_path": null, "transcript_exists": false, "backend": "%s"}\n' \
      "$(json_escape "$VIDEO")" "$(json_escape "$BACKEND")"
  else
    echo "Audio: $VIDEO (not found)"
    echo "MP3: not found"
    echo "Transcript: not found"
  fi
  exit 0
fi

MP3="$VIDEO".mp3
# New naming convention: include backend suffix
TXT_NEW="$MP3"."$BACKEND".txt
# Legacy naming for backward compatibility
TXT_LEGACY="$MP3".txt

# Determine which transcript file to use (prefer new, fallback to legacy)
if [ -f "$TXT_NEW" ]; then
  TXT="$TXT_NEW"
  TXT_IS_LEGACY=false
elif [ -f "$TXT_LEGACY" ]; then
  TXT="$TXT_LEGACY"
  TXT_IS_LEGACY=true
else
  TXT="$TXT_NEW"
  TXT_IS_LEGACY=false
fi

# Status mode: just report what exists
if [ "$STATUS_MODE" = true ]; then
  MP3_EXISTS=false
  MP3_SIZE=0
  if [ -f "$MP3" ]; then
    MP3_EXISTS=true
    MP3_SIZE=$(file_size_bytes "$MP3")
  fi

  # TXT / TXT_IS_LEGACY were resolved just above (new name preferred, legacy
  # as fallback), so the transcript lookup does not need to be repeated.
  TXT_PATH="$TXT"
  TXT_EXISTS=false
  TXT_SIZE=0
  if [ -f "$TXT_PATH" ]; then
    TXT_EXISTS=true
    TXT_SIZE=$(file_size_bytes "$TXT_PATH")
  fi

  if [ "$JSON_OUTPUT" = true ]; then
    # Escape every string value before it is interpolated into the document.
    VIDEO_JSON=$(json_escape "$VIDEO")
    MP3_JSON=$(json_escape "$MP3")
    TXT_PATH_JSON=$(json_escape "$TXT_PATH")
    BACKEND_JSON=$(json_escape "$BACKEND")
    cat <<EOF
{
  "audio_path": "$VIDEO_JSON",
  "audio_exists": true,
  "mp3_path": "$MP3_JSON",
  "mp3_exists": $MP3_EXISTS,
  "mp3_size_bytes": $MP3_SIZE,
  "transcript_path": "$TXT_PATH_JSON",
  "transcript_exists": $TXT_EXISTS,
  "transcript_size_bytes": $TXT_SIZE,
  "transcript_legacy": $TXT_IS_LEGACY,
  "backend": "$BACKEND_JSON"
}
EOF
  else
    echo "Audio: $VIDEO"
    if [ "$MP3_EXISTS" = true ]; then
      echo "MP3: exists ($MP3) - $(human_size "$MP3_SIZE")"
    else
      echo "MP3: not found"
    fi
    if [ "$TXT_EXISTS" = true ]; then
      LEGACY_NOTE=""
      if [ "$TXT_IS_LEGACY" = true ]; then
        LEGACY_NOTE=" [legacy naming]"
      fi
      echo "Transcript: exists ($TXT_PATH)$LEGACY_NOTE - $(human_size "$TXT_SIZE")"
    else
      echo "Transcript: not found"
    fi
  fi
  exit 0
fi

# Normal transcription mode

# Read a reply from standard input into the named global variable and fall
# back to a default when the reply is empty.
#   $1 - name of the variable to fill
#   $2 - prompt text
#   $3 - value used when nothing was entered
function ask_with_default() {
  local target_var="$1" prompt_text="$2" fallback="$3"
  read -p "$prompt_text" "$target_var"
  if [ -z "${!target_var}" ]; then
    printf -v "$target_var" '%s' "$fallback"
  fi
}

# Second positional argument: speaker limit; ask when it was not supplied.
max_speakers="${2:-}"
[ -n "$max_speakers" ] || ask_with_default max_speakers 'Max speakers [0] (0==any):' 0

# Third positional argument: language code; ask when it was not supplied.
LANGUAGE="${3:-}"
[ -n "$LANGUAGE" ] || ask_with_default LANGUAGE 'Language code [en]:' en

#######################################
# Extract the audio track of a video into an MP3 file, unless that MP3
# already exists.
# Arguments: $1 - path of the input video
#            $2 - path of the MP3 file to create
# Outputs:   command trace and diagnostics on stderr
# Returns:   0 when the MP3 already existed or ffmpeg succeeded,
#            otherwise ffmpeg's exit status
#######################################
function extract_mp3() {
  local video="$1"
  local MP3="$2"
  local rc=0

  if [ -f "$MP3" ]; then
    echo "File $MP3 already exists." >&2
    return 0
  fi

  # Trace the command inside a subshell: xtrace ends with the subshell, so
  # ffmpeg's exit status is not overwritten by a trailing `set +x` and the
  # caller's own xtrace setting is left alone.
  ( set -x; ffmpeg -i "$video" -vn -ab 128k -ar 44100 -y "$MP3" ) || rc=$?

  if [ "$rc" -ne 0 ]; then
    # The MP3 did not exist before this call, so anything present now is a
    # partial result; remove it so a later run does not treat it as complete.
    rm -f -- "$MP3"
    echo "ffmpeg failed (exit status $rc) while extracting audio from $video" >&2
  fi
  return "$rc"
}

#######################################
# Run stt_speechmatics.py on an MP3 file and write the transcript to a file.
# The speaker count selects the diarization flags:
#   1      - no diarization flags
#   0      - diarization (-d) without a --max-speakers limit
#   other  - diarization (-d) with --max-speakers set to that number
# Arguments: $1 - max speakers (non-negative integer)
#            $2 - language code passed to -l
#            $3 - input MP3 path
#            $4 - output transcript path passed to -o
# Outputs:   command trace and diagnostics on stderr; stdout is whatever
#            stt_speechmatics.py prints
# Returns:   2 when $1 is not a non-negative integer, otherwise the exit
#            status of stt_speechmatics.py
#######################################
function transcribe_with_diarization() {
  local max_speakers="$1"
  local LANGUAGE="$2"
  local MP3="$3"
  local TXT="$4"
  local -a stt_args

  # Reject bad counts up front; otherwise the numeric tests below would fail
  # and silently fall through to the "any number of speakers" branch.
  if ! [[ "$max_speakers" =~ ^[0-9]+$ ]]; then
    echo "max_speakers must be a non-negative integer, got: '$max_speakers'" >&2
    return 2
  fi

  stt_args=(-l "$LANGUAGE" --operating-point enhanced)
  if [ "$max_speakers" -ne 1 ]; then
    stt_args+=(-d)
    if [ "$max_speakers" -ne 0 ]; then
      stt_args+=(--max-speakers "$max_speakers")
    fi
  fi
  stt_args+=(-o "$TXT" "$MP3")

  # Always call stt_speechmatics.py - it is expected to handle an already
  # existing transcript itself. The subshell limits xtrace to this command
  # and makes the command's exit status the function's return value.
  ( set -x; stt_speechmatics.py "${stt_args[@]}" )
}

# Extract MP3 if needed; stop when extraction fails or produced no file,
# because transcription cannot proceed without the MP3.
extract_mp3 "$VIDEO" "$MP3" || exit $?
if [ ! -f "$MP3" ]; then
  echo "Audio extraction did not produce $MP3." >&2
  exit 1
fi

# Use new naming convention for output
TXT="$TXT_NEW"

# Transcribe using Speechmatics
# stt_speechmatics.py is expected to handle an already existing transcript itself.
transcribe_with_diarization "$max_speakers" "$LANGUAGE" "$MP3" "$TXT" || exit $?

# A missing transcript at this point means the run did not succeed; report it
# instead of exiting silently with status 0.
if [ ! -f "$TXT" ]; then
  echo "Transcription did not produce $TXT." >&2
  exit 1
fi

# Display a message indicating completion
echo "Transcript is available at: $TXT" >&2

# Print the transcript only when it is non-empty and older than the MP3.
# NOTE(review): presumably a transcript created by this run was already
# printed by stt_speechmatics.py, which is why newer files are skipped -
# confirm against that script.
if [ -s "$TXT" ] && [ "$TXT" -ot "$MP3" ]; then
  cat "$TXT"
fi