qqwjq1981 committed on
Commit
2e011e4
·
verified ·
1 Parent(s): 9ecc376

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -22
app.py CHANGED
@@ -137,28 +137,32 @@ def transcribe_video_with_speakers(video_path):
137
  # Set up device
138
  device = "cuda" if torch.cuda.is_available() else "cpu"
139
  logger.info(f"Using device: {device}")
140
-
141
- # Load WhisperX model
142
- model = whisperx.load_model("large-v2", device)
143
- logger.info("WhisperX model loaded")
144
-
145
- # Transcribe with WhisperX
146
- result = model.transcribe(audio_path)
147
- logger.info("Audio transcription completed")
148
-
149
- # Align transcription
150
- model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
151
- result = whisperx.align(result["segments"], model_a, metadata, audio_path, device)
152
- logger.info("Transcription alignment completed")
153
-
154
- # Perform speaker diarization
155
- diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_api_key, device=device)
156
- diarize_segments = diarize_model(audio_path)
157
- logger.info("Speaker diarization completed")
158
-
159
- # Assign speakers to transcribed segments
160
- result = whisperx.assign_word_speakers(diarize_segments, result)
161
- logger.info("Speakers assigned to transcribed segments")
 
 
 
 
162
 
163
  # Extract timestamps, text, and speaker IDs
164
  transcript_with_speakers = [
 
137
  # Set up device
138
  device = "cuda" if torch.cuda.is_available() else "cpu"
139
  logger.info(f"Using device: {device}")
140
+
141
+ try:
142
+ # Load a medium model with float32 for broader compatibility
143
+ model = whisperx.load_model("medium", device=device, compute_type="float32")
144
+ logger.info("WhisperX model loaded")
145
+
146
+ # Transcribe
147
+ result = model.transcribe(audio_path)
148
+ logger.info("Audio transcription completed")
149
+
150
+ # Alignment
151
+ model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
152
+ result = whisperx.align(result["segments"], model_a, metadata, audio_path, device)
153
+ logger.info("Transcription alignment completed")
154
+
155
+ # Diarization (works independently of Whisper model size)
156
+ diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_api_key, device=device)
157
+ diarize_segments = diarize_model(audio_path)
158
+ logger.info("Speaker diarization completed")
159
+
160
+ # Assign speakers
161
+ result = whisperx.assign_word_speakers(diarize_segments, result)
162
+ logger.info("Speakers assigned to transcribed segments")
163
+
164
+ except Exception as e:
165
+ logger.error(f"❌ WhisperX pipeline failed: {e}")
166
 
167
  # Extract timestamps, text, and speaker IDs
168
  transcript_with_speakers = [