Update app.py
Browse files
app.py
CHANGED
@@ -137,28 +137,32 @@ def transcribe_video_with_speakers(video_path):
@@ -137,28 +137,32 @@ def transcribe_video_with_speakers(video_path):
 137      # Set up device
 138      device = "cuda" if torch.cuda.is_available() else "cpu"
 139      logger.info(f"Using device: {device}")
-140..161  (22 lines removed; their content was not captured in this extracted view)
+140
+141      try:
+142          # Load a medium model with float32 for broader compatibility
+143          model = whisperx.load_model("medium", device=device, compute_type="float32")
+144          logger.info("WhisperX model loaded")
+145
+146          # Transcribe
+147          result = model.transcribe(audio_path)
+148          logger.info("Audio transcription completed")
+149
+150          # Alignment
+151          model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
+152          result = whisperx.align(result["segments"], model_a, metadata, audio_path, device)
+153          logger.info("Transcription alignment completed")
+154
+155          # Diarization (works independently of Whisper model size)
+156          diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_api_key, device=device)
+157          diarize_segments = diarize_model(audio_path)
+158          logger.info("Speaker diarization completed")
+159
+160          # Assign speakers
+161          result = whisperx.assign_word_speakers(diarize_segments, result)
+162          logger.info("Speakers assigned to transcribed segments")
+163
+164      except Exception as e:
+165          logger.error(f"❌ WhisperX pipeline failed: {e}")
 166
 167      # Extract timestamps, text, and speaker IDs
 168      transcript_with_speakers = [