Spaces:
Build error
Build error
Update audio_processing.py
Browse files
- audio_processing.py +19 -11
audio_processing.py
CHANGED
@@ -38,10 +38,14 @@ def process_audio(audio_file, translate=False, model_size="small"):
|
|
38 |
audio = whisperx.load_audio(audio_file)
|
39 |
model = whisperx.load_model(model_size, device, compute_type=compute_type)
|
40 |
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
|
|
|
|
|
|
|
|
45 |
|
46 |
chunks = preprocess_audio(audio)
|
47 |
|
@@ -71,16 +75,19 @@ def process_audio(audio_file, translate=False, model_size="small"):
|
|
71 |
print(f"Skipping segment in overlap with next chunk: {segment_start:.2f} - {segment_end:.2f}")
|
72 |
continue
|
73 |
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
|
|
|
|
|
|
78 |
|
79 |
segment = {
|
80 |
"start": segment_start,
|
81 |
"end": segment_end,
|
82 |
"language": lang,
|
83 |
-
"speaker":
|
84 |
"text": t_seg["text"],
|
85 |
}
|
86 |
|
@@ -121,11 +128,12 @@ def merge_nearby_segments(segments, time_threshold=0.5, similarity_threshold=0.7
|
|
121 |
if match.size / len(segment['text']) > similarity_threshold:
|
122 |
# Merge the segments
|
123 |
merged_text = merged[-1]['text'] + segment['text'][match.b + match.size:]
|
124 |
-
merged_translated = merged[-1]
|
125 |
|
126 |
merged[-1]['end'] = segment['end']
|
127 |
merged[-1]['text'] = merged_text
|
128 |
-
|
|
|
129 |
else:
|
130 |
# If no significant overlap, append as a new segment
|
131 |
merged.append(segment)
|
|
|
38 |
audio = whisperx.load_audio(audio_file)
|
39 |
model = whisperx.load_model(model_size, device, compute_type=compute_type)
|
40 |
|
41 |
+
# Try to initialize diarization pipeline, but proceed without it if there's an error
|
42 |
+
try:
|
43 |
+
diarization_pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token=hf_token)
|
44 |
+
diarization_pipeline = diarization_pipeline.to(torch.device(device))
|
45 |
+
diarization_result = diarization_pipeline({"waveform": torch.from_numpy(audio).unsqueeze(0), "sample_rate": 16000})
|
46 |
+
except Exception as e:
|
47 |
+
logger.warning(f"Diarization pipeline initialization failed: {str(e)}. Proceeding without diarization.")
|
48 |
+
diarization_result = None
|
49 |
|
50 |
chunks = preprocess_audio(audio)
|
51 |
|
|
|
75 |
print(f"Skipping segment in overlap with next chunk: {segment_start:.2f} - {segment_end:.2f}")
|
76 |
continue
|
77 |
|
78 |
+
speaker = "Unknown"
|
79 |
+
if diarization_result is not None:
|
80 |
+
speakers = []
|
81 |
+
for turn, track, spk in diarization_result.itertracks(yield_label=True):
|
82 |
+
if turn.start <= segment_end and turn.end >= segment_start:
|
83 |
+
speakers.append(spk)
|
84 |
+
speaker = max(set(speakers), key=speakers.count) if speakers else "Unknown"
|
85 |
|
86 |
segment = {
|
87 |
"start": segment_start,
|
88 |
"end": segment_end,
|
89 |
"language": lang,
|
90 |
+
"speaker": speaker,
|
91 |
"text": t_seg["text"],
|
92 |
}
|
93 |
|
|
|
128 |
if match.size / len(segment['text']) > similarity_threshold:
|
129 |
# Merge the segments
|
130 |
merged_text = merged[-1]['text'] + segment['text'][match.b + match.size:]
|
131 |
+
merged_translated = merged[-1].get('translated', '') + segment.get('translated', '')[match.b + match.size:]
|
132 |
|
133 |
merged[-1]['end'] = segment['end']
|
134 |
merged[-1]['text'] = merged_text
|
135 |
+
if 'translated' in segment:
|
136 |
+
merged[-1]['translated'] = merged_translated
|
137 |
else:
|
138 |
# If no significant overlap, append as a new segment
|
139 |
merged.append(segment)
|