Kr08 committed on
Commit 5cf5423 · verified · 1 Parent(s): 9148c64

Update audio_processing.py

Files changed (1):
  1. audio_processing.py +19 -11
audio_processing.py CHANGED
@@ -38,10 +38,14 @@ def process_audio(audio_file, translate=False, model_size="small"):
     audio = whisperx.load_audio(audio_file)
     model = whisperx.load_model(model_size, device, compute_type=compute_type)
 
-    diarization_pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token=hf_token)
-    diarization_pipeline = diarization_pipeline.to(torch.device(device))
-
-    diarization_result = diarization_pipeline({"waveform": torch.from_numpy(audio).unsqueeze(0), "sample_rate": 16000})
+    # Try to initialize diarization pipeline, but proceed without it if there's an error
+    try:
+        diarization_pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token=hf_token)
+        diarization_pipeline = diarization_pipeline.to(torch.device(device))
+        diarization_result = diarization_pipeline({"waveform": torch.from_numpy(audio).unsqueeze(0), "sample_rate": 16000})
+    except Exception as e:
+        logger.warning(f"Diarization pipeline initialization failed: {str(e)}. Proceeding without diarization.")
+        diarization_result = None
 
     chunks = preprocess_audio(audio)
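Note: with this change, any failure to load the gated pyannote model (missing or invalid hf_token, no network access, etc.) degrades gracefully instead of aborting the whole transcription. A minimal, self-contained sketch of the same pattern; load_diarization is a hypothetical helper name, not part of this file:

import logging

import torch
from pyannote.audio import Pipeline

logger = logging.getLogger(__name__)

def load_diarization(hf_token, device="cpu"):
    # Returns a ready-to-use pipeline, or None when diarization is unavailable.
    try:
        pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token=hf_token)
        return pipeline.to(torch.device(device))
    except Exception as e:  # gated repo, bad token, network error, ...
        logger.warning("Diarization unavailable: %s. Proceeding without it.", e)
        return None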
@@ -71,16 +75,19 @@ def process_audio(audio_file, translate=False, model_size="small"):
             print(f"Skipping segment in overlap with next chunk: {segment_start:.2f} - {segment_end:.2f}")
             continue
 
-        speakers = []
-        for turn, track, speaker in diarization_result.itertracks(yield_label=True):
-            if turn.start <= segment_end and turn.end >= segment_start:
-                speakers.append(speaker)
+        speaker = "Unknown"
+        if diarization_result is not None:
+            speakers = []
+            for turn, track, spk in diarization_result.itertracks(yield_label=True):
+                if turn.start <= segment_end and turn.end >= segment_start:
+                    speakers.append(spk)
+            speaker = max(set(speakers), key=speakers.count) if speakers else "Unknown"
 
         segment = {
             "start": segment_start,
             "end": segment_end,
             "language": lang,
-            "speaker": max(set(speakers), key=speakers.count) if speakers else "Unknown",
+            "speaker": speaker,
             "text": t_seg["text"],
         }
 
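Note: the speaker vote now runs only when diarization succeeded, with "Unknown" as the fallback. Here is the voting logic isolated into a hypothetical majority_speaker helper; the (start, end, label) triples stand in for what itertracks(yield_label=True) yields in the real pipeline output:

from collections import Counter

def majority_speaker(turns, segment_start, segment_end, default="Unknown"):
    # Collect labels of all turns that overlap the [segment_start, segment_end] window.
    overlapping = [label for start, end, label in turns
                   if start <= segment_end and end >= segment_start]
    # Most frequent overlapping label, mirroring max(set(speakers), key=speakers.count).
    return Counter(overlapping).most_common(1)[0][0] if overlapping else default

# Example: only the second turn overlaps the 3.0-4.0 window.
print(majority_speaker([(0.0, 2.0, "A"), (2.5, 5.0, "B")], 3.0, 4.0))  # -> "B"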
@@ -121,11 +128,12 @@ def merge_nearby_segments(segments, time_threshold=0.5, similarity_threshold=0.7
         if match.size / len(segment['text']) > similarity_threshold:
             # Merge the segments
             merged_text = merged[-1]['text'] + segment['text'][match.b + match.size:]
-            merged_translated = merged[-1]['translated'] + segment['translated'][match.b + match.size:]
+            merged_translated = merged[-1].get('translated', '') + segment.get('translated', '')[match.b + match.size:]
 
             merged[-1]['end'] = segment['end']
             merged[-1]['text'] = merged_text
-            merged[-1]['translated'] = merged_translated
+            if 'translated' in segment:
+                merged[-1]['translated'] = merged_translated
         else:
             # If no significant overlap, append as a new segment
             merged.append(segment)
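Note: the merge now tolerates segments without a 'translated' key. One subtlety: match.b and match.size are computed on 'text', so slicing 'translated' with the same offsets is only approximate. A sketch of the text-overlap core, assuming the match comes from difflib.SequenceMatcher as in the rest of this function; merge_overlapping_text is a hypothetical name:

from difflib import SequenceMatcher

def merge_overlapping_text(prev_text, next_text, similarity_threshold=0.7):
    # Find the longest block shared by the two texts (the chunk-boundary overlap).
    matcher = SequenceMatcher(None, prev_text, next_text)
    match = matcher.find_longest_match(0, len(prev_text), 0, len(next_text))
    if match.size / max(len(next_text), 1) > similarity_threshold:
        # match.b is where the shared run begins in next_text; keep only what follows it.
        return prev_text + next_text[match.b + match.size:]
    return prev_text + next_text  # no significant overlap: plain concatenation

# Example: "dark night" (10 of 14 chars, ratio ~0.71) is deduplicated.
print(merge_overlapping_text("it was a dark night", "dark nightfall"))
# -> "it was a dark nightfall"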
 