Manyue-DataScientist committed on
Commit 83bc687 · verified · 1 Parent(s): da59af0

Update app.py

Files changed (1)
  1. app.py +17 -34
app.py CHANGED
@@ -11,18 +11,12 @@ import io
 @st.cache_resource
 def load_models():
     try:
-        # Updated to 3.1 with parameters
         diarization = Pipeline.from_pretrained(
             "pyannote/speaker-diarization-3.1",
             use_auth_token=st.secrets["hf_token"]
-        ).instantiate({
-            "onset": 0.3,
-            "offset": 0.3,
-            "min_duration_on": 0.1,
-            "min_duration_off": 0.1
-        })
+        )
 
-        transcriber = whisper.load_model("base")
+        transcriber = whisper.load_model("small")
 
         summarizer = tf_pipeline(
             "summarization",
@@ -78,7 +72,7 @@ def process_audio(audio_file, max_duration=600):
 
         return {
             "diarization": diarization_result,
-            "transcription": transcription, # Return full transcription object
+            "transcription": transcription,
             "summary": summary[0]["summary_text"]
         }
 
@@ -86,26 +80,24 @@ def process_audio(audio_file, max_duration=600):
         st.error(f"Error processing audio: {str(e)}")
         return None
 
-def format_speaker_segments(diarization_result, transcription):
+def format_speaker_segments(diarization_result):
     formatted_segments = []
-    audio_duration = transcription.get('duration', 0)
 
     for turn, _, speaker in diarization_result.itertracks(yield_label=True):
-        # Skip invalid timestamps
-        if turn.start > audio_duration or turn.end > audio_duration:
-            continue
-
-        # Only add segments with meaningful duration
-        if (turn.end - turn.start) >= 0.1: # 100ms minimum
+        if turn.start is not None and turn.end is not None:
             formatted_segments.append({
                 'speaker': speaker,
-                'start': turn.start,
-                'end': turn.end,
-                'duration': turn.end - turn.start
+                'start': float(turn.start),
+                'end': float(turn.end)
             })
 
     return formatted_segments
 
+def format_timestamp(seconds):
+    minutes = int(seconds // 60)
+    seconds = seconds % 60
+    return f"{minutes:02d}:{seconds:05.2f}"
+
 def main():
     st.title("Multi-Speaker Audio Analyzer")
     st.write("Upload an audio file (MP3/WAV) up to 5 minutes long for best performance")
@@ -129,30 +121,21 @@ def main():
 
         with tab1:
            st.write("Speaker Timeline:")
+           segments = format_speaker_segments(results["diarization"])
 
-           segments = format_speaker_segments(
-               results["diarization"],
-               results["transcription"]
-           )
-
-           # Display segments with proper time formatting
           for segment in segments:
                col1, col2 = st.columns([2,8])
 
                with col1:
                    speaker_num = int(segment['speaker'].split('_')[1])
-                   colors = ['🔵', '🔴'] # Simplified to two colors
+                   colors = ['🔵', '🔴'] # Two colors for alternating speakers
                    speaker_color = colors[speaker_num % len(colors)]
                    st.write(f"{speaker_color} {segment['speaker']}")
 
                with col2:
-                   mm_start = int(segment['start'] // 60)
-                   ss_start = segment['start'] % 60
-                   mm_end = int(segment['end'] // 60)
-                   ss_end = segment['end'] % 60
-
-                   time_str = f"{mm_start:02d}:{ss_start:05.2f} → {mm_end:02d}:{ss_end:05.2f}"
-                   st.write(time_str)
+                   start_time = format_timestamp(segment['start'])
+                   end_time = format_timestamp(segment['end'])
+                   st.write(f"{start_time} → {end_time}")
 
                st.markdown("---")
 
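
For quick local verification, the following is a minimal sketch (not part of this commit) that exercises the new format_timestamp helper and the simplified format_speaker_segments without Streamlit or the pyannote pipeline. FakeTurn and FakeAnnotation are hypothetical stand-ins for the pyannote objects that diarization_result.itertracks() yields; the two functions themselves are copied from the updated app.py.

# Illustrative sketch only (not part of this commit).
# FakeTurn and FakeAnnotation are hypothetical stand-ins for pyannote objects.

def format_timestamp(seconds):
    # Copied from the updated app.py: MM:SS.ss formatting.
    minutes = int(seconds // 60)
    seconds = seconds % 60
    return f"{minutes:02d}:{seconds:05.2f}"

def format_speaker_segments(diarization_result):
    # Copied from the updated app.py: keep only turns with valid bounds.
    formatted_segments = []
    for turn, _, speaker in diarization_result.itertracks(yield_label=True):
        if turn.start is not None and turn.end is not None:
            formatted_segments.append({
                'speaker': speaker,
                'start': float(turn.start),
                'end': float(turn.end)
            })
    return formatted_segments

class FakeTurn:
    def __init__(self, start, end):
        self.start, self.end = start, end

class FakeAnnotation:
    def __init__(self, tracks):
        self.tracks = tracks
    def itertracks(self, yield_label=True):
        # Mimics pyannote's (segment, track, label) triples.
        yield from self.tracks

if __name__ == "__main__":
    fake = FakeAnnotation([
        (FakeTurn(0.0, 4.2), "A", "SPEAKER_00"),
        (FakeTurn(4.2, 9.75), "B", "SPEAKER_01"),
    ])
    for seg in format_speaker_segments(fake):
        print(seg['speaker'], format_timestamp(seg['start']), "->", format_timestamp(seg['end']))
    # Prints:
    # SPEAKER_00 00:00.00 -> 00:04.20
    # SPEAKER_01 00:04.20 -> 00:09.75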