Commit: b1426fb
Parent(s): 965e524

Changed Folder Structure

Changed the code format and folder structure to make the code easier to optimize in the future.
- README.md +16 -13
- app.py +114 -139
- src/__init__.py +13 -0
- src/models/__init__.py +8 -0
- src/models/diarization.py +44 -0
- src/models/summarization.py +44 -0
- src/models/transcription.py +36 -0
- src/utils/__init__.py +7 -0
- src/utils/audio_processor.py +43 -0
- src/utils/formatter.py +59 -0
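
Laid out as a tree, the new structure implied by the file list is:

```
src/
├── __init__.py
├── models/
│   ├── __init__.py
│   ├── diarization.py
│   ├── summarization.py
│   └── transcription.py
└── utils/
    ├── __init__.py
    ├── audio_processor.py
    └── formatter.py
```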
README.md
CHANGED

@@ -1,13 +1,16 @@

The old 13-line README was replaced with:

# Multi-Speaker Audio Analyzer

A Streamlit application that performs speaker diarization, transcription, and summarization on audio files.

## Features
- Speaker Diarization using Pyannote
- Transcription using Whisper
- Summarization using BART

## Setup
1. Install requirements: `pip install -r requirements.txt`
2. Add HuggingFace token to Streamlit secrets
3. Run app: `streamlit run app.py`

## Usage
Upload an audio file (MP3/WAV) and click "Analyze Audio" to process.
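Step 2 of the setup refers to Streamlit's secrets store: the code reads the token as `st.secrets["hf_token"]`, so the secret must be saved under that exact name (locally in `.streamlit/secrets.toml`, or in the secrets UI on hosted Streamlit/Spaces). A minimal sketch of the consuming side; the token value is a placeholder:

```python
import streamlit as st

# Expects a secret named "hf_token", e.g. this line in .streamlit/secrets.toml:
#   hf_token = "hf_..."   # placeholder; use your own HuggingFace token
token = st.secrets["hf_token"]
```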
app.py
CHANGED

@@ -1,129 +1,93 @@

Removed: the inline imports (torch, transformers' pipeline, pydub, io), the inline model loading against st.secrets["hf_token"] and facebook/bart-large-cnn, the inline audio-standardization block, the module-level format_speaker_segments and format_timestamp helpers, and the inline tab-rendering code. All of that logic moved into the src/ modules added below; app.py now keeps only the Streamlit flow.

The new app.py:

"""
Multi-Speaker Audio Analyzer
A Streamlit application that performs speaker diarization, transcription, and summarization on audio files.

Author: [Your Name]
Date: January 2025
"""

import streamlit as st
from src.models.diarization import SpeakerDiarizer
from src.models.transcription import Transcriber
from src.models.summarization import Summarizer
from src.utils.audio_processor import AudioProcessor
from src.utils.formatter import TimeFormatter
import os

# Cache for model loading
@st.cache_resource
def load_models():
    """
    Load and cache all required models.

    Returns:
        tuple: (diarizer, transcriber, summarizer) or (None, None, None) if loading fails
    """
    try:
        diarizer = SpeakerDiarizer(st.secrets["hf_token"])
        diarizer_model = diarizer.load_model()

        transcriber = Transcriber()
        transcriber_model = transcriber.load_model()

        summarizer = Summarizer()
        summarizer_model = summarizer.load_model()

        if not all([diarizer_model, transcriber_model, summarizer_model]):
            raise ValueError("One or more models failed to load")

        return diarizer, transcriber, summarizer
    except Exception as e:
        st.error(f"Error loading models: {str(e)}")
        st.error("Debug info: Check if HF token is valid and has necessary permissions")
        return None, None, None

def process_audio(audio_file, max_duration=600):
    """
    Process the uploaded audio file through all models.

    Args:
        audio_file: Uploaded audio file
        max_duration (int): Maximum duration in seconds

    Returns:
        dict: Processing results containing diarization, transcription, and summary
    """
    try:
        # Process audio file
        audio_processor = AudioProcessor()
        tmp_path = audio_processor.standardize_audio(audio_file)

        # Load models
        diarizer, transcriber, summarizer = load_models()
        if not all([diarizer, transcriber, summarizer]):
            return "Model loading failed"

        # Process with each model
        with st.spinner("Identifying speakers..."):
            diarization_result = diarizer.process(tmp_path)

        with st.spinner("Transcribing audio..."):
            transcription = transcriber.process(tmp_path)

        with st.spinner("Generating summary..."):
            summary = summarizer.process(transcription["text"])

        # Cleanup
        os.unlink(tmp_path)

        return {
            "diarization": diarization_result,
            "transcription": transcription,
            "summary": summary[0]["summary_text"]
        }

    except Exception as e:
        st.error(f"Error processing audio: {str(e)}")
        return None

def main():
    """Main application function."""
    st.title("Multi-Speaker Audio Analyzer")
    st.write("Upload an audio file (MP3/WAV) up to 5 minutes long for best performance")

@@ -144,51 +108,62 @@ def main():

(new lines 94-107 are unchanged context not shown in this view)

        if results:
            tab1, tab2, tab3 = st.tabs(["Speakers", "Transcription", "Summary"])

            # Display speaker timeline
            with tab1:
                display_speaker_timeline(results)

            # Display transcription
            with tab2:
                display_transcription(results)

            # Display summary
            with tab3:
                display_summary(results)

def display_speaker_timeline(results):
    """Display speaker diarization results in a timeline format."""
    st.write("Speaker Timeline:")
    segments = TimeFormatter.format_speaker_segments(
        results["diarization"],
        results["transcription"]
    )

    if segments:
        for segment in segments:
            col1, col2, col3 = st.columns([2,3,5])

            with col1:
                display_speaker_info(segment)

            with col2:
                display_timestamp(segment)

            with col3:
                display_text(segment)

            st.markdown("---")
    else:
        st.warning("No speaker segments detected")

def display_speaker_info(segment):
    """Display speaker information with color coding."""
    speaker_num = int(segment['speaker'].split('_')[1])
    colors = ['🔵', '🔴']
    speaker_color = colors[speaker_num % len(colors)]
    st.write(f"{speaker_color} {segment['speaker']}")

def display_timestamp(segment):
    """Display formatted timestamps."""
    start_time = TimeFormatter.format_timestamp(segment['start'])
    end_time = TimeFormatter.format_timestamp(segment['end'])
    st.write(f"{start_time} → {end_time}")

def display_text(segment):
    """Display speaker's text."""
    if segment['text']:
        st.write(f"\"{segment['text']}\"")
    else:
        st.write("(no speech detected)")

if __name__ == "__main__":
    main()
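One wrinkle: the refactored main() calls display_transcription() and display_summary(), but neither function is defined in the hunks shown, so as displayed the new app.py would raise a NameError on those tabs. A minimal sketch consistent with the tab code removed from the old version (the "summary" key matches process_audio's return dict):

```python
def display_transcription(results):
    """Display the full transcription text (mirrors the removed tab2 code)."""
    if "text" in results["transcription"]:
        st.write(results["transcription"]["text"])
    else:
        st.warning("No transcription available")

def display_summary(results):
    """Display the generated summary (reconstructed; not shown in this commit)."""
    if results.get("summary"):
        st.write(results["summary"])
    else:
        st.warning("No summary available")
```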
src/__init__.py
ADDED

@@ -0,0 +1,13 @@

"""
Initialize the src package.
"""
from src.models import SpeakerDiarizer, Transcriber, Summarizer
from src.utils import AudioProcessor, TimeFormatter

__all__ = [
    'SpeakerDiarizer',
    'Transcriber',
    'Summarizer',
    'AudioProcessor',
    'TimeFormatter'
]
src/models/__init__.py
ADDED

@@ -0,0 +1,8 @@

"""
Initialize the models package.
"""
from .diarization import SpeakerDiarizer
from .transcription import Transcriber
from .summarization import Summarizer

__all__ = ['SpeakerDiarizer', 'Transcriber', 'Summarizer']
src/models/diarization.py
ADDED

@@ -0,0 +1,44 @@

"""
Speaker Diarization Model Handler
Manages the pyannote-audio model for speaker diarization tasks.
"""

from pyannote.audio import Pipeline
import streamlit as st

class SpeakerDiarizer:
    def __init__(self, token: str):
        """Initialize the diarization model.

        Args:
            token (str): HuggingFace authentication token
        """
        self.token = token
        self.model = None

    def load_model(self):
        """Load the pyannote speaker diarization model."""
        try:
            self.model = Pipeline.from_pretrained(
                "pyannote/speaker-diarization",
                use_auth_token=self.token
            )
            return self.model
        except Exception as e:
            st.error(f"Error loading diarization model: {str(e)}")
            return None

    def process(self, audio_path: str):
        """Process audio file for speaker diarization.

        Args:
            audio_path (str): Path to the audio file

        Returns:
            dict: Diarization results
        """
        try:
            return self.model(audio_path)
        except Exception as e:
            st.error(f"Error in diarization: {str(e)}")
            return None
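pyannote/speaker-diarization is a gated model on the HuggingFace Hub, so the token must belong to an account that has accepted the model's user conditions — that is what the "necessary permissions" hint in load_models() is about. A standalone usage sketch, assuming a local WAV file and a token with access (both placeholders):

```python
from src.models.diarization import SpeakerDiarizer

diarizer = SpeakerDiarizer(token="hf_...")  # placeholder; gate must be accepted on the Hub
if diarizer.load_model():
    annotation = diarizer.process("audio.wav")  # placeholder path
    # process() returns a pyannote Annotation; iterate speaker turns
    # the same way TimeFormatter.format_speaker_segments does:
    for turn, _, speaker in annotation.itertracks(yield_label=True):
        print(f"{speaker}: {turn.start:.2f}s – {turn.end:.2f}s")
```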
src/models/summarization.py
ADDED

@@ -0,0 +1,44 @@

"""
Summarization Model Handler
Manages the BART model for text summarization.
"""

from transformers import pipeline
import torch
import streamlit as st

class Summarizer:
    def __init__(self):
        """Initialize the summarization model."""
        self.model = None

    def load_model(self):
        """Load the BART summarization model."""
        try:
            self.model = pipeline(
                "summarization",
                model="facebook/bart-large-cnn",
                device=0 if torch.cuda.is_available() else -1
            )
            return self.model
        except Exception as e:
            st.error(f"Error loading summarization model: {str(e)}")
            return None

    def process(self, text: str, max_length: int = 130, min_length: int = 30):
        """Process text for summarization.

        Args:
            text (str): Text to summarize
            max_length (int): Maximum length of summary
            min_length (int): Minimum length of summary

        Returns:
            str: Summarized text
        """
        try:
            summary = self.model(text, max_length=max_length, min_length=min_length)
            return summary
        except Exception as e:
            st.error(f"Error in summarization: {str(e)}")
            return None
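Note the docstring says process() returns a str, but the transformers summarization pipeline actually returns a list of dicts — which is why app.py indexes summary[0]["summary_text"]. A quick illustration of the shape:

```python
from src.models.summarization import Summarizer

summarizer = Summarizer()
if summarizer.load_model():
    out = summarizer.process("...long transcript text...")  # placeholder input
    # transformers' summarization pipeline returns: [{"summary_text": "..."}]
    print(out[0]["summary_text"])
```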
src/models/transcription.py
ADDED

@@ -0,0 +1,36 @@

"""
Transcription Model Handler
Manages the Whisper model for speech-to-text transcription.
"""

import whisper
import streamlit as st

class Transcriber:
    def __init__(self):
        """Initialize the transcription model."""
        self.model = None

    def load_model(self):
        """Load the Whisper transcription model."""
        try:
            self.model = whisper.load_model("base")
            return self.model
        except Exception as e:
            st.error(f"Error loading transcription model: {str(e)}")
            return None

    def process(self, audio_path: str):
        """Process audio file for transcription.

        Args:
            audio_path (str): Path to the audio file

        Returns:
            dict: Transcription results
        """
        try:
            return self.model.transcribe(audio_path)
        except Exception as e:
            st.error(f"Error in transcription: {str(e)}")
            return None
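Here whisper is the openai-whisper package, whose transcribe() returns a dict; its "text" and "segments" keys are exactly what app.py and TimeFormatter consume downstream. A sketch of the fields used:

```python
from src.models.transcription import Transcriber

transcriber = Transcriber()
if transcriber.load_model():
    result = transcriber.process("audio.wav")  # placeholder path to a local file
    print(result["text"])                  # full transcript (fed to the summarizer)
    for seg in result["segments"]:         # per-segment timing used by TimeFormatter
        print(seg["start"], seg["end"], seg["text"])
```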
src/utils/__init__.py
ADDED

@@ -0,0 +1,7 @@

"""
Initialize the utils package.
"""
from .audio_processor import AudioProcessor
from .formatter import TimeFormatter

__all__ = ['AudioProcessor', 'TimeFormatter']
src/utils/audio_processor.py
ADDED

@@ -0,0 +1,43 @@

"""
Audio Processing Utilities
Handles audio file preprocessing and standardization.
"""

from pydub import AudioSegment
import io
import tempfile
import os

class AudioProcessor:
    @staticmethod
    def standardize_audio(audio_file):
        """Standardize audio file to required format.

        Args:
            audio_file: Uploaded audio file

        Returns:
            str: Path to processed audio file
        """
        try:
            audio_bytes = io.BytesIO(audio_file.getvalue())

            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
                if audio_file.name.lower().endswith('.mp3'):
                    audio = AudioSegment.from_mp3(audio_bytes)
                else:
                    audio = AudioSegment.from_wav(audio_bytes)

                audio = audio.set_frame_rate(16000)
                audio = audio.set_channels(1)
                audio = audio.set_sample_width(2)

                audio.export(
                    tmp.name,
                    format="wav",
                    parameters=["-ac", "1", "-ar", "16000"]
                )
                return tmp.name

        except Exception as e:
            raise Exception(f"Error processing audio: {str(e)}")
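Two practical notes: pydub decodes MP3 and exports WAV by shelling out to ffmpeg, so ffmpeg must be installed in the runtime; and delete=False means the temp file survives the with block, so the caller owns cleanup (app.py's os.unlink). A sketch of that contract, with a more defensive try/finally than app.py's straight-line cleanup:

```python
import os
from src.utils.audio_processor import AudioProcessor

def analyze(uploaded_file):
    """uploaded_file: the st.file_uploader result (has .name and .getvalue())."""
    tmp_path = AudioProcessor.standardize_audio(uploaded_file)
    try:
        ...  # run diarization/transcription against tmp_path
    finally:
        # standardize_audio uses delete=False, so the caller must remove the file;
        # app.py does the equivalent os.unlink(tmp_path) after processing.
        os.unlink(tmp_path)
```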
src/utils/formatter.py
ADDED

@@ -0,0 +1,59 @@

"""
Formatting utilities for timestamps and speaker segments.
"""

class TimeFormatter:
    @staticmethod
    def format_timestamp(seconds: float) -> str:
        """Format seconds into MM:SS.ss format.

        Args:
            seconds (float): Time in seconds

        Returns:
            str: Formatted time string
        """
        minutes = int(seconds // 60)
        seconds = seconds % 60
        return f"{minutes:02d}:{seconds:05.2f}"

    @staticmethod
    def format_speaker_segments(diarization_result, transcription):
        """Format speaker segments with transcribed text.

        Args:
            diarization_result: Diarization model output
            transcription: Whisper transcription output

        Returns:
            list: Formatted speaker segments
        """
        if diarization_result is None:
            return []

        formatted_segments = []
        whisper_segments = transcription.get('segments', [])

        try:
            for turn, _, speaker in diarization_result.itertracks(yield_label=True):
                current_text = ""
                for w_segment in whisper_segments:
                    w_start = float(w_segment['start'])
                    w_end = float(w_segment['end'])

                    if (w_start >= turn.start and w_start < turn.end) or \
                       (w_end > turn.start and w_end <= turn.end):
                        current_text += w_segment['text'].strip() + " "

                formatted_segments.append({
                    'speaker': str(speaker),
                    'start': float(turn.start),
                    'end': float(turn.end),
                    'text': current_text.strip()
                })

        except Exception as e:
            print(f"Error formatting segments: {str(e)}")
            return []

        return formatted_segments
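A quick worked example of the timestamp format, plus the overlap rule: a Whisper segment is attached to a speaker turn when either of its endpoints falls inside the turn, so a segment spanning a turn boundary is credited to both speakers, while a long segment that fully contains a short turn matches neither endpoint test.

```python
from src.utils.formatter import TimeFormatter

print(TimeFormatter.format_timestamp(75.5))  # -> "01:15.50"
print(TimeFormatter.format_timestamp(9.0))   # -> "00:09.00"

# Overlap rule: whisper segment [4.0, 7.0] matches a turn (5.0, 10.0)
# because its end 7.0 lies inside the turn; it also matches a turn
# (0.0, 5.0) because its start 4.0 lies inside that one.
```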