# NOTE: Hugging Face Spaces web-page residue removed (build status, file size,
# commit-hash column, and line-number gutter were scraped into the file and
# are not part of the source).
import gradio as gr
import torch
from transformers import pipeline
import librosa
import soundfile as sf
import spaces
import os
def split_audio(audio_data, sr, chunk_duration=30):
    """Slice an audio signal into consecutive fixed-length chunks.

    Args:
        audio_data: Sequence of audio samples (e.g. a 1-D numpy array).
        sr: Sample rate of the signal in Hz.
        chunk_duration: Target length of each chunk, in seconds.

    Returns:
        A list of slices of ``audio_data``; the last slice may be shorter
        when the signal length is not an exact multiple of the chunk size.
    """
    step = int(chunk_duration * sr)
    return [audio_data[pos:pos + step] for pos in range(0, len(audio_data), step)]
def transcribe_long_audio(audio_input, transcriber, chunk_duration=30):
    """Transcribe long audio by splitting it into smaller chunks.

    Args:
        audio_input: Either a path to an audio file, or the
            ``(sample_rate, data)`` tuple produced by ``gr.Audio(type="numpy")``.
        transcriber: ASR pipeline; called with a WAV file path, must return a
            dict with a ``"text"`` key.
        chunk_duration: Chunk length in seconds.

    Returns:
        The space-joined transcription of all chunks.
    """
    if isinstance(audio_input, str):  # File path
        audio_data, sr = librosa.load(audio_input, sr=None)
    else:
        # gr.Audio(type="numpy") yields (sample_rate, data) in that order;
        # the previous unpacking had the two elements swapped, passing the
        # integer sample rate as the signal.
        sr, audio_data = audio_input
    transcriptions = []
    for i, chunk in enumerate(split_audio(audio_data, sr, chunk_duration)):
        temp_path = f"temp_chunk_{i}.wav"
        sf.write(temp_path, chunk, sr)  # Save chunk as WAV
        try:
            transcriptions.append(transcriber(temp_path)["text"])
        finally:
            # Remove the temp file even when transcription raises, so failed
            # requests don't leak WAV files into the working directory.
            os.remove(temp_path)
    return " ".join(transcriptions)
def main():
    """Build and launch the Gradio transcription + summarization app."""
    device = 0 if torch.cuda.is_available() else -1
    try:
        transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)
        summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    except Exception as e:
        print(f"Error loading models: {e}")
        raise

    # On ZeroGPU Spaces, @spaces.GPU must wrap the per-request inference
    # function, not main(): main() blocks forever in launch(), so decorating
    # it would hold (and, with duration=3, immediately exhaust) the GPU
    # allocation before a single chunk was transcribed.
    @spaces.GPU(duration=120)
    def process_audio(audio_input):
        """Transcribe the uploaded/recorded audio, then summarize it."""
        try:
            transcription = transcribe_long_audio(audio_input, transcriber, chunk_duration=30)
            summary = summarizer(transcription, max_length=50, min_length=10, do_sample=False)[0]["summary_text"]
            return transcription, summary
        except Exception as e:
            # Surface the error in the UI instead of crashing the request.
            return f"Error processing audio: {e}", ""

    with gr.Blocks() as interface:
        with gr.Row():
            with gr.Column():
                # No 'source' argument; recording enabled by default
                audio_input = gr.Audio(type="numpy", label="Record or Upload Audio")
                process_button = gr.Button("Process Audio")
            with gr.Column():
                transcription_output = gr.Textbox(label="Full Transcription", lines=10)
                summary_output = gr.Textbox(label="Summary", lines=5)

        process_button.click(
            process_audio,
            inputs=[audio_input],
            outputs=[transcription_output, summary_output]
        )
    interface.launch(share=True)