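"""Gradio app: record or upload audio, transcribe it in 30-second chunks with
openai/whisper-base, and summarize the transcript with facebook/bart-large-cnn."""
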
import gradio as gr
import torch
from transformers import pipeline
import librosa
import soundfile as sf
import spaces
import os
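# Implied dependencies (PyPI names assumed): gradio, torch, transformers,
# librosa, soundfile, spaces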

def split_audio(audio_data, sr, chunk_duration=30):
    """Split audio into chunks of chunk_duration seconds."""
    chunks = []
    for start in range(0, len(audio_data), int(chunk_duration * sr)):
        end = start + int(chunk_duration * sr)
        chunks.append(audio_data[start:end])
    return chunks
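
# Sanity check (assumes numpy imported as np): a 65 s mono clip at 16 kHz
# splits into 30 s + 30 s + 5 s chunks:
#   >>> [len(c) for c in split_audio(np.zeros(65 * 16000), 16000)]
#   [480000, 480000, 80000]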

def transcribe_long_audio(audio_input, transcriber, chunk_duration=30):
    """Transcribe long audio by splitting it into smaller chunks."""
    if isinstance(audio_input, str):  # File path
        audio_data, sr = librosa.load(audio_input, sr=None)
    else:  # Gradio's type="numpy" audio arrives as a (sample_rate, data) tuple
        sr, audio_data = audio_input

    chunks = split_audio(audio_data, sr, chunk_duration)
    transcriptions = []
    for i, chunk in enumerate(chunks):
        temp_path = f"temp_chunk_{i}.wav"
        sf.write(temp_path, chunk, sr)  # Save chunk as WAV for the pipeline
        try:
            transcription = transcriber(temp_path)["text"]
        finally:
            os.remove(temp_path)  # Remove temp file even if transcription fails
        transcriptions.append(transcription)
    return " ".join(transcriptions)

def main():
    # 0 targets the first CUDA device for the ASR pipeline; -1 falls back to CPU
    device = 0 if torch.cuda.is_available() else -1

    try:
        transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)
        summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    except Exception as e:
        print(f"Error loading models: {e}")
        raise

    # On ZeroGPU Spaces, @spaces.GPU belongs on the GPU-bound handler rather
    # than on main(); duration is the per-call GPU budget in seconds, and 3 s
    # is far too short for chunked transcription plus summarization
    # (120 s here is an assumed, more realistic budget).
    @spaces.GPU(duration=120)
    def process_audio(audio_input):
        try:
            transcription = transcribe_long_audio(audio_input, transcriber, chunk_duration=30)
            # truncation=True guards against transcripts that exceed BART's
            # 1024-token input limit
            summary = summarizer(transcription, max_length=50, min_length=10, do_sample=False, truncation=True)[0]["summary_text"]
            return transcription, summary
        except Exception as e:
            return f"Error processing audio: {e}", ""

    with gr.Blocks() as interface:
        with gr.Row():
            with gr.Column():
                # Gradio 4.x removed the 'source' argument; 'sources' now
                # enables both microphone and upload by default
                audio_input = gr.Audio(type="numpy", label="Record or Upload Audio")
                process_button = gr.Button("Process Audio")
            with gr.Column():
                transcription_output = gr.Textbox(label="Full Transcription", lines=10)
                summary_output = gr.Textbox(label="Summary", lines=5)

        process_button.click(
            process_audio,
            inputs=[audio_input],
            outputs=[transcription_output, summary_output]
        )

    # share=True opens a public tunnel for local runs; hosted Spaces ignore it
    interface.launch(share=True)

if __name__ == "__main__":
    main()