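"""Gradio app: transcribe speech with Whisper and summarize it with BART.

Audio is recorded or uploaded in the browser, split into 30-second chunks
for long-form transcription with openai/whisper-base, then condensed by
facebook/bart-large-cnn. Written to run on Hugging Face Spaces (ZeroGPU-aware
via the `spaces` package).
"""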
import gradio as gr
import torch
from transformers import pipeline
import librosa
import soundfile as sf
import spaces
import os

def split_audio(audio_data, sr, chunk_duration=30):
    """Split audio into chunks of chunk_duration seconds."""
    chunks = []
    for start in range(0, len(audio_data), int(chunk_duration * sr)):
        end = start + int(chunk_duration * sr)
        chunks.append(audio_data[start:end])
    return chunks
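# Example: at sr=16_000 and chunk_duration=30, a 75 s recording
# (1_200_000 samples) yields chunks of 480_000, 480_000, and 240_000 samples.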

def transcribe_long_audio(audio_input, transcriber, chunk_duration=30):
    """Transcribe long audio by splitting it into smaller chunks."""
    # Accept either a file path or raw audio from the Gradio component
    if isinstance(audio_input, str):
        audio_data, sr = librosa.load(audio_input, sr=None)
    else:
        # Gradio's type="numpy" delivers a (sample_rate, data) tuple
        sr, audio_data = audio_input
        # Downmix stereo to mono and normalize int16 samples to float32
        if audio_data.ndim > 1:
            audio_data = audio_data.mean(axis=1)
        audio_data = audio_data.astype("float32")
        if audio_data.size and abs(audio_data).max() > 1.0:
            audio_data /= 32768.0

    chunks = split_audio(audio_data, sr, chunk_duration)
    transcriptions = []
    for i, chunk in enumerate(chunks):
        temp_path = f"temp_chunk_{i}.wav"
        sf.write(temp_path, chunk, sr)  # Persist the chunk as a WAV file
        try:
            transcriptions.append(transcriber(temp_path)["text"])
        finally:
            os.remove(temp_path)  # Clean up even if transcription fails
    return " ".join(transcriptions)

# Use the GPU when available; transformers pipelines take device=0
# (first GPU) or device=-1 (CPU)
device = 0 if torch.cuda.is_available() else -1

try:
    # Load both models once at startup, on an explicit device
    transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=device)
except Exception as e:
    print(f"Error loading models: {e}")
    raise

# On ZeroGPU Spaces, @spaces.GPU belongs on the inference function, not on
# main(), which blocks in launch(). `duration` is the per-call GPU budget in
# seconds; 3 s is far too short for transcription plus summarization.
@spaces.GPU(duration=120)
def process_audio(audio_input):
    try:
        # Transcribe the audio in 30-second chunks (long-form support)
        transcription = transcribe_long_audio(audio_input, transcriber, chunk_duration=30)
        # Summarize the transcription; inputs beyond BART's 1024-token
        # limit may be truncated, so very long transcripts are summarized
        # from their beginning only
        summary = summarizer(transcription, max_length=50, min_length=10, do_sample=False)[0]["summary_text"]
        return transcription, summary
    except Exception as e:
        return f"Error processing audio: {e}", ""

def main():

    # Gradio Interface with Horizontal Layout
    with gr.Blocks() as interface:
        with gr.Row():
            with gr.Column():
                # Gradio 4.x uses `sources` (a list); allowing both matches the label
                audio_input = gr.Audio(sources=["microphone", "upload"], type="numpy", label="Record or Upload Audio")
                process_button = gr.Button("Process Audio")
            with gr.Column():
                transcription_output = gr.Textbox(label="Full Transcription", lines=10)
                summary_output = gr.Textbox(label="Summary", lines=5)

        process_button.click(
            process_audio,
            inputs=[audio_input],
            outputs=[transcription_output, summary_output]
        )

    # share=True creates a temporary public link for local runs; Hugging
    # Face Spaces ignores it (with a warning) and serves the app directly
    interface.launch(share=True)

# Run the main function
if __name__ == "__main__":
    main()