File size: 4,841 Bytes
c569b48
64f2bf5
 
aaac499
 
 
 
 
9148c64
aaac499
64f2bf5
 
 
 
9148c64
 
aaac499
9148c64
 
64f2bf5
aaac499
64f2bf5
aaac499
 
81e4ee2
 
 
 
 
f36e52e
aaac499
 
81e4ee2
aaac499
 
 
 
 
 
 
 
 
 
 
64f2bf5
aaac499
9148c64
 
 
aaac499
 
64f2bf5
aaac499
9148c64
 
 
 
 
 
aaac499
64f2bf5
aaac499
 
 
 
 
64f2bf5
aaac499
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f36e52e
81e4ee2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import gradio as gr
from audio_processing import process_audio, print_results, load_models
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForQuestionAnswering
import spaces
import torch

# Choose the torch device once at import time: "cuda" when torch reports
# a usable GPU, "cpu" otherwise. Both names are read by the handlers below.
cuda_available = torch.cuda.is_available()
if cuda_available:
    device = "cuda"
else:
    device = "cpu"

# Everything is loaded eagerly at module import so request handlers can reuse
# the same model objects instead of reloading them on every call.
print("Loading models...")
load_models()  # Whisper and diarization models (from audio_processing)

# Summarization model and its tokenizer (facebook/bart-large-cnn).
summarizer_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")
summarizer_model = summarizer_model.to(device)
summarizer_tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")

# Extractive question-answering model and its tokenizer
# (distilbert-base-cased-distilled-squad).
qa_model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-cased-distilled-squad")
qa_model = qa_model.to(device)
qa_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased-distilled-squad")
print("Models loaded successfully.")

@spaces.GPU
def transcribe_audio(audio_file, translate, model_size):
    """Transcribe an audio file and render the result as a text report.

    Args:
        audio_file: Audio input forwarded unchanged to ``process_audio``.
        translate: When truthy, each segment's ``'translated'`` text is added
            to the report and used (instead of ``'text'``) for the
            concatenated transcript.
        model_size: Whisper model size name, forwarded to ``process_audio``
            and echoed in the report header.

    Returns:
        A ``(report, full_text)`` tuple: ``report`` is the human-readable
        listing of language changes and per-segment transcription;
        ``full_text`` is every segment's text joined together, each piece
        followed by a single space.
    """
    language_segments, final_segments = process_audio(
        audio_file, translate=translate, model_size=model_size
    )

    # Collect report fragments and join once at the end.
    report_parts = ["Detected language changes:\n\n"]
    for lang_span in language_segments:
        report_parts.append(f"Language: {lang_span['language']}\n")
        report_parts.append(
            f"Time: {lang_span['start']:.2f}s - {lang_span['end']:.2f}s\n\n"
        )

    report_parts.append(
        f"Transcription with language detection and speaker diarization (using {model_size} model):\n\n"
    )

    spoken_parts = []
    for seg in final_segments:
        report_parts.append(
            f"[{seg['start']:.2f}s - {seg['end']:.2f}s] ({seg['language']}) {seg['speaker']}:\n"
        )
        report_parts.append(f"Original: {seg['text']}\n")
        if translate:
            spoken = seg['translated']
            report_parts.append(f"Translated: {spoken}\n")
        else:
            spoken = seg['text']
        # Each piece keeps its trailing space, so the joined text ends in one.
        spoken_parts.append(spoken + " ")
        report_parts.append("\n")

    return "".join(report_parts), "".join(spoken_parts)

@spaces.GPU
def summarize_text(text):
    """Summarize ``text`` with the module-level BART summarizer.

    Args:
        text: The text to summarize. The tokenizer truncates it to at most
            1024 tokens before generation.

    Returns:
        The decoded summary string (generated with ``max_length=150``,
        ``min_length=50``, no sampling), or ``""`` when ``text`` is empty or
        whitespace-only.
    """
    # With nothing to summarize, skip generation entirely: min_length=50 would
    # otherwise force the model to emit at least 50 tokens about no input.
    if not text or not text.strip():
        return ""

    inputs = summarizer_tokenizer(
        text, max_length=1024, truncation=True, return_tensors="pt"
    ).to(device)
    # Pass the tokenizer's attention mask explicitly rather than leaving
    # generate() to infer it from the input ids.
    summary_ids = summarizer_model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=150,
        min_length=50,
        do_sample=False,
    )
    return summarizer_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

@spaces.GPU
def answer_question(context, question):
    """Extract the answer to ``question`` from ``context`` with the QA model.

    The context is split into overlapping windows so that transcripts longer
    than the model's input limit are handled instead of crashing the forward
    pass. The highest-scoring span across all windows is returned.

    Args:
        context: Text to search for the answer.
        question: The question to answer.

    Returns:
        The decoded answer span with special tokens removed, or ``""`` when
        either ``context`` or ``question`` is empty or whitespace-only.

    Note:
        Only the context is truncated/windowed. A question long enough to
        leave no room for context inside one window is not handled here.
    """
    if not context or not context.strip() or not question or not question.strip():
        return ""

    # Window size and overlap in tokens; each window must fit the model's
    # 512-position limit.
    max_window_tokens = 384
    window_overlap = 128

    encoded = qa_tokenizer(
        question,
        context,
        truncation="only_second",
        max_length=max_window_tokens,
        stride=window_overlap,
        return_overflowing_tokens=True,
        padding=True,
        return_tensors="pt",
    )
    # The overflow bookkeeping tensor is not a model input, so drop it.
    model_inputs = {
        key: value.to(device)
        for key, value in encoded.items()
        if key != "overflow_to_sample_mapping"
    }

    # Inference only: avoid building an autograd graph.
    with torch.no_grad():
        outputs = qa_model(**model_inputs)

    # Padding positions must never be selected as part of an answer.
    is_real_token = model_inputs["attention_mask"].bool()
    start_logits = outputs.start_logits.masked_fill(~is_real_token, float("-inf"))
    end_logits = outputs.end_logits.masked_fill(~is_real_token, float("-inf"))

    best_score = float("-inf")
    best_ids = None
    for window_ids, window_start, window_end in zip(
        model_inputs["input_ids"], start_logits, end_logits
    ):
        start = int(torch.argmax(window_start))
        # Search for the end only at or after the start so the span is
        # never inverted (which previously produced an empty answer).
        end = start + int(torch.argmax(window_end[start:]))
        score = float(window_start[start] + window_end[end])
        if best_ids is None or score > best_score:
            best_score = score
            best_ids = window_ids[start:end + 1]

    return qa_tokenizer.decode(best_ids, skip_special_tokens=True)

@spaces.GPU
def process_and_summarize(audio_file, translate, model_size):
    """Transcribe an audio file, then summarize the resulting text.

    Args:
        audio_file: Audio input forwarded to ``transcribe_audio``.
        translate: Translation flag forwarded to ``transcribe_audio``.
        model_size: Whisper model size forwarded to ``transcribe_audio``.

    Returns:
        A ``(transcription, summary)`` tuple: the formatted transcription
        report and the summary of the concatenated segment text.
    """
    report, spoken_text = transcribe_audio(audio_file, translate, model_size)
    return report, summarize_text(spoken_text)

@spaces.GPU
def qa_interface(audio_file, translate, model_size, question):
    """Transcribe an audio file and answer a question about its content.

    Args:
        audio_file: Audio input forwarded to ``transcribe_audio``.
        translate: Translation flag forwarded to ``transcribe_audio``.
        model_size: Whisper model size forwarded to ``transcribe_audio``.
        question: The question to answer from the transcript.

    Returns:
        The answer produced by ``answer_question`` over the concatenated
        segment text.
    """
    # Only the concatenated text is needed here; the formatted report is unused.
    spoken_text = transcribe_audio(audio_file, translate, model_size)[1]
    return answer_question(spoken_text, question)

# Main interface
# Builds a two-tab Gradio Blocks app: one tab transcribes and summarizes an
# audio file, the other answers a question about it. Components are declared
# in display order, so statement order here determines the page layout.
with gr.Blocks() as iface:
    gr.Markdown("# WhisperX Audio Transcription, Translation, Summarization, and QA (with ZeroGPU support)")
    
    # Tab 1: audio in -> transcription report + summary out.
    with gr.Tab("Transcribe and Summarize"):
        audio_input = gr.Audio(type="filepath")
        translate_checkbox = gr.Checkbox(label="Enable Translation")
        model_dropdown = gr.Dropdown(choices=["tiny", "base", "small", "medium", "large", "large-v2", "large-v3"], label="Whisper Model Size", value="small")
        transcribe_button = gr.Button("Transcribe and Summarize")
        transcription_output = gr.Textbox(label="Transcription")
        summary_output = gr.Textbox(label="Summary")
        
        # The inputs list matches process_and_summarize's parameter order;
        # its two return values fill the two output textboxes.
        transcribe_button.click(
            process_and_summarize,
            inputs=[audio_input, translate_checkbox, model_dropdown],
            outputs=[transcription_output, summary_output]
        )
    
    # Tab 2: audio + question in -> answer out. It has its own set of input
    # components, separate from the first tab's.
    with gr.Tab("Question Answering"):
        qa_audio_input = gr.Audio(type="filepath")
        qa_translate_checkbox = gr.Checkbox(label="Enable Translation")
        qa_model_dropdown = gr.Dropdown(choices=["tiny", "base", "small", "medium", "large", "large-v2", "large-v3"], label="Whisper Model Size", value="small")
        question_input = gr.Textbox(label="Ask a question about the audio")
        qa_button = gr.Button("Get Answer")
        answer_output = gr.Textbox(label="Answer")
        
        # The inputs list matches qa_interface's parameter order. The audio is
        # transcribed again on every click (qa_interface calls transcribe_audio).
        qa_button.click(
            qa_interface,
            inputs=[qa_audio_input, qa_translate_checkbox, qa_model_dropdown, question_input],
            outputs=answer_output
        )

    # Static informational footer shown below both tabs.
    gr.Markdown(
        """
        ## ZeroGPU Support
        This application supports ZeroGPU for Hugging Face Spaces pro users. 
        GPU-intensive tasks are automatically optimized for better performance.
        """
    )

# Start the Gradio server; this runs unconditionally when the module is executed.
iface.launch()