File size: 2,198 Bytes
2459bb2
 
4206062
4eaea04
2459bb2
4eaea04
4206062
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d9cee8f
a62c4d4
 
4206062
 
2a84333
4206062
 
 
4eaea04
 
 
4206062
 
 
2a84333
ae87c60
 
4206062
2459bb2
4206062
 
 
 
 
2459bb2
 
4206062
56925b6
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import gradio as gr
import whisper
import librosa
import numpy as np

# Load the Whisper model once, at import time, so that every transcription
# call in this module reuses the same instance instead of reloading it.
# The "tiny" checkpoint is used for faster performance.
model = whisper.load_model("tiny")

# Split an audio file into consecutive fixed-length chunks (5 seconds by default)
def chunk_audio(audio_file, chunk_size=5):
    """Load an audio file and cut it into consecutive chunks.

    The file is decoded with ``librosa.load`` at a 16 kHz sample rate and
    split into pieces of ``chunk_size`` seconds. A shorter trailing piece
    is kept, so no samples are dropped.

    Args:
        audio_file: Path (or any other source ``librosa.load`` accepts) of
            the audio to split.
        chunk_size: Chunk length in seconds. May be fractional, but must be
            positive and cover at least one sample.

    Returns:
        A ``(audio_chunks, sr)`` tuple: the list of sample-array slices in
        playback order (empty when the file holds no samples) and the
        sample rate the audio was loaded at.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or is so small that
            a chunk would contain no samples.
    """
    # Reject bad sizes before paying for decoding. Written as `not > 0`
    # so that NaN is rejected as well.
    if not chunk_size > 0:
        raise ValueError(
            f"chunk_size must be a positive number of seconds, got {chunk_size!r}"
        )

    audio, sr = librosa.load(audio_file, sr=16000)

    # Convert the duration to a whole number of samples once. Slice bounds
    # must be integers, so a fractional chunk_size cannot be used directly.
    samples_per_chunk = int(round(chunk_size * sr))
    if samples_per_chunk < 1:
        raise ValueError(
            f"chunk_size of {chunk_size!r} s is shorter than one sample at {sr} Hz"
        )

    # Stepping through the signal yields every full chunk and, when the
    # length is not an exact multiple, the shorter remainder as the last one.
    audio_chunks = [
        audio[start:start + samples_per_chunk]
        for start in range(0, len(audio), samples_per_chunk)
    ]

    return audio_chunks, sr

# Transcribe an audio file chunk by chunk with the module-level Whisper model
def transcribe_audio_in_chunks(audio_file):
    """Transcribe ``audio_file`` in 5-second chunks and return the labelled text.

    Each chunk is transcribed on its own and contributes one line of the
    form ``"Chunk N: <text>"`` (numbered from 1). The full transcript is
    returned only after every chunk has been processed.

    Args:
        audio_file: Path of the audio file to transcribe, or ``None`` when
            nothing was provided.

    Returns:
        The concatenated per-chunk lines, or the message
        ``"No audio file provided."`` when ``audio_file`` is ``None``.
    """
    if audio_file is None:
        return "No audio file provided."

    # The sample rate comes back with the chunks but is not needed here.
    segments, _sample_rate = chunk_audio(audio_file, chunk_size=5)

    lines = []
    for number, segment in enumerate(segments, start=1):
        # Hand the model its own array copy of the chunk's samples.
        samples = np.array(segment)
        text = model.transcribe(samples)["text"]
        lines.append(f"Chunk {number}: {text}\n")

    return "".join(lines)

# Gradio UI: one audio upload wired to the chunked transcription function.
# NOTE: the function returns a single string after all chunks are processed,
# so the transcript is displayed in one piece once processing finishes.
iface = gr.Interface(
    title="Whisper Audio Transcription with Chunking",
    description="Upload an audio file, and Whisper will transcribe it in real-time as chunks.",
    fn=transcribe_audio_in_chunks,  # receives the uploaded audio and returns the transcript
    inputs=gr.Audio(type="filepath"),  # pass the upload to fn as a file path
    outputs="text",  # show the returned transcript as plain text
)

# Start the app with Gradio's default launch settings. To get a public,
# shareable link (e.g. when running in Colab), call launch(share=True) instead.
iface.launch()