import torch
import whisper
import numpy as np
import torchaudio as ta
import gradio as gr
from model_utils import get_processor, get_model, get_whisper_model_small, get_device
from config import SAMPLING_RATE, CHUNK_LENGTH_S
import subprocess


def resample_with_ffmpeg(input_file, output_file, target_sr=16000):
    # -y: overwrite the output file if it already exists, so repeated calls don't fail
    command = [
        'ffmpeg', '-y', '-i', input_file, '-ar', str(target_sr), output_file
    ]
    subprocess.run(command, check=True)


def detect_language(audio):
    whisper_model = get_whisper_model_small()

    # Save the input audio to a temporary file
    # (Gradio's numpy audio input is a (sample_rate, data) tuple)
    ta.save("input_audio.wav", torch.tensor(audio[1]).unsqueeze(0), audio[0])

    # Resample if necessary using ffmpeg
    if audio[0] != SAMPLING_RATE:
        resample_with_ffmpeg("input_audio.wav", "resampled_audio.wav", target_sr=SAMPLING_RATE)
        audio_tensor, _ = ta.load("resampled_audio.wav")
    else:
        audio_tensor = torch.tensor(audio[1]).float()

    # Ensure the audio is in the correct shape (mono)
    if audio_tensor.dim() == 2:
        audio_tensor = audio_tensor.mean(dim=0)

    # Use Whisper's preprocessing
    audio_tensor = whisper.pad_or_trim(audio_tensor)
    print(f"Audio length after pad/trim: {audio_tensor.shape[-1] / SAMPLING_RATE} seconds")
    mel = whisper.log_mel_spectrogram(audio_tensor).to(whisper_model.device)

    # Detect language
    _, probs = whisper_model.detect_language(mel)
    detected_lang = max(probs, key=probs.get)

    print(f"Audio shape: {audio_tensor.shape}")
    print(f"Mel spectrogram shape: {mel.shape}")
    print(f"Detected language: {detected_lang}")
    print("Language probabilities:", probs)

    return detected_lang


def process_long_audio(audio, task="transcribe", language=None):
    # ... (rest of the function remains the same)


def process_audio(audio):
    if audio is None:
        return "No file uploaded", "", ""

    detected_lang = detect_language(audio)
    transcription = process_long_audio(audio, task="transcribe")
    translation = process_long_audio(audio, task="translate", language=detected_lang)

    return detected_lang, transcription, translation


# Gradio interface
iface = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(),
    outputs=[
        gr.Textbox(label="Detected Language"),
        gr.Textbox(label="Transcription", lines=5),
        gr.Textbox(label="Translation", lines=5)
    ],
    title="Audio Transcription and Translation",
    description="Upload an audio file to detect its language, transcribe, and translate it.",
    allow_flagging="never",
    css=".output-textbox { font-family: 'Noto Sans Devanagari', sans-serif; font-size: 18px; }"
)
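

# Note: the snippet above only constructs the interface; it never starts the server.
# A minimal, assumed entry point (not part of the original code) would be:
if __name__ == "__main__":
    iface.launch()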