import gradio as gr
import librosa
import torch
from transformers import (
    Wav2Vec2ForCTC,
    Wav2Vec2Processor,
    MBartForConditionalGeneration,
    MBart50Tokenizer,
)

# Load the pre-trained Moroccan Darija speech-recognition model and its processor.
model = Wav2Vec2ForCTC.from_pretrained("boumehdi/wav2vec2-large-xlsr-moroccan-darija")
processor = Wav2Vec2Processor.from_pretrained("boumehdi/wav2vec2-large-xlsr-moroccan-darija")

# Load the mBART-50 many-to-many translation model; the source language is set
# to Arabic (ar_AR), the closest available code for Darija.
translation_model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
translation_tokenizer = MBart50Tokenizer.from_pretrained(
    "facebook/mbart-large-50-many-to-many-mmt", src_lang="ar_AR"
)


def transcribe_audio(audio):
    # Resample the uploaded audio to 16 kHz, the rate wav2vec2 was trained on.
    audio_array, sr = librosa.load(audio, sr=16000)
    input_values = processor(
        audio_array, sampling_rate=sr, return_tensors="pt", padding=True
    ).input_values
    # Run CTC inference and greedily pick the most likely token per frame.
    with torch.no_grad():
        logits = model(input_values).logits
    tokens = torch.argmax(logits, dim=-1)
    transcription = processor.decode(tokens[0])
    translation = translate_text(transcription)
    return transcription, translation


def translate_text(text):
    inputs = translation_tokenizer(text, return_tensors="pt")
    # Force the decoder to start with the English language token (en_XX).
    translated_tokens = translation_model.generate(
        **inputs,
        forced_bos_token_id=translation_tokenizer.lang_code_to_id["en_XX"],
    )
    translated_text = translation_tokenizer.batch_decode(
        translated_tokens, skip_special_tokens=True
    )[0]
    return translated_text


# Gradio UI: upload an audio file, get back the transcription and its translation.
with gr.Blocks() as demo:
    gr.Markdown("# Speech-to-Text and Translation")
    audio_input = gr.Audio(type="filepath")
    submit_button = gr.Button("Submit")
    transcription_output = gr.Textbox(label="Transcription")
    translation_output = gr.Textbox(label="Translation")
    submit_button.click(
        transcribe_audio,
        inputs=[audio_input],
        outputs=[transcription_output, translation_output],
    )

demo.launch()
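
# Minimal sketch of exercising the pipeline directly, without the Gradio UI.
# Assumes a local audio file "sample.wav" (hypothetical path; any format
# librosa can read should work). Since demo.launch() above blocks, run this
# in a separate session or before launching:
#
#     transcription, translation = transcribe_audio("sample.wav")
#     print("Transcription:", transcription)
#     print("Translation:", translation)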