import gradio as gr
import librosa
import torch
from transformers import (
    Wav2Vec2ForCTC,
    Wav2Vec2Processor,
    MBartForConditionalGeneration,
    MBart50Tokenizer,
)

# Load the pre-trained Darija ASR model and processor from the Hugging Face Hub
model = Wav2Vec2ForCTC.from_pretrained("boumehdi/wav2vec2-large-xlsr-moroccan-darija")
processor = Wav2Vec2Processor.from_pretrained("boumehdi/wav2vec2-large-xlsr-moroccan-darija")

# Load the mBART-50 translation model. mBART-50 uses locale-style language codes
# such as "ar_AR" and "en_XX"; plain "ar"/"en" are not valid codes.
translation_model = MBartForConditionalGeneration.from_pretrained(
    "facebook/mbart-large-50-many-to-many-mmt"
)
translation_tokenizer = MBart50Tokenizer.from_pretrained(
    "facebook/mbart-large-50-many-to-many-mmt", src_lang="ar_AR"
)


def transcribe_audio(audio):
    # Load the uploaded/recorded audio at the 16 kHz sampling rate the model expects
    audio_array, sr = librosa.load(audio, sr=16000)

    # Convert the waveform into model input features
    input_values = processor(
        audio_array, sampling_rate=sr, return_tensors="pt", padding=True
    ).input_values

    # Run the ASR model without tracking gradients (inference only)
    with torch.no_grad():
        logits = model(input_values).logits

    # Take the most likely token at each timestep and decode to text (Darija transcription)
    tokens = torch.argmax(logits, dim=-1)
    transcription = processor.decode(tokens[0])

    # Translate the Darija transcription to English
    translation = translate_text(transcription)

    return transcription, translation


def translate_text(text):
    # Tokenize the text to translate
    inputs = translation_tokenizer(text, return_tensors="pt")

    # Force English ("en_XX") as the target language for generation
    translated_tokens = translation_model.generate(
        **inputs,
        forced_bos_token_id=translation_tokenizer.lang_code_to_id["en_XX"],
    )

    # Decode the generated tokens into plain text
    translated_text = translation_tokenizer.batch_decode(
        translated_tokens, skip_special_tokens=True
    )[0]
    return translated_text


# Gradio interface for uploading audio or recording from the browser
demo = gr.Interface(
    fn=transcribe_audio,
    inputs=gr.Audio(type="filepath"),
    outputs=["text", "text"],  # transcription and translation
    live=True,
)

demo.launch(share=True)