# MMS-ASR-Fula / app.py — commit 19f28fb ("Update app.py", yaya-sy), 1.94 kB
import gradio as gr
from transformers import Wav2Vec2ForCTC, AutoProcessor, pipeline
from optimum.bettertransformer import BetterTransformer
import torch
import librosa
import json
# Load the language-name -> ISO-code mapping; its keys are offered as the
# choices of the language dropdown in the UI.
with open('ISO_codes.json', 'r', encoding='utf-8') as file:
    iso_codes = json.load(file)

languages = list(iso_codes)

# Hub id of the checkpoint used for both the processor and the CTC model.
model_id = "cawoylel/windanam_mms-1b-tts_v2"

processor = AutoProcessor.from_pretrained(model_id)
model = Wav2Vec2ForCTC.from_pretrained(model_id)
# Convert the model with Optimum's BetterTransformer before inference.
model = BetterTransformer.transform(model)

# `pipeline` can only infer the tokenizer and feature extractor when `model`
# is a string id. Since an instantiated model is passed here, hand over the
# processor's components explicitly; otherwise pipeline construction fails.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
)
def transcribe(audio_file_mic=None, audio_file_upload=None, language=None):
    """Transcribe a recorded or uploaded audio file to text.

    Args:
        audio_file_mic: Path to a microphone recording, if any. Takes
            precedence over the upload when both are provided.
        audio_file_upload: Path to an uploaded audio file, if any.
        language: Value of the language dropdown. Accepted so the signature
            matches the three inputs wired into the interface; it is
            currently not used to select a model or adapter.

    Returns:
        The transcription text, or a prompt message when no audio was given.
    """
    audio_file = audio_file_mic or audio_file_upload
    if not audio_file:
        return "Please upload an audio file or record one"

    # Decode and resample to 16 kHz in a single step, then feed the resampled
    # waveform (not the original file path) to the pipeline so the resampling
    # actually takes effect.
    target_sample_rate = 16000
    speech, _ = librosa.load(audio_file, sr=target_sample_rate)
    result = pipe({"raw": speech, "sampling_rate": target_sample_rate})
    return result["text"]
# Markdown text handed to the interface as its description.
description = """Automatic Speech Recognition with [MMS](https://ai.facebook.com/blog/multilingual-model-speech-recognition/) (Massively Multilingual Speech) by Meta.
Supports [1162 languages](https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html). Read the paper for more details: [Scaling Speech Technology to 1,000+ Languages](https://arxiv.org/abs/2305.13516)."""

# Input widgets, listed in the order their values are passed to `transcribe`.
input_components = [
    gr.Audio(source="microphone", type="filepath", label="Record Audio"),
    gr.Audio(source="upload", type="filepath", label="Upload Audio"),
    gr.Dropdown(choices=languages, label="Language", value="English (eng)"),
]

# Single text box that displays the value returned by `transcribe`.
output_component = gr.Textbox(label="Transcription")

iface = gr.Interface(
    fn=transcribe,
    inputs=input_components,
    outputs=output_component,
    description=description,
)

# Start the web app.
iface.launch()