Spaces:

Mohssinibra
/

STTDARIJAAPI

Running

App Files Files Community

STTDARIJAAPI / app.py

Mohssinibra

gradio

e7d0ead verified 5 months ago

raw

history blame

1.77 kB

	import gradio as gr
	import librosa
	import torch
	from transformers import Wav2Vec2CTCTokenizer, Wav2Vec2ForCTC, Wav2Vec2Processor

	# Load tokenizer, processor, and model.
	# The tokenizer is built from the local vocab.json. The word delimiter must be
	# the bare pipe character: writing it as "\|" is an invalid escape sequence
	# that yields backslash + pipe, which would not match the delimiter token.
	# NOTE(review): assumes vocab.json uses "|" as its word-boundary symbol, as is
	# conventional for Wav2Vec2 CTC vocabularies -- confirm against the file.
	tokenizer = Wav2Vec2CTCTokenizer(
	    "./vocab.json",
	    unk_token="[UNK]",
	    pad_token="[PAD]",
	    word_delimiter_token="|",
	)
	# The processor is loaded from the pretrained checkpoint with the tokenizer
	# above passed in explicitly.
	processor = Wav2Vec2Processor.from_pretrained(
	    'boumehdi/wav2vec2-large-xlsr-moroccan-darija',
	    tokenizer=tokenizer,
	)
	model = Wav2Vec2ForCTC.from_pretrained('boumehdi/wav2vec2-large-xlsr-moroccan-darija')

	# Define the function for transcribing audio
	def transcribe(audio, mic_audio=None):
	    """Transcribe an audio clip to text with the Wav2Vec2 CTC model.

	    Args:
	        audio: Value of the first audio input. Either a
	            ``(sample_rate, samples)`` tuple (the form a
	            ``gr.Audio(type="numpy")`` component delivers) or a bare
	            waveform array, which is assumed to already be at 16 kHz.
	        mic_audio: Value of the second audio input, in the same format.
	            Used only when ``audio`` is ``None``.

	    Returns:
	        The decoded transcription, or an empty string when no audio
	        (or an empty clip) was supplied.
	    """
	    import numpy as np  # local import: numpy is not imported at file level

	    target_sr = 16000  # sample rate expected by the model

	    # The interface declares two audio inputs, so two values arrive here;
	    # use whichever one was actually provided.
	    clip = audio if audio is not None else mic_audio
	    if clip is None:
	        return ""

	    if isinstance(clip, tuple):
	        sample_rate, samples = clip
	    else:
	        # Bare waveform: keep the original contract and assume 16 kHz.
	        sample_rate, samples = target_sr, clip

	    samples = np.asarray(samples)
	    if samples.size == 0:
	        return ""

	    # Integer PCM (e.g. int16) -> float32 scaled to roughly [-1, 1].
	    if np.issubdtype(samples.dtype, np.integer):
	        samples = samples.astype(np.float32) / np.iinfo(samples.dtype).max
	    else:
	        samples = samples.astype(np.float32)

	    # Down-mix multi-channel audio to mono.
	    # NOTE(review): assumes a (num_samples, num_channels) layout -- confirm.
	    if samples.ndim > 1:
	        samples = samples.mean(axis=1)

	    # Resample so the waveform matches the rate the model expects.
	    if sample_rate != target_sr:
	        samples = librosa.resample(samples, orig_sr=sample_rate, target_sr=target_sr)

	    # Turn the waveform into model input values.
	    input_values = processor(
	        samples,
	        sampling_rate=target_sr,
	        return_tensors="pt",
	        padding=True,
	    ).input_values

	    # Inference only: no gradients needed.
	    with torch.no_grad():
	        logits = model(input_values).logits

	    # Greedy CTC decoding: most likely token per frame, then decode to text.
	    tokens = torch.argmax(logits, dim=-1)
	    transcription = tokenizer.batch_decode(tokens)

	    return transcription[0]

	# Build the Gradio UI: two audio inputs (file upload and microphone
	# recording) feed transcribe(), and the result is shown as plain text.
	upload_input = gr.Audio(source="upload", type="numpy")
	microphone_input = gr.Audio(source="microphone", type="numpy")

	interface = gr.Interface(
	    fn=transcribe,
	    inputs=[upload_input, microphone_input],
	    outputs="text",
	    title="Moroccan Darija Speech-to-Text",
	    description="Upload an audio file or record audio directly from your microphone to transcribe it into Moroccan Darija.",
	)

	# Start serving the app.
	interface.launch()