Spaces:

aarishshahmohsin
/

tts_iit_roorkee

Configuration error

App Files Files Community

tts_iit_roorkee / app.py

aarishshahmohsin

added everything now

716edce 11 months ago

raw

history blame contribute delete

2.85 kB

	import gradio as gr
	import librosa
	import numpy as np
	import torch
	from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
	from datasets import load_dataset

	# Model configurations.
	# Each entry maps a display name to the three identifiers that are later
	# handed to ``from_pretrained``: the text-to-speech checkpoint, the vocoder
	# and the processor.
	models = {
	    "Urdu Model": dict(
	        checkpoint="aarishshahmohsin/final_urdu_t5_finetuned",
	        vocoder="microsoft/speecht5_hifigan",
	        processor="aarishshahmohsin/urdu_processor_t5",
	    ),
	    "Technical Model": dict(
	        checkpoint="aarishshahmohsin/final_technical_terms_t5_finetuned",
	        vocoder="microsoft/speecht5_hifigan",
	        # This entry uses the "microsoft/speecht5_tts" processor rather than
	        # one published next to its own checkpoint.
	        processor="microsoft/speecht5_tts",
	    ),
	}

	# Speaker identity used for every request: the "xvector" field of row 7306
	# of the validation split, wrapped in a tensor with a leading dimension of
	# size 1.
	embeddings_dataset = load_dataset(
	    "Matthijs/cmu-arctic-xvectors",
	    split="validation",
	)
	speaker_embedding = torch.unsqueeze(
	    torch.tensor(embeddings_dataset[7306]["xvector"]),
	    0,
	)


	# Initialize all models at startup.
	# Several configurations may point at the same checkpoint (both entries in
	# ``models`` use the same vocoder id), so every (class, checkpoint) pair is
	# loaded once and the resulting object is shared between configurations.
	_pretrained_cache = {}


	def _load_pretrained(component_cls, checkpoint):
	    """Return ``component_cls.from_pretrained(checkpoint)``, loading it at most once."""
	    cache_key = (component_cls, checkpoint)
	    if cache_key not in _pretrained_cache:
	        _pretrained_cache[cache_key] = component_cls.from_pretrained(checkpoint)
	    return _pretrained_cache[cache_key]


	print("Loading models...")
	# Maps each display name in ``models`` to its ready-to-use components.
	loaded_models = {}
	for model_name, config in models.items():
	    loaded_models[model_name] = {
	        "processor": _load_pretrained(SpeechT5Processor, config["processor"]),
	        "model": _load_pretrained(SpeechT5ForTextToSpeech, config["checkpoint"]),
	        "vocoder": _load_pretrained(SpeechT5HifiGan, config["vocoder"]),
	    }
	print("Models loaded successfully!")

	def predict(text, model_name):
	    """Synthesize speech for ``text`` with the selected model.

	    Args:
	        text: Text to speak. ``None``, empty or whitespace-only input
	            produces an empty audio clip instead of running the model.
	        model_name: Key into ``loaded_models`` selecting the processor,
	            text-to-speech model and vocoder to use.

	    Returns:
	        A ``(sample_rate, samples)`` tuple where ``sample_rate`` is 16000
	        and ``samples`` is an ``int16`` NumPy array.

	    Raises:
	        KeyError: If ``model_name`` is not a key of ``loaded_models``.
	    """
	    sample_rate = 16000

	    # Guard against a missing value as well as blank text.
	    if text is None or not text.strip():
	        return (sample_rate, np.zeros(0).astype(np.int16))

	    components = loaded_models[model_name]
	    processor = components["processor"]
	    model = components["model"]
	    vocoder = components["vocoder"]

	    inputs = processor(text=text, return_tensors="pt")
	    # Keep the token sequence within the maximum text length the model is
	    # configured for, so that very long inputs are truncated instead of
	    # overrunning it.
	    input_ids = inputs["input_ids"][..., : model.config.max_text_positions]

	    speech = model.generate_speech(input_ids, speaker_embedding, vocoder=vocoder)

	    # Clip to [-1, 1] before scaling: without this, samples outside that
	    # range wrap around when cast to int16 and produce loud artifacts.
	    waveform = np.clip(speech.numpy(), -1.0, 1.0)
	    pcm = (waveform * 32767).astype(np.int16)

	    return (sample_rate, pcm)

	# UI Configuration: static text and sample inputs shown by the interface.
	title = "Multi-Model SpeechT5 Demo"

	description = """
	Select a model and enter text to generate speech.

	1. Regional Language(Urdu)
	2. Technical Speech

	"""

	# Each example row is [input text, model name], matching the order of the
	# arguments of ``predict``.
	examples = [
	    # Urdu Model Examples
	    *(
	        [sentence, "Urdu Model"]
	        for sentence in (
	            "میں نے آج بہت کام کیا۔",
	            "آپ کا دن کیسا گزرا؟",
	        )
	    ),
	    # Technical Model Examples
	    *(
	        [sentence, "Technical Model"]
	        for sentence in (
	            "JSON response with HTTP status code 200.",
	            "Nginx is the best",
	        )
	    ),
	]

	# Create and launch the interface.
	# Components are built first, in the same order as the arguments of
	# ``predict`` (text, then model name), followed by the audio output.
	text_input = gr.Text(label="Input Text")
	model_selector = gr.Dropdown(
	    choices=list(models.keys()),
	    label="Select Model",
	    value="Technical Model",
	)
	speech_output = gr.Audio(label="Generated Speech", type="numpy")

	demo = gr.Interface(
	    fn=predict,
	    inputs=[text_input, model_selector],
	    outputs=[speech_output],
	    title=title,
	    description=description,
	    examples=examples,  # Add examples to the interface
	    cache_examples=True,
	)
	demo.launch()