Aumkeshchy2003 committed
Commit 2600936 · verified · 1 Parent(s): feda536

Update app.py

Files changed (1): app.py (+44 -19)
app.py CHANGED
@@ -1,29 +1,54 @@
 import gradio as gr
+from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
+from transformers import AutoProcessor, AutoModelForTextToSpectrogram
+from datasets import load_dataset
 import torch
 import soundfile as sf
-import spaces
 import os
-import numpy as np
-import re
-from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
-from speechbrain.pretrained import EncoderClassifier
-from datasets import load_dataset
 
-# Load model directly
-from transformers import AutoProcessor, AutoModelForTextToSpectrogram
+# Load models and processors
+processor = AutoProcessor.from_pretrained("speecht5_finetuned_Aumkesh_tr")
+model = AutoModelForTextToSpectrogram.from_pretrained("speecht5_finetuned_Aumkesh_tr")
+vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
 
-processor = AutoProcessor.from_pretrained("Aumkeshchy2003/speecht5_finetuned_Aumkesh_tr")
-model = AutoModelForTextToSpectrogram.from_pretrained("Aumkeshchy2003/speecht5_finetuned_Aumkesh_tr")
+# Load xvector containing speaker's voice characteristics from a dataset
+embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
 
+# Quantize the models
+def quantize_model(model):
+    quantized_model = torch.quantization.quantize_dynamic(
+        model, {torch.nn.Linear}, dtype=torch.qint8
+    )
+    return quantized_model
+
+# Only quantize the vocoder, as the main model might not be compatible
+vocoder = quantize_model(vocoder)
+
+# Move models to GPU if available
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model = model.to(device)
+vocoder = vocoder.to(device)
+speaker_embeddings = speaker_embeddings.to(device)
+
+# Use inference mode for faster computation
+@torch.inference_mode()
+def text_to_speech(text):
+    inputs = processor(text=text, return_tensors="pt").to(device)
+    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
+    speech = speech.cpu()  # Move back to CPU for saving
+    output_path = "output.wav"
+    sf.write(output_path, speech.numpy(), samplerate=16000)
+    return output_path
+
+# Create Gradio interface
 iface = gr.Interface(
     fn=text_to_speech,
-    inputs=[
-        gr.Textbox(label="Enter English text to convert to speech")
-    ],
-    outputs=[
-        gr.Audio(label="Generated Speech", type="numpy")
-    ],
-    title="English SpeechT5 Text-to-Speech Demo",
-    description="Enter English text, and listen to the generated speech."
+    inputs=gr.Textbox(label="Enter the text"),
+    outputs=gr.Audio(label="Generated Speech"),
+    title="Text-to-Speech Converter",
+    description="Convert text to speech using the SpeechT5 model."
 )
-iface.launch(share=True)
+
+# Launch the app
+iface.launch()
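
The new code applies PyTorch dynamic int8 quantization to the HiFi-GAN vocoder only. A minimal standalone sketch of that step, assuming the same microsoft/speecht5_hifigan checkpoint; the param_bytes helper is illustrative and not part of the commit:

import torch
from transformers import SpeechT5HifiGan

def param_bytes(module):
    # Rough parameter footprint in bytes (counts regular fp32 parameters only;
    # packed int8 weights created by quantization are not included).
    return sum(p.numel() * p.element_size() for p in module.parameters())

vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
print(f"fp32 parameters: {param_bytes(vocoder) / 1e6:.1f} MB")

# quantize_dynamic rewrites only nn.Linear modules to int8; HiFi-GAN is
# largely convolutional, so the saving may be modest.
quantized_vocoder = torch.quantization.quantize_dynamic(
    vocoder, {torch.nn.Linear}, dtype=torch.qint8
)
print(f"after quantize_dynamic: {param_bytes(quantized_vocoder) / 1e6:.1f} MB")

Note that PyTorch dynamic quantization targets CPU execution, so any speed-up from this step would show up on CPU hardware rather than on a GPU.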
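Once the updated app is running, the interface can also be exercised programmatically. A hypothetical smoke test using gradio_client; the local URL and the /predict endpoint name are assumptions based on Gradio defaults, not something stated in the commit:

from gradio_client import Client

# Assumes a local run of app.py is reachable at Gradio's default address.
client = Client("http://127.0.0.1:7860")

# A single-function gr.Interface exposes its endpoint as /predict by default.
result = client.predict("Hello, this is a test of the updated Space.", api_name="/predict")
print(result)  # expected: a local file path to the generated speech audio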