Aumkeshchy2003 committed on
Commit
feda536
·
verified ·
1 Parent(s): 67b9f6b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +4 -123
app.py CHANGED
@@ -9,131 +9,12 @@ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5Hif
9
  from speechbrain.pretrained import EncoderClassifier
10
  from datasets import load_dataset
11
 
12
# Select the torch device string: "cuda" when PyTorch reports an available
# CUDA device, "cpu" otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
 
13
 
14
def load_models_and_data():
    """Load every model the app needs plus one reference dataset row.

    Returns:
        A 5-tuple ``(model, processor, vocoder, speaker_model, example)``:
        the SpeechT5 text-to-speech model and the HiFi-GAN vocoder (both moved
        to ``device``), the SpeechT5 processor, the speaker encoder loaded
        from ``speechbrain/spkrec-xvect-voxceleb``, and row 304 of the
        ``train`` split of ``Yassmen/TTS_English_Technical_data``.
    """
    tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")

    tts_model = SpeechT5ForTextToSpeech.from_pretrained("speecht5_finetuned_Aumkesh_tr")
    tts_model = tts_model.to(device)

    hifigan = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
    hifigan = hifigan.to(device)

    encoder_source = "speechbrain/spkrec-xvect-voxceleb"
    # The encoder's files are stored under /tmp, in a directory named after the source.
    encoder_cache_dir = os.path.join("/tmp", encoder_source)
    speaker_encoder = EncoderClassifier.from_hparams(
        source=encoder_source,
        run_opts={"device": device},
        savedir=encoder_cache_dir,
    )

    # One fixed training row supplies the audio for the default speaker embedding.
    train_split = load_dataset("Yassmen/TTS_English_Technical_data", split="train")
    reference_row = train_split[304]

    return tts_model, tts_processor, hifigan, speaker_encoder, reference_row


# Load everything once at import time; the names below are module-level globals.
model, processor, vocoder, speaker_model, default_example = load_models_and_data()
34
-
35
def create_speaker_embedding(waveform):
    """Encode a waveform into a normalized speaker embedding.

    Args:
        waveform: Audio samples accepted by ``torch.tensor``
            (presumably a 1-D array of floats -- TODO confirm with callers).

    Returns:
        The output of ``speaker_model.encode_batch`` for a batch of one,
        normalized along dimension 2 and with size-1 dimensions squeezed out.
    """
    with torch.no_grad():
        # Add a leading batch dimension and move the samples to the model's device.
        batch = torch.tensor(waveform).unsqueeze(0).to(device)
        raw_embedding = speaker_model.encode_batch(batch)
        unit_embedding = torch.nn.functional.normalize(raw_embedding, dim=2)
        flattened = unit_embedding.squeeze()
    return flattened
41
-
42
def prepare_default_embedding(example):
    """Build a speaker embedding from a dataset row.

    Args:
        example: Mapping with an ``"audio"`` entry that itself holds the
            samples under ``"array"``.

    Returns:
        Whatever ``create_speaker_embedding`` returns for those samples.
    """
    samples = example["audio"]["array"]
    return create_speaker_embedding(samples)


# Speaker embedding computed once from the reference row loaded at startup.
default_embedding = prepare_default_embedding(default_example)
47
-
48
# (symbol, spoken form) pairs, in the order they are applied.
replacements = list(
    {
        "@": "at the rate",
        "$": "dollar",
    }.items()
)
52
-
53
# Spoken forms for the numbers that cannot be composed from smaller ones.
# The 100 and 1000 entries are kept for backward compatibility with any
# external lookups; number_to_words itself uses _NUMBER_SCALES below.
number_words = {
    0: "zero", 1: "one", 2: "two", 3: "three", 4: "four",
    5: "five", 6: "six", 7: "seven", 8: "eight", 9: "nine",
    10: "ten", 11: "eleven", 12: "twelve", 13: "thirteen", 14: "fourteen",
    15: "fifteen", 16: "sixteen", 17: "seventeen", 18: "eighteen", 19: "nineteen",
    20: "twenty", 30: "thirty", 40: "forty", 50: "fifty",
    60: "sixty", 70: "seventy", 80: "eighty", 90: "ninety",
    100: "hundred", 1000: "thousand",
}

# Scale words, largest first, so the first scale not exceeding the number wins.
_NUMBER_SCALES = (
    (10 ** 9, "billion"),
    (10 ** 6, "million"),
    (10 ** 3, "thousand"),
    (10 ** 2, "hundred"),
)


def number_to_words(number):
    """Spell out an integer in English words.

    Args:
        number: The integer to convert.

    Returns:
        The words separated by single spaces, e.g. ``"one hundred five"``.
        Negative values are prefixed with ``"minus "``. Values whose
        magnitude is 10**12 or more are returned as their decimal digits,
        because no larger scale word is defined here.
    """
    if number < 0:
        return "minus " + number_to_words(-number)
    if number >= 10 ** 12:
        return str(number)

    for scale, scale_name in _NUMBER_SCALES:
        if number >= scale:
            count, remainder = divmod(number, scale)
            # Always speak the count, so 100 is "one hundred" rather than
            # a bare " hundred".
            words = number_to_words(count) + " " + scale_name
            if remainder:
                words += " " + number_to_words(remainder)
            return words

    if number >= 20:
        tens, unit = divmod(number, 10)
        words = number_words[tens * 10]
        if unit:
            words += " " + number_words[unit]
        return words

    return number_words[number]
80
-
81
def replace_numbers_with_words(text):
    """Replace each standalone run of digits in ``text`` with English words.

    A run counts only when it is bounded by word boundaries on both sides, so
    digits glued to letters (``"abc123"``) are left alone. Each match is
    parsed with ``int`` and passed to ``number_to_words``.

    Args:
        text: The string to scan.

    Returns:
        ``text`` with every matching digit run substituted.
    """
    # This function used to be defined twice with identical bodies; the
    # second definition simply shadowed the first, so only one is kept.
    def _spell_out(match):
        return number_to_words(int(match.group()))

    return re.sub(r'\b\d+\b', _spell_out, text)
101
-
102
def normalize_text(text):
    """Normalize raw input text before it is handed to the processor.

    Steps, in order: lowercase, spell out standalone digit runs, expand the
    symbols listed in ``replacements``, drop every character that is neither
    a word character nor whitespace, then collapse runs of whitespace.

    Args:
        text: The raw input string.

    Returns:
        The normalized string with single spaces between words and no
        leading or trailing whitespace.
    """
    text = text.lower()

    text = replace_numbers_with_words(text)

    # Pad each spoken form with spaces so it does not fuse with neighbouring
    # characters: "$5" must become "dollar five", not "dollarfive".
    for old, new in replacements:
        text = text.replace(old, " " + new + " ")

    # Remove punctuation (anything that is not a word character or whitespace).
    text = re.sub(r'[^\w\s]', '', text)

    # Collapse the extra spaces introduced by the padding above.
    return re.sub(r'\s+', ' ', text).strip()
117
-
118
@spaces.GPU(duration=60)
def text_to_speech(text, audio_file=None):
    """Synthesize speech for ``text`` using the default speaker embedding.

    Args:
        text: The text to speak; it is passed through ``normalize_text`` first.
        audio_file: Accepted but not used by this function.

    Returns:
        A tuple ``(16000, samples)`` where ``samples`` is the generated speech
        as a NumPy array (16000 is presumably the sample rate in Hz --
        TODO confirm against the vocoder configuration).
    """
    cleaned = normalize_text(text)

    # Tokenize and move the resulting tensors to the same device as the model.
    encoded = processor(text=cleaned, return_tensors="pt").to(device)
    token_ids = encoded["input_ids"]

    # Only the precomputed default embedding is used as the voice.
    voice = default_embedding

    with torch.no_grad():
        waveform = model.generate_speech(token_ids, voice.unsqueeze(0), vocoder=vocoder)

    return (16000, waveform.cpu().numpy())
136
-
137
  iface = gr.Interface(
138
  fn=text_to_speech,
139
  inputs=[
 
9
  from speechbrain.pretrained import EncoderClassifier
10
  from datasets import load_dataset
11
 
12
# Load the processor and the model from the same checkpoint identifier.
from transformers import AutoModelForTextToSpectrogram, AutoProcessor

# The processor is loaded first, then the model, matching the tuple order below.
processor, model = (
    loader.from_pretrained("Aumkeshchy2003/speecht5_finetuned_Aumkesh_tr")
    for loader in (AutoProcessor, AutoModelForTextToSpectrogram)
)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  iface = gr.Interface(
19
  fn=text_to_speech,
20
  inputs=[