Update app.py
app.py CHANGED
@@ -13,9 +13,6 @@ from nemo.collections.asr.models import ASRModel
 from nemo.collections.asr.parts.utils.streaming_utils import FrameBatchMultiTaskAED
 from nemo.collections.asr.parts.utils.transcribe_utils import get_buffered_pred_feat_multitaskAED
 
-import tracemalloc as tm
-
-tm.start()
 
 torch.random.manual_seed(0)
 proc_model_name = "microsoft/Phi-3-mini-4k-instruct"
@@ -29,9 +26,6 @@ proc_model = AutoModelForCausalLM.from_pretrained(
 proc_model.to("cpu")
 proc_tokenizer = AutoTokenizer.from_pretrained(proc_model_name)
 
-print(tm.get_traced_memory())
-tm.stop()
-
 
 SAMPLE_RATE = 16000 # Hz
 MAX_AUDIO_MINUTES = 10 # wont try to transcribe if longer than this
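For context, the hunk header above truncates the AutoModelForCausalLM.from_pretrained(...) call that loads Phi-3-mini for CPU inference. A minimal sketch of what that call plausibly looks like; the argument list is an assumption, as the real arguments fall outside the diff:

# Hypothetical reconstruction of the call the hunk header truncates.
proc_model = AutoModelForCausalLM.from_pretrained(
    proc_model_name,
    torch_dtype=torch.float32,  # full precision, since inference runs on CPU
)
proc_model.to("cpu")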
@@ -45,16 +39,12 @@ decoding_cfg = model.cfg.decoding
 decoding_cfg.beam.beam_size = 1
 model.change_decoding_strategy(decoding_cfg)
 
-print(tm.get_traced_memory())
-
 
 
 vits_model = VitsModel.from_pretrained("facebook/mms-tts-eng")
 vits_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")
 set_seed(555)
 
-print(tm.get_traced_memory())
-tm.stop()
 
 def text_to_speech(text_response):
     inputs = vits_tokenizer(text=text_response, return_tensors="pt")
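The tracemalloc instrumentation deleted across the hunks above was also misordered: the old file stopped tracing early (tm.stop(), old line 33) and then still read tm.get_traced_memory() at old lines 48 and 56, after the collected traces had been cleared, so those later prints would report an empty trace. A minimal sketch of the usual pattern, for reference:

import tracemalloc

tracemalloc.start()

models = [bytes(10_000_000) for _ in range(3)]  # stand-in for model loading

current, peak = tracemalloc.get_traced_memory()  # valid only while tracing
print(f"current={current / 1e6:.1f} MB, peak={peak / 1e6:.1f} MB")

tracemalloc.stop()  # stop once, after the final read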
@@ -69,7 +59,6 @@ def text_to_speech(text_response):
 def convert_audio(audio_filepath, tmpdir, utt_id):
 
     data, sr = librosa.load(audio_filepath, sr=None, mono=True)
-
     duration = librosa.get_duration(y=data, sr=sr)
 
     if sr != SAMPLE_RATE:
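The body of the sr != SAMPLE_RATE branch above falls outside the hunk. A plausible one-liner it contains, shown only as an assumption to make the flow readable:

if sr != SAMPLE_RATE:
    # resample the loaded audio to the 16 kHz rate the ASR model expects
    data = librosa.resample(data, orig_sr=sr, target_sr=SAMPLE_RATE)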
@@ -79,7 +68,6 @@ def convert_audio(audio_filepath, tmpdir, utt_id):
 
     # save output audio
     sf.write(out_filename, data, SAMPLE_RATE)
-
     return out_filename, duration
 
 def transcribe(audio_filepath):
@@ -125,8 +113,6 @@ def generate_response(user_input):
         add_generation_prompt=True,
         return_tensors="pt",
     )
-
-
 
     with torch.no_grad():
         outputs = proc_model.generate(
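The generate(...) argument list is also truncated by the hunk. A sketch of a typical call for this prompt-tensor setup; the parameter values and the decode step are assumptions, not the committed code:

with torch.no_grad():
    outputs = proc_model.generate(
        inputs,              # token ids from apply_chat_template(...)
        max_new_tokens=256,  # assumed cap on the reply length
    )

# Decode only the newly generated tokens, skipping the prompt.
response = proc_tokenizer.batch_decode(
    outputs[:, inputs.shape[-1]:], skip_special_tokens=True
)[0]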
@@ -142,19 +128,23 @@ def generate_response(user_input):
 
     return response
 
-def
-    user_input = transcribe(
+def CanaryPhiVits(user_voice):
+    user_input = transcribe(user_voice)
     print(user_input)
     response = generate_response(user_input)
     print(response)
     chatty_response = text_to_speech(response)
+
+    if chatty_response.startswith(user_input):
+        chatty_response = chatty_response.replace(user_input, '', 1)
+
     return chatty_response
 
 
 # Create a Gradio interface
 iface = gr.Interface(
-    fn=
-    inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
+    fn=CanaryPhiVits,
+    inputs=gr.Audio(sources=["microphone", "upload"], type="filepath", format="wav",),
     #inputs=gr.Textbox(lines=5, placeholder="Enter your text here..."),
     #outputs=gr.Textbox(),
     outputs=gr.Audio("response.wav"),
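One caveat with the new guard: chatty_response is whatever text_to_speech() returns, and the outputs=gr.Audio("response.wav") wiring suggests that is an audio filepath, in which case startswith(user_input) compares a path against transcript text and never fires. If the intent was to strip a prompt echo, a sketch that applies the check to the text reply before synthesis; this is a hypothetical reordering, not the committed code:

def CanaryPhiVits(user_voice):
    user_input = transcribe(user_voice)
    response = generate_response(user_input)

    # Some chat templates echo the prompt back; drop it from the text
    # reply before it is spoken.
    if response.startswith(user_input):
        response = response.replace(user_input, '', 1)

    return text_to_speech(response)

The diff stops before the end of the file; serving the interface would end with iface.launch().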