# Gradio voice-chat demo: NVIDIA Canary ASR -> Phi-3-mini response -> MMS-TTS speech.
# imports
import gradio as gr
import json
import librosa
import os
import soundfile as sf
import tempfile
import uuid
import torch
from transformers import AutoTokenizer, VitsModel, set_seed, AutoModelForCausalLM, AutoTokenizer, pipeline
from nemo.collections.asr.models import ASRModel
from nemo.collections.asr.parts.utils.streaming_utils import FrameBatchMultiTaskAED
from nemo.collections.asr.parts.utils.transcribe_utils import get_buffered_pred_feat_multitaskAED
import time
# Fix torch RNG for reproducible generation/TTS across runs.
torch.random.manual_seed(0)

# --- LLM used to generate the chat response ---
proc_model_name = "microsoft/Phi-3-mini-4k-instruct"
proc_model = AutoModelForCausalLM.from_pretrained(
    proc_model_name,
    torch_dtype=torch.float16,
    trust_remote_code=True,
    attn_implementation='eager',
    # Pinned revision so remote code / weights cannot change underneath us.
    revision='300945e90b6f55d3cb88261c8e5333fae696f672',
)
proc_model.to("cpu")  # CPU inference; no GPU assumed in this Space
proc_tokenizer = AutoTokenizer.from_pretrained(proc_model_name)

SAMPLE_RATE = 16000  # Hz — Canary expects 16 kHz mono input
MAX_AUDIO_MINUTES = 10  # wont try to transcribe if longer than this

# --- ASR model (speech -> text) ---
model = ASRModel.from_pretrained("nvidia/canary-1b")
model.eval()
# make sure beam size always 1 for consistency
model.change_decoding_strategy(None)
decoding_cfg = model.cfg.decoding
decoding_cfg.beam.beam_size = 1
model.change_decoding_strategy(decoding_cfg)

# --- TTS model (text -> speech) ---
vits_model_name = "facebook/mms-tts-eng"
vits_model = VitsModel.from_pretrained(vits_model_name)
vits_tokenizer = AutoTokenizer.from_pretrained(vits_model_name)
# VITS sampling is stochastic; fix the seed for deterministic audio output.
set_seed(555)
def text_to_speech(text_response):
    """Synthesize speech for *text_response* with MMS-TTS and return the wav path.

    Fix: the original always wrote to a fixed ``output.wav`` in the current
    directory, so concurrent Gradio requests clobbered each other's audio.
    Each call now writes to a unique per-request filename instead.

    Args:
        text_response: Text to speak.

    Returns:
        Path to the written mono wav file (at ``vits_model.config.sampling_rate``).
    """
    inputs = vits_tokenizer(text=text_response, return_tensors="pt")
    with torch.no_grad():
        outputs = vits_model(**inputs)
    waveform = outputs.waveform[0]
    # Unique name per call so simultaneous users cannot overwrite each other.
    out_path = f"{uuid.uuid4()}.wav"
    sf.write(out_path, waveform.numpy(), vits_model.config.sampling_rate)
    return out_path
def convert_audio(audio_filepath, tmpdir, utt_id):
    """Convert input audio to a 16 kHz mono wav the ASR model can consume.

    Fix: ``MAX_AUDIO_MINUTES`` is declared at module level ("wont try to
    transcribe if longer than this") but was never enforced; the guard now
    lives here, where the duration is first known.

    Args:
        audio_filepath: Path to the user-supplied audio file.
        tmpdir: Directory in which to write the converted wav.
        utt_id: Basename (without extension) for the output file.

    Returns:
        Tuple of (path to converted wav, duration in seconds).

    Raises:
        gr.Error: If the audio is longer than ``MAX_AUDIO_MINUTES`` minutes.
    """
    data, sr = librosa.load(audio_filepath, sr=None, mono=True)
    duration = librosa.get_duration(y=data, sr=sr)

    # Refuse over-long clips up front rather than tying up the ASR model.
    if duration / 60.0 > MAX_AUDIO_MINUTES:
        raise gr.Error(
            f"This demo can transcribe up to {MAX_AUDIO_MINUTES} minutes of audio. "
            "Please provide a shorter audio file."
        )

    if sr != SAMPLE_RATE:
        data = librosa.resample(data, orig_sr=sr, target_sr=SAMPLE_RATE)

    out_filename = os.path.join(tmpdir, utt_id + '.wav')
    # save output audio
    sf.write(out_filename, data, SAMPLE_RATE)
    return out_filename, duration
def transcribe(audio_filepath):
    """Run Canary ASR on *audio_filepath* and return the transcript text.

    Writes a one-line NeMo manifest into a temporary directory, points
    ``model.transcribe`` at it, and returns the first (only) result.

    Raises:
        gr.Error: If no audio was provided.
    """
    print(audio_filepath)
    # Give a just-recorded microphone clip a moment to finish uploading.
    time.sleep(2)

    if audio_filepath is None:
        raise gr.Error("Please provide some input audio: either upload an audio file or use the microphone. \nIf the microphone already has audio, please wait a few moments for it to upload properly")

    utterance_id = str(uuid.uuid4())
    with tempfile.TemporaryDirectory() as workdir:
        wav_path, clip_duration = convert_audio(audio_filepath, workdir, utterance_id)

        # One-record manifest describing the ASR task for this utterance.
        record = {
            "audio_filepath": wav_path,
            "source_lang": "en",
            "target_lang": "en",
            "taskname": "asr",
            "pnc": "yes",
            "answer": "predict",
            "duration": str(clip_duration),
        }
        manifest_path = os.path.join(workdir, f'{utterance_id}.json')
        with open(manifest_path, 'w') as manifest_file:
            manifest_file.write(json.dumps(record) + '\n')

        transcript = model.transcribe(manifest_path)[0]

    return transcript
# System prompt prepended to every conversation sent to the Phi-3 model.
start = {"role": "system", "content": "You are a helpful digital assistant. Please provide safe, ethical and accurate information to the user."}
def generate_response(user_input):
    """Generate a Phi-3 chat reply to *user_input* and return the decoded text.

    The system prompt ``start`` is prepended; up to 100 new tokens are
    sampled. NOTE(review): the decoded string includes the rendered prompt,
    which the caller strips.
    """
    conversation = [start, {"role": "user", "content": user_input}]

    prompt_ids = proc_tokenizer.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        return_tensors="pt",
    )

    with torch.no_grad():
        generated_ids = proc_model.generate(prompt_ids, max_new_tokens=100)

    decoded = proc_tokenizer.batch_decode(
        generated_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    return decoded[0]
def CanaryPhiVits(user_voice):
    """Full pipeline: user audio -> transcript -> LLM reply -> spoken reply wav.

    Args:
        user_voice: Filepath of the recorded/uploaded input audio (from Gradio).

    Returns:
        Filepath of the synthesized response audio.
    """
    user_input = transcribe(user_voice)
    print("user_input:")
    print(user_input)

    response = generate_response(user_input)
    # The decoded generation echoes the prompt; slice the user's text off the front.
    if response.startswith(user_input):
        response = response[len(user_input):]
    print("chatty_response:")
    print(response)

    return text_to_speech(response)
# Create a Gradio interface
# Single audio-in / audio-out component pair wired to the full pipeline.
iface = gr.Interface(
    fn=CanaryPhiVits,
    title="Chatty Ashe",
    #theme="gstaff/xkcd",
    inputs=gr.Audio(
        sources=["microphone", "upload"],
        label="Input Audio",
        type="filepath",  # pipeline expects a path on disk, not raw samples
        format="wav",
    ),
    outputs=gr.Audio(
        label="Output Audio"
    ),
)
# Launch the interface
iface.queue()  # queue requests so long-running inference calls don't time out
iface.launch()