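"""Chatty Ashe: a speech-to-speech assistant demo.

Pipeline (as wired below): NVIDIA Canary-1b (NeMo) transcribes the user's
audio, microsoft/Phi-3-mini-4k-instruct generates a text reply, and
facebook/mms-tts-eng (VITS) synthesizes that reply back to speech, all
served through a Gradio interface.
"""
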
# imports
import json
import os
import tempfile
import time
import uuid

import gradio as gr
import librosa
import soundfile as sf
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, VitsModel, set_seed

from nemo.collections.asr.models import ASRModel

torch.random.manual_seed(0)

# chat LLM (pinned to a specific revision for reproducibility)
proc_model_name = "microsoft/Phi-3-mini-4k-instruct"
proc_model = AutoModelForCausalLM.from_pretrained(
    proc_model_name,
    torch_dtype=torch.float16,
    trust_remote_code=True,
    attn_implementation='eager',
    revision='300945e90b6f55d3cb88261c8e5333fae696f672',
)

# NOTE: float16 inference on CPU works but is slow; torch.float32 may be
# preferable on CPU-only hosts.
proc_model.to("cpu")
proc_tokenizer = AutoTokenizer.from_pretrained(proc_model_name)


SAMPLE_RATE = 16000  # Hz; Canary expects 16 kHz mono audio
MAX_AUDIO_MINUTES = 10  # won't try to transcribe audio longer than this

# speech-to-text model
model = ASRModel.from_pretrained("nvidia/canary-1b")
model.eval()

# make sure beam size always 1 for consistency
model.change_decoding_strategy(None)
decoding_cfg = model.cfg.decoding
decoding_cfg.beam.beam_size = 1
model.change_decoding_strategy(decoding_cfg)


# text-to-speech model
vits_model_name = "facebook/mms-tts-eng"
vits_model = VitsModel.from_pretrained(vits_model_name)
vits_tokenizer = AutoTokenizer.from_pretrained(vits_model_name)
set_seed(555)


def text_to_speech(text_response):
    inputs = vits_tokenizer(text=text_response, return_tensors="pt")
    with torch.no_grad():
        outputs = vits_model(**inputs)
    waveform = outputs.waveform[0]
    # write to a unique temp file so concurrent requests don't overwrite
    # each other's output
    out_path = os.path.join(tempfile.gettempdir(), f'{uuid.uuid4()}.wav')
    sf.write(out_path, waveform.numpy(), vits_model.config.sampling_rate)

    return out_path


def convert_audio(audio_filepath, tmpdir, utt_id):
    """Resample input audio to 16 kHz mono WAV; return (path, duration)."""
    data, sr = librosa.load(audio_filepath, sr=None, mono=True)
    duration = librosa.get_duration(y=data, sr=sr)

    if sr != SAMPLE_RATE:
        data = librosa.resample(data, orig_sr=sr, target_sr=SAMPLE_RATE)

    out_filename = os.path.join(tmpdir, utt_id + '.wav')

    # save output audio
    sf.write(out_filename, data, SAMPLE_RATE)
    return out_filename, duration

def transcribe(audio_filepath):

    print(audio_filepath)
    # give the browser a moment to finish uploading microphone audio
    time.sleep(2)
    if audio_filepath is None:
        raise gr.Error(
            "Please provide some input audio: either upload an audio file or use the microphone. "
            "\nIf the microphone already has audio, please wait a few moments for it to upload properly"
        )

    utt_id = uuid.uuid4()
    with tempfile.TemporaryDirectory() as tmpdir:
        converted_audio_filepath, duration = convert_audio(audio_filepath, tmpdir, str(utt_id))

        # enforce the duration limit declared above
        if duration / 60.0 > MAX_AUDIO_MINUTES:
            raise gr.Error(f"This demo can transcribe up to {MAX_AUDIO_MINUTES} minutes of audio.")

        # make manifest file and save
        manifest_data = {
            "audio_filepath": converted_audio_filepath,
            "source_lang": "en",
            "target_lang": "en",
            "taskname": "asr",
            "pnc": "yes",
            "answer": "predict",
            "duration": str(duration),
        }

        manifest_filepath = os.path.join(tmpdir, f'{utt_id}.json')

        with open(manifest_filepath, 'w') as fout:
            line = json.dumps(manifest_data)
            fout.write(line + '\n')

        output_text = model.transcribe(manifest_filepath)[0]

    return output_text
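
# For reference, each manifest line written above is one JSON object, e.g.
# (illustrative values):
# {"audio_filepath": "/tmp/tmpXXXX/<uuid>.wav", "source_lang": "en",
#  "target_lang": "en", "taskname": "asr", "pnc": "yes",
#  "answer": "predict", "duration": "3.2"}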



# system prompt prepended to every user request
start = {"role": "system", "content": "You are a helpful digital assistant. Please provide safe, ethical and accurate information to the user."}

def generate_response(user_input):
    messages = [start, {"role": "user", "content": user_input}]
    inputs = proc_tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
    )

    with torch.no_grad():
        outputs = proc_model.generate(
            inputs,
            max_new_tokens=100,
        )

    # decode only the newly generated tokens, so the prompt (system and
    # user turns) is not echoed back in the response
    response = proc_tokenizer.decode(
        outputs[0][inputs.shape[-1]:],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )

    return response
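
# Note: generate_response is single-turn; a multi-turn chat would keep a
# history list and append each (user, assistant) exchange to `messages`
# before re-applying the chat template.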
    
def CanaryPhiVits(user_voice):
    user_input = transcribe(user_voice)
    print("user_input:")
    print(user_input)
    response = generate_response(user_input)
    # defensive: strip the user turn if the model echoed it back
    if response.startswith(user_input):
        response = response.replace(user_input, '', 1)
    print("chatty_response:")
    print(response)
    chatty_response = text_to_speech(response)

    return chatty_response


# Create a Gradio interface
iface = gr.Interface(
    fn=CanaryPhiVits,
    title="Chatty Ashe",
    #theme="gstaff/xkcd",
    inputs=gr.Audio(
        sources=["microphone", "upload"],
        label="Input Audio",
        type="filepath",
        format="wav",
    ),
    outputs=gr.Audio(
        label="Output Audio",
    ),
)

# Launch the interface
iface.queue()
iface.launch()
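
# When running on a remote host where the local URL isn't reachable, Gradio
# can instead expose a temporary public link, e.g.:
#   iface.launch(share=True)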