# imports
import gradio as gr
import json
import librosa
import os
import soundfile as sf
import tempfile
import uuid

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from nemo.collections.asr.models import ASRModel
torch.random.manual_seed(0)

# Phi-3-mini generates the assistant's reply. The standard checkpoint is used
# here, since GGUF-only repos cannot be loaded with a plain from_pretrained call.
proc_model_name = "microsoft/Phi-3-mini-4k-instruct"
proc_model = AutoModelForCausalLM.from_pretrained(proc_model_name, trust_remote_code=True)
proc_model.to("cpu")
proc_tokenizer = AutoTokenizer.from_pretrained(proc_model_name)
SAMPLE_RATE = 16000  # Hz
MAX_AUDIO_MINUTES = 10  # won't try to transcribe if longer than this

model = ASRModel.from_pretrained("nvidia/canary-1b")
model.eval()

# make sure beam size always 1 for consistency
model.change_decoding_strategy(None)
decoding_cfg = model.cfg.decoding
decoding_cfg.beam.beam_size = 1
model.change_decoding_strategy(decoding_cfg)
def convert_audio(audio_filepath, tmpdir, utt_id):
    """Convert input audio to 16 kHz mono wav; return the new path and duration."""
    data, sr = librosa.load(audio_filepath, sr=None, mono=True)
    duration = librosa.get_duration(y=data, sr=sr)

    # enforce the MAX_AUDIO_MINUTES limit declared above
    if duration / 60.0 > MAX_AUDIO_MINUTES:
        raise gr.Error(f"This demo can transcribe up to {MAX_AUDIO_MINUTES} minutes of audio")

    if sr != SAMPLE_RATE:
        data = librosa.resample(data, orig_sr=sr, target_sr=SAMPLE_RATE)

    out_filename = os.path.join(tmpdir, utt_id + '.wav')

    # save output audio
    sf.write(out_filename, data, SAMPLE_RATE)

    return out_filename, duration
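# Example usage (hypothetical paths): convert_audio("clip.mp3", "/tmp/work", "utt0")
# would return ("/tmp/work/utt0.wav", <duration in seconds>), with the wav
# resampled to 16 kHz mono as Canary expects.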
def transcribe(audio_filepath):
    if audio_filepath is None:
        raise gr.Error("Please provide some input audio: either upload an audio file or use the microphone")

    utt_id = uuid.uuid4()
    with tempfile.TemporaryDirectory() as tmpdir:
        converted_audio_filepath, duration = convert_audio(audio_filepath, tmpdir, str(utt_id))

        # make manifest file and save
        manifest_data = {
            "audio_filepath": converted_audio_filepath,
            "source_lang": "en",
            "target_lang": "en",
            "taskname": "asr",
            "pnc": "no",
            "answer": "predict",
            "duration": str(duration),
        }
        manifest_filepath = os.path.join(tmpdir, f'{utt_id}.json')
        with open(manifest_filepath, 'w') as fout:
            line = json.dumps(manifest_data)
            fout.write(line + '\n')

        output_text = model.transcribe(manifest_filepath)[0]

    return output_text
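# The manifest holds one JSON object per line; the values below are illustrative
# of what the code above writes for a short English clip:
# {"audio_filepath": "/tmp/tmpabc/<uuid>.wav", "source_lang": "en", "target_lang": "en",
#  "taskname": "asr", "pnc": "no", "answer": "predict", "duration": "3.2"}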
start = {"role": "system", "content": "You are a helpful digital assistant. Please provide safe, ethical and accurate information to the user."}
def generate_response(user_input):
    messages = [start, {"role": "user", "content": user_input}]
    inputs = proc_tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")

    with torch.no_grad():
        outputs = proc_model.generate(
            inputs,
            max_new_tokens=48,
        )

    # decode only the newly generated tokens; otherwise the echoed prompt
    # would be returned along with the reply
    response = proc_tokenizer.batch_decode(
        outputs[:, inputs.shape[1]:],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]
    return response
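# Example (the reply text is illustrative, not guaranteed verbatim):
# generate_response("What is the capital of France?") -> "The capital of France is Paris."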
def CanaryPhi(audio_filepath):
    user_input = transcribe(audio_filepath)
    response = generate_response(user_input)
    return response
# Create a Gradio interface
# (both sources are enabled to match the error message in transcribe())
iface = gr.Interface(
    fn=CanaryPhi,
    inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
    outputs=gr.Textbox(),
)

# Launch the interface
iface.launch()
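# Optional local smoke test (hypothetical wav path; run in place of launch(),
# since launch() blocks):
# print(CanaryPhi("sample.wav"))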