import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, VitsModel
from nemo.collections.asr.models import ASRModel
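
# Pipeline overview: audio is transcribed with NVIDIA Canary-1B (ASR),
# the transcription is processed by Phi-3-mini-128k-instruct, and the
# resulting text is synthesized back to speech with MMS-TTS (a VITS model).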


# Load the speech-to-text model (NVIDIA Canary-1B)
canary_model = ASRModel.from_pretrained('nvidia/canary-1b')
canary_model.eval()

# Update decoding parameters (a beam size of 1 corresponds to greedy decoding)
canary_model.change_decoding_strategy(None)
decode_cfg = canary_model.cfg.decoding
decode_cfg.beam.beam_size = 1
canary_model.change_decoding_strategy(decode_cfg)


# Load the text processing model and tokenizer
proc_tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct")
proc_model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-128k-instruct",
    device_map="cuda",
    torch_dtype="auto",
    trust_remote_code=True,
)

# Load the TTS model and tokenizer (Facebook MMS-TTS English, a VITS model)
tts_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")
tts_model = VitsModel.from_pretrained("facebook/mms-tts-eng")
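
# VITS generates a raw waveform directly; the output sampling rate is
# available via tts_model.config.sampling_rate.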


def process_speech(speech):
    # Convert the speech to text (transcribe expects a list of audio file paths)
    transcription = canary_model.transcribe([speech])[0]

    # Process the transcription with the language model
    inputs = proc_tokenizer.encode(transcription + proc_tokenizer.eos_token, return_tensors='pt').to(proc_model.device)
    outputs = proc_model.generate(inputs, max_length=100, do_sample=True, temperature=0.7, pad_token_id=proc_tokenizer.eos_token_id)
    processed_text = proc_tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Convert the processed text to speech (VITS returns the waveform directly)
    inputs = tts_tokenizer(processed_text, return_tensors="pt")
    with torch.no_grad():
        waveform = tts_model(**inputs).waveform

    # Gradio audio outputs expect a (sample_rate, numpy_array) tuple
    return tts_model.config.sampling_rate, waveform.squeeze().cpu().numpy()

# Gradio 4.x API: gr.inputs is removed; type="filepath" passes a file path to process_speech
iface = gr.Interface(fn=process_speech, inputs=gr.Audio(sources=["microphone"], type="filepath"), outputs="audio")

iface.launch()