Spaces:

coqui
/

voice-chat-with-mistral

Paused

App Files Files Community

ggoknar commited on Oct 17, 2023

Commit

a38b58d

1 Parent(s): da4b074

stream voice with combined wav at end, optional direct stream

Browse files

Files changed (1) hide show

app.py +454 -0

app.py CHANGED Viewed

@@ -5,6 +5,8 @@ import os
 # By using XTTS you agree to CPML license https://coqui.ai/cpml
 os.environ["COQUI_TOS_AGREED"] = "1"
 import gradio as gr
 import numpy as np
 import torch
@@ -32,6 +34,9 @@ from TTS.utils.generic_utils import get_user_data_dir
 # Could not make "play next audio" work seamlessly on current Gradio with autoplay, so this is a workaround
 AUDIO_WAIT_MODIFIER = float(os.environ.get("AUDIO_WAIT_MODIFIER", 1))
 # This will trigger downloading model
 print("Downloading if not downloaded Coqui XTTS V1")
 tts = TTS("tts_models/multilingual/multi-dataset/xtts_v1")
@@ -106,3 +111,452 @@ text_client = InferenceClient(
     "mistralai/Mistral-7B-Instruct-v0.1",
     timeout=WHISPER_TIMEOUT,
 )

 # By using XTTS you agree to CPML license https://coqui.ai/cpml
 os.environ["COQUI_TOS_AGREED"] = "1"
+from scipy.io.wavfile import write
+from pydub import AudioSegment
 import gradio as gr
 import numpy as np
 import torch
 # Could not make "play next audio" work seamlessly on current Gradio with autoplay, so this is a workaround
 AUDIO_WAIT_MODIFIER = float(os.environ.get("AUDIO_WAIT_MODIFIER", 1))
+# If set, try to stream audio while receiving audio chunks; beware that recreating the audio each time produces artifacts
+DIRECT_STREAM = int(os.environ.get("DIRECT_STREAM", 0))
 # This will trigger downloading model
 print("Downloading if not downloaded Coqui XTTS V1")
 tts = TTS("tts_models/multilingual/multi-dataset/xtts_v1")
     "mistralai/Mistral-7B-Instruct-v0.1",
     timeout=WHISPER_TIMEOUT,
 )
+###### COQUI TTS FUNCTIONS ######
+def get_latents(speaker_wav):
+    # create as function as we can populate here with voice cleanup/filtering
+    (
+        gpt_cond_latent,
+        diffusion_conditioning,
+        speaker_embedding,
+    ) = model.get_conditioning_latents(audio_path=speaker_wav)
+    return gpt_cond_latent, diffusion_conditioning, speaker_embedding
+def format_prompt(message, history):
+    prompt = (
+        "<s>[INST]"
+        + system_message
+        + "[/INST] I understand, I am a Mistral chatbot with speech by Coqui team.</s>"
+    )
+    for user_prompt, bot_response in history:
+        prompt += f"[INST] {user_prompt} [/INST]"
+        prompt += f" {bot_response}</s> "
+    prompt += f"[INST] {message} [/INST]"
+    return prompt
+def generate(
+    prompt,
+    history,
+    temperature=0.9,
+    max_new_tokens=256,
+    top_p=0.95,
+    repetition_penalty=1.0,
+):
+    temperature = float(temperature)
+    if temperature < 1e-2:
+        temperature = 1e-2
+    top_p = float(top_p)
+    generate_kwargs = dict(
+        temperature=temperature,
+        max_new_tokens=max_new_tokens,
+        top_p=top_p,
+        repetition_penalty=repetition_penalty,
+        do_sample=True,
+        seed=42,
+    )
+    formatted_prompt = format_prompt(prompt, history)
+    try:
+        stream = text_client.text_generation(
+            formatted_prompt,
+            **generate_kwargs,
+            stream=True,
+            details=True,
+            return_full_text=False,
+        )
+        output = ""
+        for response in stream:
+            output += response.token.text
+            yield output
+    except Exception as e:
+        if "Too Many Requests" in str(e):
+            print("ERROR: Too many requests on mistral client")
+            gr.Warning("Unfortunately Mistral is unable to process")
+            output = "Unfortuanately I am not able to process your request now !"
+        else:
+            print("Unhandled Exception: ", str(e))
+            gr.Warning("Unfortunately Mistral is unable to process")
+            output = "I do not know what happened but I could not understand you ."
+    return output
+def transcribe(wav_path):
+    try:
+        # get first element from whisper_jax and strip it to delete begin and end space
+        return whisper_client.predict(
+            wav_path,  # str (filepath or URL to file) in 'inputs' Audio component
+            "transcribe",  # str in 'Task' Radio component
+            False,  # return_timestamps=False for whisper-jax https://gist.github.com/sanchit-gandhi/781dd7003c5b201bfe16d28634c8d4cf#file-whisper_jax_endpoint-py
+            api_name="/predict",
+        )[0].strip()
+    except:
+        gr.Warning("There was a problem with Whisper endpoint, telling a joke for you.")
+        return "There was a problem with my voice, tell me joke"
+# Chatbot demo with multimodal input (text, markdown, LaTeX, code blocks, image, audio, & video). Plus shows support for streaming text.
+def add_text(history, text):
+    history = [] if history is None else history
+    history = history + [(text, None)]
+    return history, gr.update(value="", interactive=False)
+def add_file(history, file):
+    history = [] if history is None else history
+    try:
+        text = transcribe(file)
+        print("Transcribed text:", text)
+    except Exception as e:
+        print(str(e))
+        gr.Warning("There was an issue with transcription, please try writing for now")
+        # Apply a null text on error
+        text = "Transcription seems failed, please tell me a joke about chickens"
+    history = history + [(text, None)]
+    return history, gr.update(value="", interactive=False)
+##NOTE: not using this as it yields a character each time while we need to feed history to TTS
+def bot(history, system_prompt=""):
+    history = [] if history is None else history
+    if system_prompt == "":
+        system_prompt = system_message
+    history[-1][1] = ""
+    for character in generate(history[-1][0], history[:-1]):
+        history[-1][1] = character
+        yield history
+def get_latents(speaker_wav):
+    # Generate speaker embedding and latents for TTS
+    (
+        gpt_cond_latent,
+        diffusion_conditioning,
+        speaker_embedding,
+    ) = model.get_conditioning_latents(audio_path=speaker_wav)
+    return gpt_cond_latent, diffusion_conditioning, speaker_embedding
+latent_map = {}
+latent_map["Female_Voice"] = get_latents("examples/female.wav")
+def get_voice(prompt, language, latent_tuple, suffix="0"):
+    gpt_cond_latent, diffusion_conditioning, speaker_embedding = latent_tuple
+    # Direct version
+    t0 = time.time()
+    out = model.inference(
+        prompt, language, gpt_cond_latent, speaker_embedding, diffusion_conditioning
+    )
+    inference_time = time.time() - t0
+    print(f"I: Time to generate audio: {round(inference_time*1000)} milliseconds")
+    real_time_factor = (time.time() - t0) / out["wav"].shape[-1] * 24000
+    print(f"Real-time factor (RTF): {real_time_factor}")
+    wav_filename = f"output_{suffix}.wav"
+    torchaudio.save(wav_filename, torch.tensor(out["wav"]).unsqueeze(0), 24000)
+    return wav_filename
+def wave_header_chunk(frame_input=b"", channels=1, sample_width=2, sample_rate=24000):
+    # This will create a wave header then append the frame input
+    # It should be first on a streaming wav file
+    # Other frames better should not have it (else you will hear some artifacts each chunk start)
+    wav_buf = io.BytesIO()
+    with wave.open(wav_buf, "wb") as vfout:
+        vfout.setnchannels(channels)
+        vfout.setsampwidth(sample_width)
+        vfout.setframerate(sample_rate)
+        vfout.writeframes(frame_input)
+    wav_buf.seek(0)
+    return wav_buf.read()
+def get_voice_streaming(prompt, language, latent_tuple, suffix="0"):
+    gpt_cond_latent, diffusion_conditioning, speaker_embedding = latent_tuple
+    try:
+        t0 = time.time()
+        chunks = model.inference_stream(
+            prompt,
+            language,
+            gpt_cond_latent,
+            speaker_embedding,
+        )
+        first_chunk = True
+        for i, chunk in enumerate(chunks):
+            if first_chunk:
+                first_chunk_time = time.time() - t0
+                metrics_text = f"Latency to first audio chunk: {round(first_chunk_time*1000)} milliseconds\n"
+                first_chunk = False
+            print(f"Received chunk {i} of audio length {chunk.shape[-1]}")
+            # In case output is required to be multiple voice files
+            # out_file = f'{char}_{i}.wav'
+            # write(out_file, 24000, chunk.detach().cpu().numpy().squeeze())
+            # audio = AudioSegment.from_file(out_file)
+            # audio.export(out_file, format='wav')
+            # return out_file
+            # directly return chunk as bytes for streaming
+            chunk = chunk.detach().cpu().numpy().squeeze()
+            chunk = (chunk * 32767).astype(np.int16)
+            yield chunk.tobytes()
+    except RuntimeError as e:
+        if "device-side assert" in str(e):
+            # cannot do anything on cuda device side error, need tor estart
+            print(
+                f"Exit due to: Unrecoverable exception caused by prompt:{sentence}",
+                flush=True,
+            )
+            gr.Warning("Unhandled Exception encounter, please retry in a minute")
+            print("Cuda device-assert Runtime encountered need restart")
+            # HF Space specific.. This error is unrecoverable need to restart space
+            api.restart_space(repo_id=repo_id)
+        else:
+            print("RuntimeError: non device-side assert error:", str(e))
+            gr.Warning("Unhandled Exception encounter, please retry in a minute")
+            return None
+        return None
+    except:
+        return None
+def get_sentence(history, system_prompt=""):
+    history = [] if history is None else history
+    if system_prompt == "":
+        system_prompt = system_message
+    history[-1][1] = ""
+    mistral_start = time.time()
+    print("Mistral start")
+    sentence_list = []
+    sentence_hash_list = []
+    text_to_generate = ""
+    for character in generate(history[-1][0], history[:-1]):
+        history[-1][1] = character
+        # It is coming word by word
+        text_to_generate = nltk.sent_tokenize(history[-1][1].replace("\n", " ").strip())
+        if len(text_to_generate) > 1:
+            dif = len(text_to_generate) - len(sentence_list)
+            if dif == 1 and len(sentence_list) != 0:
+                continue
+            sentence = text_to_generate[len(sentence_list)]
+            # This is expensive replace with hashing!
+            sentence_hash = hash(sentence)
+            if sentence_hash not in sentence_hash_list:
+                sentence_hash_list.append(sentence_hash)
+                sentence_list.append(sentence)
+                print("New Sentence: ", sentence)
+                yield (sentence, history)
+    # return that final sentence token
+    # TODO need a counter that one may be replica as before
+    last_sentence = nltk.sent_tokenize(history[-1][1].replace("\n", " ").strip())[-1]
+    sentence_hash = hash(last_sentence)
+    if sentence_hash not in sentence_hash_list:
+        sentence_hash_list.append(sentence_hash)
+        sentence_list.append(last_sentence)
+        print("New Sentence: ", last_sentence)
+        yield (last_sentence, history)
+def generate_speech(history):
+    language = "en"
+    wav_bytestream = b""
+    for sentence, history in get_sentence(history):
+        print(sentence)
+        # Sometimes prompt </s> coming on output remove it
+        sentence = sentence.replace("</s>", "")
+        # A fast fix for last chacter, may produce weird sounds if it is with text
+        if sentence[-1] in ["!", "?", ".", ","]:
+            # just add a space
+            sentence = sentence[:-1] + " " + sentence[-1]
+        print("Sentence for speech:", sentence)
+        try:
+            # generate speech using precomputed latents
+            # This is not streaming but it will be fast
+            # wav = get_voice(sentence,language, latent_map["Female_Voice"], suffix=len(wav_list))
+            audio_stream = get_voice_streaming(
+                sentence, language, latent_map["Female_Voice"]
+            )
+            wav_chunks = wave_header_chunk()
+            frame_length = 0
+            for chunk in audio_stream:
+                try:
+                    wav_bytestream += chunk
+                    if DIRECT_STREAM:
+                        yield (
+                            gr.Audio.update(
+                                value=wave_header_chunk() + chunk, autoplay=True
+                            ),
+                            history,
+                        )
+                        wait_time = len(chunk) / 2 / 24000
+                        wait_time = AUDIO_WAIT_MODIFIER * wait_time
+                        print("Sleeping till chunk end")
+                        time.sleep(wait_time)
+                    else:
+                        wav_chunks += chunk
+                        frame_length += len(chunk)
+                except:
+                    # hack to continue on playing. sometimes last chunk is empty , will be fixed on next TTS
+                    continue
+            if not DIRECT_STREAM:
+                yield (gr.Audio.update(value=wav_chunks, autoplay=True), history)
+                # Streaming wait time calculation
+                # audio_length = frame_length / sample_width/ frame_rate
+                wait_time = frame_length / 2 / 24000
+                # for non streaming
+                # wait_time= librosa.get_duration(path=wav)
+                wait_time = AUDIO_WAIT_MODIFIER * wait_time
+                print("Sleeping till audio end")
+                time.sleep(wait_time)
+        except RuntimeError as e:
+            if "device-side assert" in str(e):
+                # cannot do anything on cuda device side error, need tor estart
+                print(
+                    f"Exit due to: Unrecoverable exception caused by prompt:{sentence}",
+                    flush=True,
+                )
+                gr.Warning("Unhandled Exception encounter, please retry in a minute")
+                print("Cuda device-assert Runtime encountered need restart")
+                # HF Space specific.. This error is unrecoverable need to restart space
+                api.restart_space(repo_id=repo_id)
+            else:
+                print("RuntimeError: non device-side assert error:", str(e))
+                raise e
+    # Spoken on autoplay everysencen now produce a concataned one at the one
+    # requires pip install ffmpeg-python
+    # files_to_concat= [ffmpeg.input(w) for w in wav_list]
+    # combined_file_name="combined.wav"
+    # ffmpeg.concat(*files_to_concat,v=0, a=1).output(combined_file_name).run(overwrite_output=True)
+    # final_audio.update(value=combined_file_name, visible=True)
+    # yield (combined_file_name, history
+    wav_bytestream = wave_header_chunk() + wav_bytestream
+    time.sleep(0.3)
+    yield (gr.Audio.update(value=None, autoplay=False), history)
+    yield (gr.Audio.update(value=wav_bytestream, autoplay=False), history)
+# Typewriter-style CSS for chatbot message paragraphs.
+# NOTE(review): in the visible code `css` is never passed to gr.Blocks
+# (it is constructed with title=title only), so these rules are presumably
+# not applied -- confirm.
+# NOTE(review): inside the string, `//` is not CSS comment syntax, and the
+# `animation` declaration is terminated by `;` after the first animation, so
+# the `blink-caret` line is not part of it -- confirm before enabling.
+css = """
+.bot .chatbot p {
+  overflow: hidden; /* Ensures the content is not revealed until the animation */
+  //border-right: .15em solid orange; /* The typwriter cursor */
+  white-space: nowrap; /* Keeps the content on a single line */
+  margin: 0 auto; /* Gives that scrolling effect as the typing happens */
+  letter-spacing: .15em; /* Adjust as needed */
+  animation:
+    typing 3.5s steps(40, end);
+    blink-caret .75s step-end infinite;
+}
+/* The typing effect */
+@keyframes typing {
+  from { width: 0 }
+  to { width: 100% }
+}
+/* The typewriter cursor effect */
+@keyframes blink-caret {
+  from, to { border-color: transparent }
+  50% { border-color: orange; }
+}
+"""
+with gr.Blocks(title=title) as demo:
+    gr.Markdown(DESCRIPTION)
+    chatbot = gr.Chatbot(
+        [],
+        elem_id="chatbot",
+        avatar_images=("examples/lama.jpeg", "examples/lama2.jpeg"),
+        bubble_full_width=False,
+    )
+    with gr.Row():
+        txt = gr.Textbox(
+            scale=3,
+            show_label=False,
+            placeholder="Enter text and press enter, or speak to your microphone",
+            container=False,
+        )
+        txt_btn = gr.Button(value="Submit text", scale=1)
+        btn = gr.Audio(source="microphone", type="filepath", scale=4)
+    with gr.Row():
+        audio = gr.Audio(
+            label="Generated audio response",
+            streaming=False,
+            autoplay=False,
+            interactive=True,
+            show_label=True,
+        )
+        # TODO add a second audio that plays whole sentences (for mobile especially)
+        # final_audio = gr.Audio(label="Final audio response", streaming=False, autoplay=False, interactive=False,show_label=True, visible=False)
+    clear_btn = gr.ClearButton([chatbot, audio])
+    txt_msg = txt_btn.click(add_text, [chatbot, txt], [chatbot, txt], queue=False).then(
+        generate_speech, chatbot, [audio, chatbot]
+    )
+    txt_msg.then(lambda: gr.update(interactive=True), None, [txt], queue=False)
+    txt_msg = txt.submit(add_text, [chatbot, txt], [chatbot, txt], queue=False).then(
+        generate_speech, chatbot, [audio, chatbot]
+    )
+    txt_msg.then(lambda: gr.update(interactive=True), None, [txt], queue=False)
+    file_msg = btn.stop_recording(
+        add_file, [chatbot, btn], [chatbot, txt], queue=False
+    ).then(generate_speech, chatbot, [audio, chatbot])
+    gr.Markdown(
+        """
+This Space demonstrates how to speak to a chatbot, based solely on open-source models.
+It relies on 3 models:
+1. [Whisper-large-v2](https://huggingface.co/spaces/sanchit-gandhi/whisper-jax) as an ASR model, to transcribe recorded audio to text. It is called through a [gradio client](https://www.gradio.app/docs/client).
+2. [Mistral-7b-instruct](https://huggingface.co/spaces/osanseviero/mistral-super-fast) as the chat model, the actual chat model. It is called from [huggingface_hub](https://huggingface.co/docs/huggingface_hub/guides/inference).
+3. [Coqui's XTTS](https://huggingface.co/spaces/coqui/xtts) as a TTS model, to generate the chatbot answers. This time, the model is hosted locally.
+Note:
+- By using this demo you agree to the terms of the Coqui Public Model License at https://coqui.ai/cpml"""
+    )
+demo.queue()
+demo.launch(debug=True)