Spaces:

coqui
/

voice-chat-with-mistral

Paused

App Files Files Community

ggoknar committed on Oct 17, 2023

Commit

da4b074

1 Parent(s): d3d83c1

fix repo name

Browse files

Files changed (1) hide show

app.py +1 -431

app.py CHANGED Viewed

@@ -68,7 +68,7 @@ HF_TOKEN = os.environ.get("HF_TOKEN")
 # will use api to restart space on an unrecoverable error
 api = HfApi(token=HF_TOKEN)
-repo_id = "coqui/voice-chat-with-lama"
 default_system_message = """
 You are Mistral, a large language model trained and provided by Mistral, architecture of you is decoder-based LM. Your voice backend or text to speech TTS backend is provided via Coqui technology. You are right now served on Huggingface spaces.
@@ -106,433 +106,3 @@ text_client = InferenceClient(
     "mistralai/Mistral-7B-Instruct-v0.1",
     timeout=WHISPER_TIMEOUT,
 )
-###### COQUI TTS FUNCTIONS ######
-def get_latents(speaker_wav):
-    """Compute the conditioning latents for a reference speaker recording.
-
-    Kept as its own function so that voice cleanup/filtering can be added
-    here later.
-
-    Returns a ``(gpt_cond_latent, diffusion_conditioning, speaker_embedding)``
-    tuple, in the order produced by ``model.get_conditioning_latents``.
-    """
-    latents = model.get_conditioning_latents(audio_path=speaker_wav)
-    gpt_cond_latent, diffusion_conditioning, speaker_embedding = latents
-    return gpt_cond_latent, diffusion_conditioning, speaker_embedding
-def format_prompt(message, history):
-    """Build the ``[INST]``-tagged prompt string sent to the chat model.
-
-    The prompt opens with the module-level ``system_message`` and a canned
-    acknowledgement, replays every ``(user_prompt, bot_response)`` pair from
-    ``history``, and ends with the new ``message`` awaiting a reply.
-    """
-    pieces = [
-        "<s>[INST]"
-        + system_message
-        + "[/INST] I understand, I am a Mistral chatbot with speech by Coqui team.</s>"
-    ]
-    for user_prompt, bot_response in history:
-        pieces.append(f"[INST] {user_prompt} [/INST]")
-        pieces.append(f" {bot_response}</s> ")
-    pieces.append(f"[INST] {message} [/INST]")
-    return "".join(pieces)
-def generate(
-    prompt,
-    history,
-    temperature=0.9,
-    max_new_tokens=256,
-    top_p=0.95,
-    repetition_penalty=1.0,
-):
-    """Stream the chat model's reply to ``prompt`` as a growing string.
-
-    Args:
-        prompt: Latest user message.
-        history: Earlier ``(user_prompt, bot_response)`` pairs, passed on to
-            ``format_prompt``.
-        temperature: Sampling temperature; floored at ``1e-2``.
-        max_new_tokens: Forwarded to ``text_client.text_generation``.
-        top_p: Forwarded to ``text_client.text_generation``.
-        repetition_penalty: Forwarded to ``text_client.text_generation``.
-
-    Yields:
-        The reply accumulated so far, once per streamed token. If the request
-        fails, a final fallback message is yielded in place of the reply.
-    """
-    generate_kwargs = dict(
-        temperature=max(float(temperature), 1e-2),
-        max_new_tokens=max_new_tokens,
-        top_p=float(top_p),
-        repetition_penalty=repetition_penalty,
-        do_sample=True,
-        seed=42,
-    )
-    formatted_prompt = format_prompt(prompt, history)
-    output = ""
-    try:
-        stream = text_client.text_generation(
-            formatted_prompt,
-            **generate_kwargs,
-            stream=True,
-            details=True,
-            return_full_text=False,
-        )
-        for response in stream:
-            output += response.token.text
-            yield output
-    except Exception as e:
-        if "Too Many Requests" in str(e):
-            print("ERROR: Too many requests on mistral client")
-            gr.Warning("Unfortunately Mistral is unable to process")
-            output = "Unfortuanately I am not able to process your request now !"
-        else:
-            print("Unhandled Exception: ", str(e))
-            gr.Warning("Unfortunately Mistral is unable to process")
-            output = "I do not know what happened but I could not understand you ."
-        # This function is a generator: a bare ``return output`` only sets
-        # StopIteration.value, which ``for`` loops discard. Yield the fallback
-        # so callers iterating the stream actually receive it.
-        yield output
-    return output
-def transcribe(wav_path):
-    """Transcribe the recording at ``wav_path`` through ``whisper_client``.
-
-    Args:
-        wav_path: File path (or URL) of the audio to transcribe.
-
-    Returns:
-        The first element of the endpoint's result with surrounding whitespace
-        stripped, or a fixed fallback request if the endpoint call fails.
-    """
-    try:
-        # get first element from whisper_jax and strip it to delete begin and end space
-        return whisper_client.predict(
-            wav_path,  # str (filepath or URL to file) in 'inputs' Audio component
-            "transcribe",  # str in 'Task' Radio component
-            False,  # return_timestamps=False for whisper-jax https://gist.github.com/sanchit-gandhi/781dd7003c5b201bfe16d28634c8d4cf#file-whisper_jax_endpoint-py
-            api_name="/predict",
-        )[0].strip()
-    except Exception as e:
-        # Catch Exception rather than everything so KeyboardInterrupt and
-        # SystemExit still propagate, and log the cause instead of hiding it.
-        print("Whisper endpoint error:", str(e))
-        gr.Warning("There was a problem with Whisper endpoint, telling a joke for you.")
-        return "There was a problem with my voice, tell me joke"
-# Chatbot demo with multimodal input (text, markdown, LaTeX, code blocks, image, audio, & video). Plus shows support for streaming text.
-def add_text(history, text):
-    """Append a typed user message to the chat history.
-
-    Returns the extended history (the reply slot is ``None``) together with a
-    ``gr.update`` that clears the textbox and makes it non-interactive.
-    """
-    if history is None:
-        history = []
-    extended = history + [(text, None)]
-    return extended, gr.update(value="", interactive=False)
-def add_file(history, file):
-    """Transcribe a recorded audio file and append it as a user message.
-
-    If transcription raises, a warning is shown and a fixed placeholder request
-    is used as the message instead. Returns the extended history and a
-    ``gr.update`` that clears the textbox and makes it non-interactive.
-    """
-    if history is None:
-        history = []
-    try:
-        text = transcribe(file)
-        print("Transcribed text:", text)
-    except Exception as e:
-        print(str(e))
-        gr.Warning("There was an issue with transcription, please try writing for now")
-        # Fall back to a placeholder message when transcription fails
-        text = "Transcription seems failed, please tell me a joke about chickens"
-    return history + [(text, None)], gr.update(value="", interactive=False)
-##NOTE: not using this as it yields a character each time while we need to feed history to TTS
-def bot(history, system_prompt=""):
-    """Stream the model's reply into the last history entry.
-
-    After every update from ``generate`` the whole history is yielded, with
-    the reply slot of the final entry holding the text received so far.
-    """
-    if history is None:
-        history = []
-    if system_prompt == "":
-        system_prompt = system_message
-    latest = history[-1]
-    latest[1] = ""
-    for partial_reply in generate(latest[0], history[:-1]):
-        latest[1] = partial_reply
-        yield history
-def get_latents(speaker_wav):
-    """Generate the speaker embedding and latents used for TTS.
-
-    NOTE(review): a function with this same name and body is defined earlier
-    in the file; this later definition is the one that stays bound.
-
-    Returns ``(gpt_cond_latent, diffusion_conditioning, speaker_embedding)``
-    as produced by ``model.get_conditioning_latents``.
-    """
-    conditioning = model.get_conditioning_latents(audio_path=speaker_wav)
-    gpt_latent, diffusion_cond, spk_embedding = conditioning
-    return gpt_latent, diffusion_cond, spk_embedding
-# Precomputed conditioning latents per voice name, built once at import time
-latent_map = {"Female_Voice": get_latents("examples/female.wav")}
-def get_voice(prompt, language, latent_tuple, suffix="0"):
-    """Synthesize ``prompt`` in one call and save it as ``output_{suffix}.wav``.
-
-    ``latent_tuple`` is ``(gpt_cond_latent, diffusion_conditioning,
-    speaker_embedding)``. Prints the generation time and real-time factor,
-    writes the audio at 24000 Hz and returns the file name.
-    """
-    gpt_cond_latent, diffusion_conditioning, speaker_embedding = latent_tuple
-    # Direct (non-streaming) version
-    started = time.time()
-    result = model.inference(
-        prompt, language, gpt_cond_latent, speaker_embedding, diffusion_conditioning
-    )
-    elapsed = time.time() - started
-    print(f"I: Time to generate audio: {round(elapsed*1000)} milliseconds")
-    # elapsed seconds divided by audio seconds (samples / 24000)
-    real_time_factor = (time.time() - started) / result["wav"].shape[-1] * 24000
-    print(f"Real-time factor (RTF): {real_time_factor}")
-    wav_filename = f"output_{suffix}.wav"
-    waveform = torch.tensor(result["wav"]).unsqueeze(0)
-    torchaudio.save(wav_filename, waveform, 24000)
-    return wav_filename
-def wave_header_chunk(frame_input=b"", channels=1, sample_width=2, sample_rate=24000):
-    """Return a WAV header followed by ``frame_input`` as bytes.
-
-    Use the result as the first piece of a streamed WAV file. Later pieces
-    should be raw frames without a header, otherwise artifacts are heard at
-    the start of each chunk.
-    """
-    buffer = io.BytesIO()
-    with wave.open(buffer, "wb") as writer:
-        writer.setnchannels(channels)
-        writer.setsampwidth(sample_width)
-        writer.setframerate(sample_rate)
-        writer.writeframes(frame_input)
-    return buffer.getvalue()
-def get_voice_streaming(prompt, language, latent_tuple, suffix="0"):
-    """Stream synthesized speech for ``prompt`` as raw int16 byte chunks.
-
-    Args:
-        prompt: Text to synthesize.
-        language: Language code forwarded to ``model.inference_stream``.
-        latent_tuple: ``(gpt_cond_latent, diffusion_conditioning,
-            speaker_embedding)``; the diffusion conditioning is not passed to
-            the streaming call.
-        suffix: Unused here; kept so the signature matches ``get_voice``.
-
-    Yields:
-        ``bytes`` of int16 samples for each audio chunk, without a WAV header.
-
-    On a CUDA device-side assert the Space is restarted through
-    ``api.restart_space``. Any other failure is logged and ends the stream.
-    """
-    gpt_cond_latent, _diffusion_conditioning, speaker_embedding = latent_tuple
-    try:
-        t0 = time.time()
-        chunks = model.inference_stream(
-            prompt,
-            language,
-            gpt_cond_latent,
-            speaker_embedding,
-        )
-        for i, chunk in enumerate(chunks):
-            if i == 0:
-                first_chunk_time = time.time() - t0
-                print(
-                    f"Latency to first audio chunk: {round(first_chunk_time*1000)} milliseconds"
-                )
-            print(f"Received chunk {i} of audio length {chunk.shape[-1]}")
-            # directly return chunk as bytes for streaming
-            # assumes samples are floats in [-1, 1] before scaling - TODO confirm
-            chunk = chunk.detach().cpu().numpy().squeeze()
-            chunk = (chunk * 32767).astype(np.int16)
-            yield chunk.tobytes()
-    except RuntimeError as e:
-        if "device-side assert" in str(e):
-            # cannot do anything on cuda device side error, need to restart
-            # (the original message interpolated an undefined name here)
-            print(
-                f"Exit due to: Unrecoverable exception caused by prompt:{prompt}",
-                flush=True,
-            )
-            gr.Warning("Unhandled Exception encounter, please retry in a minute")
-            print("Cuda device-assert Runtime encountered need restart")
-            # HF Space specific.. This error is unrecoverable need to restart space
-            api.restart_space(repo_id=repo_id)
-        else:
-            print("RuntimeError: non device-side assert error:", str(e))
-            gr.Warning("Unhandled Exception encounter, please retry in a minute")
-    except Exception as e:
-        # Still best-effort (the stream just ends), but leave a trace of why
-        print("Unhandled exception while streaming audio:", str(e))
-def get_sentence(history, system_prompt=""):
-    """Yield the bot's reply one completed sentence at a time.
-
-    The reply to the last user message in ``history`` is streamed from
-    ``generate`` and written into ``history[-1][1]``. Whenever sentence
-    tokenization shows a new finished sentence, it is yielded together with
-    the history. After the stream ends, the final sentence is yielded too.
-
-    Args:
-        history: Chat history; the last entry holds the message to answer.
-        system_prompt: Currently unused; kept for interface compatibility.
-
-    Yields:
-        ``(sentence, history)`` tuples; each distinct sentence is yielded once.
-    """
-    history = [] if history is None else history
-    history[-1][1] = ""
-    print("Mistral start")
-    sentence_list = []
-    # A set gives O(1) membership tests; it replaces the list of hashes
-    seen_sentences = set()
-    for character in generate(history[-1][0], history[:-1]):
-        history[-1][1] = character
-        # It is coming word by word
-        text_to_generate = nltk.sent_tokenize(history[-1][1].replace("\n", " ").strip())
-        # The last tokenized sentence may still be growing, so a new sentence
-        # is only complete when at least two are pending. Skipping a
-        # difference <= 0 also avoids indexing past the end if tokenization
-        # of the longer text merges sentences.
-        if len(text_to_generate) - len(sentence_list) <= 1:
-            continue
-        sentence = text_to_generate[len(sentence_list)]
-        if sentence not in seen_sentences:
-            seen_sentences.add(sentence)
-            sentence_list.append(sentence)
-            print("New Sentence: ", sentence)
-            yield (sentence, history)
-    # return that final sentence token
-    # TODO need a counter that one may be replica as before
-    final_sentences = nltk.sent_tokenize(history[-1][1].replace("\n", " ").strip())
-    if not final_sentences:
-        # Empty reply: nothing to speak, and [-1] would raise IndexError
-        return
-    last_sentence = final_sentences[-1]
-    if last_sentence not in seen_sentences:
-        seen_sentences.add(last_sentence)
-        sentence_list.append(last_sentence)
-        print("New Sentence: ", last_sentence)
-        yield (last_sentence, history)
-def generate_speech(history):
-    """Speak the bot's reply sentence by sentence.
-
-    For each sentence from ``get_sentence`` the audio is synthesized with
-    ``get_voice_streaming``, collected into one WAV byte string and yielded
-    as an autoplaying audio update with the history. The generator then
-    sleeps for roughly the clip's duration so clips do not overlap.
-
-    Args:
-        history: Chat history whose last entry is the message to answer.
-
-    Yields:
-        ``(gr.Audio.update(...), history)`` once per spoken sentence.
-
-    Raises:
-        RuntimeError: Re-raised unless it is a CUDA device-side assert, in
-            which case the Space is restarted through ``api.restart_space``.
-    """
-    language = "en"
-    wav_list = []
-    for sentence, history in get_sentence(history):
-        print(sentence)
-        # Sometimes prompt </s> coming on output remove it
-        sentence = sentence.replace("</s>", "")
-        if not sentence.strip():
-            # Nothing left to speak (e.g. the sentence was only "</s>");
-            # sentence[-1] below would raise IndexError on an empty string.
-            continue
-        # A fast fix for last character, may produce weird sounds if it is with text
-        if sentence[-1] in ["!", "?", ".", ","]:
-            # just add a space
-            sentence = sentence[:-1] + " " + sentence[-1]
-        print("Sentence for speech:", sentence)
-        try:
-            # generate speech using precomputed latents
-            audio_stream = get_voice_streaming(
-                sentence, language, latent_map["Female_Voice"], suffix=len(wav_list)
-            )
-            # bytearray grows in place instead of copying on every chunk
-            wav_buffer = bytearray(wave_header_chunk())
-            frame_length = 0
-            for chunk in audio_stream:
-                try:
-                    wav_buffer += chunk
-                except TypeError:
-                    # hack to continue on playing: skip a chunk that is not bytes
-                    continue
-                frame_length += len(chunk)
-            wav_chunks = bytes(wav_buffer)
-            wav_list.append(wav_chunks)
-            yield (gr.Audio.update(value=wav_chunks, autoplay=True), history)
-            # Streaming wait time calculation
-            # audio_length = frame_length / sample_width / frame_rate
-            wait_time = frame_length / 2 / 24000 + 0.5  # plus 500ms
-            wait_time = AUDIO_WAIT_MODIFIER * wait_time
-            print("Sleeping till audio end")
-            time.sleep(wait_time)
-        except RuntimeError as e:
-            if "device-side assert" in str(e):
-                # cannot do anything on cuda device side error, need to restart
-                print(
-                    f"Exit due to: Unrecoverable exception caused by prompt:{sentence}",
-                    flush=True,
-                )
-                gr.Warning("Unhandled Exception encounter, please retry in a minute")
-                print("Cuda device-assert Runtime encountered need restart")
-                # HF Space specific.. This error is unrecoverable need to restart space
-                api.restart_space(repo_id=repo_id)
-            else:
-                print("RuntimeError: non device-side assert error:", str(e))
-                raise e
-    # TODO: every sentence is spoken on autoplay; a concatenated clip of
-    # wav_list could be produced at the end (would need ffmpeg-python).
-# Stylesheet meant to give bot messages a typewriter animation.
-# NOTE(review): in the lines visible here `css` is never passed to
-# gr.Blocks(...), so it appears to have no effect - confirm before relying on it.
-# NOTE(review): `//` is not a CSS comment marker, and the `;` after the
-# `typing ...` line ends the `animation` declaration, so the `blink-caret`
-# line is not part of it.
-css = """
-.bot .chatbot p {
-  overflow: hidden; /* Ensures the content is not revealed until the animation */
-  //border-right: .15em solid orange; /* The typwriter cursor */
-  white-space: nowrap; /* Keeps the content on a single line */
-  margin: 0 auto; /* Gives that scrolling effect as the typing happens */
-  letter-spacing: .15em; /* Adjust as needed */
-  animation:
-    typing 3.5s steps(40, end);
-    blink-caret .75s step-end infinite;
-}
-/* The typing effect */
-@keyframes typing {
-  from { width: 0 }
-  to { width: 100% }
-}
-/* The typewriter cursor effect */
-@keyframes blink-caret {
-  from, to { border-color: transparent }
-  50% { border-color: orange; }
-}
-"""
-# Build the Gradio UI: a chat window, text and microphone inputs, and an
-# audio player for the generated speech.
-with gr.Blocks(title=title) as demo:
-    gr.Markdown(DESCRIPTION)
-    chatbot = gr.Chatbot(
-        [],
-        elem_id="chatbot",
-        avatar_images=("examples/lama.jpeg", "examples/lama2.jpeg"),
-        bubble_full_width=False,
-    )
-    with gr.Row():
-        txt = gr.Textbox(
-            scale=3,
-            show_label=False,
-            placeholder="Enter text and press enter, or speak to your microphone",
-            container=False,
-        )
-        txt_btn = gr.Button(value="Submit text", scale=1)
-        btn = gr.Audio(source="microphone", type="filepath", scale=4)
-    with gr.Row():
-        audio = gr.Audio(
-            label="Generated audio response",
-            streaming=False,
-            autoplay=False,
-            interactive=True,
-            show_label=True,
-        )
-        # TODO add a second audio that plays whole sentences (for mobile especially)
-    clear_btn = gr.ClearButton([chatbot, audio])
-    # Each input flow first records the user message (which disables the
-    # textbox), then speaks the reply, then re-enables the textbox.
-    txt_msg = txt_btn.click(add_text, [chatbot, txt], [chatbot, txt], queue=False).then(
-        generate_speech, chatbot, [audio, chatbot]
-    )
-    txt_msg.then(lambda: gr.update(interactive=True), None, [txt], queue=False)
-    txt_msg = txt.submit(add_text, [chatbot, txt], [chatbot, txt], queue=False).then(
-        generate_speech, chatbot, [audio, chatbot]
-    )
-    txt_msg.then(lambda: gr.update(interactive=True), None, [txt], queue=False)
-    file_msg = btn.stop_recording(
-        add_file, [chatbot, btn], [chatbot, txt], queue=False
-    ).then(generate_speech, chatbot, [audio, chatbot])
-    # add_file also disables the textbox; re-enable it as the text flows do,
-    # otherwise it stays locked after a voice message.
-    file_msg.then(lambda: gr.update(interactive=True), None, [txt], queue=False)
-    gr.Markdown(
-        """
-This Space demonstrates how to speak to a chatbot, based solely on open-source models.
-It relies on 3 models:
-1. [Whisper-large-v2](https://huggingface.co/spaces/sanchit-gandhi/whisper-jax) as an ASR model, to transcribe recorded audio to text. It is called through a [gradio client](https://www.gradio.app/docs/client).
-2. [Mistral-7b-instruct](https://huggingface.co/spaces/osanseviero/mistral-super-fast) as the chat model, the actual chat model. It is called from [huggingface_hub](https://huggingface.co/docs/huggingface_hub/guides/inference).
-3. [Coqui's XTTS](https://huggingface.co/spaces/coqui/xtts) as a TTS model, to generate the chatbot answers. This time, the model is hosted locally.
-Note:
-- By using this demo you agree to the terms of the Coqui Public Model License at https://coqui.ai/cpml"""
-    )
-demo.queue()
-demo.launch(debug=True)

 # will use api to restart space on an unrecoverable error
 api = HfApi(token=HF_TOKEN)
+repo_id = "ylacombe/voice-chat-with-mistral"
 default_system_message = """
 You are Mistral, a large language model trained and provided by Mistral, architecture of you is decoder-based LM. Your voice backend or text to speech TTS backend is provided via Coqui technology. You are right now served on Huggingface spaces.
     "mistralai/Mistral-7B-Instruct-v0.1",
     timeout=WHISPER_TIMEOUT,
 )