jazy707 committed
Commit cb12e9d · verified · 1 Parent(s): 4a8825d

Create app.py

Files changed (1)
  1. app.py +181 -0
app.py ADDED
@@ -0,0 +1,181 @@
+ '''
+ +----------------------+     +-------------------------+     +-------------------------------+     +-------------------------+
+ | Step 1: Set Up       |     | Step 2: Set Up Gradio   |     | Step 3: Speech-to-Text        |     | Step 4: Text-to-Speech  |
+ | Environment          |     | Interface               |     | & Language Model Processing   |     | Output                  |
+ +----------------------+     +-------------------------+     +-------------------------------+     +-------------------------+
+ |                      |     |                         |     |                               |     |                         |
+ | - Import Python      |     | - Define interface      |     | - Transcribe audio            |     | - XTTS model generates  |
+ |   libraries          |     |   components            |     |   to text using               |     |   spoken response from  |
+ | - Initialize models: |---->| - Configure audio and   |---->|   Faster Whisper ASR          |---->|   LLM's text response   |
+ |   Whisper, Mistral,  |     |   text interaction      |     | - Transcribed text            |     |                         |
+ |   XTTS               |     | - Launch interface      |     |   is added to                 |     |                         |
+ |                      |     |                         |     |   chatbot's history           |     |                         |
+ |                      |     |                         |     | - Mistral LLM                 |     |                         |
+ |                      |     |                         |     |   processes chatbot           |     |                         |
+ |                      |     |                         |     |   history to generate         |     |                         |
+ |                      |     |                         |     |   response                    |     |                         |
+ +----------------------+     +-------------------------+     +-------------------------------+     +-------------------------+
+ '''
+
+ ###### Set Up Environment ######
+
+ import os
+ # Set the CUDA compiler path and install llama-cpp-python,
+ # a Python binding for the llama.cpp library, which enables LLM inference in pure C/C++
+ os.environ["CUDACXX"] = "/usr/local/cuda/bin/nvcc"
+ # Download the UniDic dictionary used by the TTS package for Japanese text processing
+ os.system('python -m unidic download')
+ os.system('CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python==0.2.11 --verbose')
+
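+ # Note: building llama-cpp-python from source with cuBLAS enabled happens at startup
+ # and can take several minutes the first time the Space boots.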
+
+ # Third-party library imports
+ from faster_whisper import WhisperModel
+ import gradio as gr
+ from huggingface_hub import hf_hub_download
+ from llama_cpp import Llama
+ from TTS.tts.configs.xtts_config import XttsConfig
+ from TTS.tts.models.xtts import Xtts
+ from TTS.utils.generic_utils import get_user_data_dir
+ from TTS.utils.manage import ModelManager
+
+ # Local imports
+ from utils import get_sentence, generate_speech_for_sentence, wave_header_chunk
+
+ # Load Whisper ASR model
+ print("Loading Whisper ASR")
+ whisper_model = WhisperModel("large-v3", device="cuda", compute_type="float16")
+
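+ # Note: "large-v3" with float16 (and everything below) assumes a CUDA GPU is available;
+ # on CPU-only hardware a smaller Whisper model and compute_type="int8" would be a more realistic choice.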
+ # Load Mistral LLM
+ print("Loading Mistral LLM")
+ hf_hub_download(repo_id="TheBloke/Mistral-7B-Instruct-v0.1-GGUF", local_dir=".", filename="mistral-7b-instruct-v0.1.Q5_K_M.gguf")
+ mistral_model_path = "./mistral-7b-instruct-v0.1.Q5_K_M.gguf"
+ # n_gpu_layers=35 offloads all transformer layers of the 7B model to the GPU; n_ctx sets the context window to 4096 tokens
+ mistral_llm = Llama(model_path=mistral_model_path, n_gpu_layers=35, max_new_tokens=256, context_window=4096, n_ctx=4096, n_batch=128, verbose=False)
+
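+ # Note: setting COQUI_TOS_AGREED=1 below accepts the Coqui model license non-interactively,
+ # and use_deepspeed=True assumes DeepSpeed is installed in the Space's environment.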
+
+ # Load XTTS Model
+ print("Loading XTTS model")
+ os.environ["COQUI_TOS_AGREED"] = "1"
+ tts_model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
+ ModelManager().download_model(tts_model_name)
+ tts_model_path = os.path.join(get_user_data_dir("tts"), tts_model_name.replace("/", "--"))
+ config = XttsConfig()
+ config.load_json(os.path.join(tts_model_path, "config.json"))
+ xtts_model = Xtts.init_from_config(config)
+ xtts_model.load_checkpoint(
+     config,
+     checkpoint_path=os.path.join(tts_model_path, "model.pth"),
+     vocab_path=os.path.join(tts_model_path, "vocab.json"),
+     eval=True,
+     use_deepspeed=True,
+ )
+ xtts_model.cuda()
+
+ ###### Set up Gradio Interface ######
+
+ with gr.Blocks(title="Voice chat with LLM") as demo:
+     DESCRIPTION = """# Voice chat with LLM"""
+     gr.Markdown(DESCRIPTION)
+
+     # Define chatbot component
+     chatbot = gr.Chatbot(
+         value=[(None, "Hi friend, I'm Amy, an AI coach. How can I help you today?")],  # Initial greeting from the chatbot
+         elem_id="chatbot",
+         avatar_images=("examples/hf-logo.png", "examples/ai-chat-logo.png"),
+         bubble_full_width=False,
+     )
+
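+     # Note: the avatar image paths above are assumed to exist in the Space's examples/ directory.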
+     # Define chatbot voice component
+     VOICES = ["female", "male"]
+     with gr.Row():
+         chatbot_voice = gr.Dropdown(
+             label="Voice of the Chatbot",
+             info="Which voice the chatbot should speak with",
+             choices=VOICES,
+             max_choices=1,
+             value=VOICES[0],
+         )
+
+     # Define text and audio record input components
+     with gr.Row():
+         txt_box = gr.Textbox(
+             scale=3,
+             show_label=False,
+             placeholder="Enter text and press enter, or speak to your microphone",
+             container=False,
+             interactive=True,
+         )
+         # source= (singular) is the Gradio 3.x Audio API; Gradio 4.x renamed it to sources=[...]
+         audio_record = gr.Audio(source="microphone", type="filepath", scale=4)
+
+     # Define generated audio playback component
+     with gr.Row():
+         sentence = gr.Textbox(visible=False)
+         audio_playback = gr.Audio(
+             value=None,
+             label="Generated audio response",
+             streaming=True,
+             autoplay=True,
+             interactive=False,
+             show_label=True,
+         )
+
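+     # The three handlers below implement the chat flow: add_text/add_audio append the user's
+     # turn to the chatbot history, and generate_speech streams the spoken LLM response back.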
+     # Triggered on text submit (the result is then sent to generate_speech)
+     def add_text(chatbot_history, text):
+         chatbot_history = [] if chatbot_history is None else chatbot_history
+         chatbot_history = chatbot_history + [(text, None)]
+         return chatbot_history, gr.update(value="", interactive=False)
+
+     # Triggered on voice submit (transcribes the audio, then sends to generate_speech)
+     def add_audio(chatbot_history, audio):
+         chatbot_history = [] if chatbot_history is None else chatbot_history
+         # Get the Whisper transcription and strip leading/trailing whitespace
+         response, _ = whisper_model.transcribe(audio)
+         text = list(response)[0].text.strip()
+         print("Transcribed text:", text)
+         chatbot_history = chatbot_history + [(text, None)]
+         return chatbot_history, gr.update(value="", interactive=False)
+
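+     # generate_speech is a generator: it yields (sentence, history, audio bytes) tuples so the
+     # streaming Audio component can start playing each sentence as soon as it is synthesized.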
+     def generate_speech(chatbot_history, chatbot_voice, initial_greeting=False):
+         # Start by yielding an initial empty audio (a WAV header chunk) to set up autoplay
+         yield ("", chatbot_history, wave_header_chunk())
+
+         # Helper function to handle the speech generation and yielding process
+         def handle_speech_generation(sentence, chatbot_history, chatbot_voice):
+             if sentence != "":
+                 print("Processing sentence")
+                 generated_speech = generate_speech_for_sentence(chatbot_history, chatbot_voice, sentence, xtts_model, xtts_supported_languages=config.languages, return_as_byte=True)
+                 if generated_speech is not None:
+                     _, audio_dict = generated_speech
+                     yield (sentence, chatbot_history, audio_dict["value"])
+
+         if initial_greeting:
+             # Process only the initial greeting if specified
+             for _, sentence in chatbot_history:
+                 yield from handle_speech_generation(sentence, chatbot_history, chatbot_voice)
+         else:
+             # Continuously get and process sentences from a generator function
+             for sentence, chatbot_history in get_sentence(chatbot_history, mistral_llm):
+                 print("Inserting sentence to queue")
+                 yield from handle_speech_generation(sentence, chatbot_history, chatbot_voice)
+
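+     # Event wiring: submitting text (or stopping a recording) first appends the user turn with
+     # queue=False, then runs generate_speech to stream audio, then re-enables the input widgets.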
+     txt_msg = txt_box.submit(fn=add_text, inputs=[chatbot, txt_box], outputs=[chatbot, txt_box], queue=False
+         ).then(fn=generate_speech, inputs=[chatbot, chatbot_voice], outputs=[sentence, chatbot, audio_playback])
+
+     txt_msg.then(fn=lambda: gr.update(interactive=True), inputs=None, outputs=[txt_box], queue=False)
+
+     audio_msg = audio_record.stop_recording(fn=add_audio, inputs=[chatbot, audio_record], outputs=[chatbot, txt_box], queue=False
+         ).then(fn=generate_speech, inputs=[chatbot, chatbot_voice], outputs=[sentence, chatbot, audio_playback])
+
+     audio_msg.then(fn=lambda: (gr.update(interactive=True), gr.update(interactive=True, value=None)), inputs=None, outputs=[txt_box, audio_record], queue=False)
+
+     FOOTNOTE = """
+ This Space demonstrates how to talk to an LLM chatbot based solely on openly accessible models.
+ It relies on the following models:
+ - Speech-to-Text Model: [Faster-Whisper-large-v3](https://huggingface.co/Systran/faster-whisper-large-v3), an ASR model used to transcribe the recorded audio to text.
+ - Large Language Model: [Mistral-7b-instruct-v0.1-quantized](https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF), an LLM used to generate the chatbot responses.
+ - Text-to-Speech Model: [XTTS-v2](https://huggingface.co/spaces/coqui/xtts), a TTS model used to generate the chatbot's voice.
+
+ Note:
+ - Responses generated by the chat model should not be assumed to be correct or taken seriously; this is a demonstration only.
+ - iOS (iPhone/iPad) devices may not play the voice automatically, because autoplay is disabled by the vendor on these devices."""
+     gr.Markdown(FOOTNOTE)
+     demo.load(fn=generate_speech, inputs=[chatbot, chatbot_voice, gr.State(value=True)], outputs=[sentence, chatbot, audio_playback])
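+ # queue() enables the request queue, which Gradio needs in order to return generator-based (streaming) outputs to the UI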
+ demo.queue().launch(debug=True, share=True)