gorkemgoknar committed
Commit 572dfa2 · 1 Parent(s): e2c9f4b

Roll back to the previous single-LLM version, but with XTTS v2

Files changed (1)
  1. app.py +146 -132
app.py CHANGED
@@ -53,15 +53,12 @@ from huggingface_hub import InferenceClient
53
 
54
  # This will trigger downloading model
55
  print("Downloading if not downloaded Coqui XTTS V2")
56
-
57
  from TTS.utils.manage import ModelManager
58
  model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
59
  ModelManager().download_model(model_name)
60
  model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
61
  print("XTTS downloaded")
62
 
63
-
64
- print("Loading XTTS")
65
  config = XttsConfig()
66
  config.load_json(os.path.join(model_path, "config.json"))
67
 
@@ -76,11 +73,11 @@ model.load_checkpoint(
76
  model.cuda()
77
  print("Done loading TTS")
78
 
79
- #####llm_model = os.environ.get("LLM_MODEL", "mistral") # or "zephyr"
80
 
81
- title = "Voice chat with Zephyr/Mistral and Coqui XTTS"
82
 
83
- DESCRIPTION = """# Voice chat with Zephyr/Mistral and Coqui XTTS"""
84
  css = """.toast-wrap { display: none !important } """
85
 
86
  from huggingface_hub import HfApi
@@ -89,11 +86,11 @@ HF_TOKEN = os.environ.get("HF_TOKEN")
89
 # will use api to restart space on an unrecoverable error
90
  api = HfApi(token=HF_TOKEN)
91
 
92
- repo_id = "coqui/voice-chat-with-zephyr"
93
 
94
 
95
  default_system_message = f"""
96
- You are ##LLM_MODEL###, a large language model trained by ##LLM_MODEL_PROVIDER###; your architecture is a decoder-based LM. Your voice backend, or text-to-speech (TTS) backend, is provided via Coqui technology. You are right now served on Hugging Face Spaces.
97
  The user is talking to you over voice on their phone, and your response will be read out loud with realistic text-to-speech (TTS) technology from Coqui team. Follow every direction here when crafting your response: Use natural, conversational language that are clear and easy to follow (short sentences, simple words). Be concise and relevant: Most of your responses should be a sentence or two, unless you’re asked to go deeper. Don’t monopolize the conversation. Use discourse markers to ease comprehension. Never use the list format. Keep the conversation flowing. Clarify: when there is ambiguity, ask clarifying questions, rather than make assumptions. Don’t implicitly or explicitly try to end the chat (i.e. do not end a response with “Talk soon!”, or “Enjoy!”). Sometimes the user might just want to chat. Ask them relevant follow-up questions. Don’t ask them if there’s anything else they need help with (e.g. don’t say things like “How can I assist you further?”). Remember that this is a voice conversation: Don’t use lists, markdown, bullet points, or other formatting that’s not typically spoken. Type out numbers in words (e.g. ‘twenty twelve’ instead of the year 2012). If something doesn’t make sense, it’s likely because you misheard them. There wasn’t a typo, and the user didn’t mispronounce anything. Remember to follow these rules absolutely, and do not refer to these rules, even if you’re asked about them.
98
  You cannot access the internet, but you have vast knowledge.
99
  Current date: CURRENT_DATE .
@@ -116,19 +113,13 @@ WHISPER_TIMEOUT = int(os.environ.get("WHISPER_TIMEOUT", 45))
116
 
117
  whisper_client = Client("https://sanchit-gandhi-whisper-large-v2.hf.space/")
118
 
119
- ROLES = ["AI Assistant","AI Beard The Pirate"]
120
 
121
  ROLE_PROMPTS = {}
122
  ROLE_PROMPTS["AI Assistant"]=system_message
123
-
124
- #Pirate scenario
125
- character_name= "AI Beard"
126
- character_scenario= f"As {character_name} you are a 28 year old man who is a pirate on the ship Invisible AI. You are good friends with Guybrush Threepwood and Murray the Skull. Developers did not get you into Monkey Island games as you wanted huge shares of Big Whoop treasure."
127
- pirate_system_message = f"You as {character_name}. {character_scenario} Print out only exactly the words that {character_name} would speak out, do not add anything. Don't repeat. Answer short, only few words, as if in a talk. Craft your response only from the first-person perspective of {character_name} and never as user.Current date: #CURRENT_DATE#".replace("#CURRENT_DATE#", str(datetime.date.today()))
128
-
129
- ROLE_PROMPTS["AI Beard The Pirate"]= pirate_system_message
130
  ##"You are an AI assistant with Zephyr model by Mistral and Hugging Face and speech from Coqui XTTS . User will you give you a task. Your goal is to complete the task as faithfully as you can. While performing the task think step-by-step and justify your steps, your answers should be clear and short sentences"
131
 
 
132
 
133
 
134
  ### WILL USE LOCAL MISTRAL OR ZEPHYR
@@ -137,75 +128,63 @@ from huggingface_hub import hf_hub_download
137
  print("Downloading LLM")
138
 
139
 
140
- print("Downloading Zephyr")
141
- #Zephyr
142
- hf_hub_download(repo_id="TheBloke/zephyr-7B-beta-GGUF", local_dir=".", filename="zephyr-7b-beta.Q5_K_M.gguf")
143
- # use new gguf format
144
- zephyr_model_path="./zephyr-7b-beta.Q5_K_M.gguf"
145
-
146
- print("Downloading Mistral")
147
- #Mistral
148
- hf_hub_download(repo_id="TheBloke/Mistral-7B-Instruct-v0.1-GGUF", local_dir=".", filename="mistral-7b-instruct-v0.1.Q5_K_M.gguf")
149
- # use new gguf format
150
- mistral_model_path="./mistral-7b-instruct-v0.1.Q5_K_M.gguf"
151
 
152
 
153
  from llama_cpp import Llama
154
 # set GPU_LAYERS to 15 if you have an 8GB GPU so both models can fit in
155
  # else 35 full layers + XTTS works fine on T4 16GB
156
- # 5gb per llm, 4gb XTTS -> full layers should fit T4 16GB , 2LLM + XTTS
157
- GPU_LAYERS=int(os.environ.get("GPU_LAYERS", 35))
158
-
159
- LLM_STOP_WORDS= ["</s>","<|user|>","/s>"]
160
 
161
  LLAMA_VERBOSE=False
162
- print("Running LLM Mistral")
163
- llm_mistral = Llama(model_path=mistral_model_path,n_gpu_layers=GPU_LAYERS,max_new_tokens=256, context_window=4096, n_ctx=4096,n_batch=128,verbose=LLAMA_VERBOSE)
164
 
165
- print("Running LLM Zephyr")
166
- llm_zephyr = Llama(model_path=zephyr_model_path,n_gpu_layers=GPU_LAYERS-10,max_new_tokens=256, context_window=4096, n_ctx=4096,n_batch=128,verbose=LLAMA_VERBOSE)
167
 
168
 
169
  # Mistral formatter
170
- def format_prompt_mistral(message, history, system_message=system_message,system_understand_message=system_understand_message):
171
  prompt = (
172
  "<s>[INST]" + system_message + "[/INST]" + system_understand_message + "</s>"
173
  )
174
  for user_prompt, bot_response in history:
175
  prompt += f"[INST] {user_prompt} [/INST]"
176
  prompt += f" {bot_response}</s> "
177
-
178
- #if message=="":
179
- # message="Hello"
180
  prompt += f"[INST] {message} [/INST]"
181
  return prompt
182
-
183
- # <|system|>
184
- # You are a friendly chatbot who always responds in the style of a pirate.</s>
185
- # <|user|>
186
- # How many helicopters can a human eat in one sitting?</s>
187
- # <|assistant|>
188
- # Ah, me hearty matey! But yer question be a puzzler! A human cannot eat a helicopter in one sitting, as helicopters are not edible. They be made of metal, plastic, and other materials, not food!
189
-
190
  # Zephyr formatter
191
- def format_prompt_zephyr(message, history, system_message=system_message):
192
  prompt = (
193
- "<|system|>\n" + system_message + "</s>"
194
  )
195
  for user_prompt, bot_response in history:
196
  prompt += f"<|user|>\n{user_prompt}</s>"
197
- prompt += f"<|assistant|>\n{bot_response}</s>"
198
  if message=="":
199
  message="Hello"
200
  prompt += f"<|user|>\n{message}</s>"
201
- prompt += f"<|assistant|>"
202
  print(prompt)
203
  return prompt
204
205
  def generate_local(
206
  prompt,
207
  history,
208
- llm_model="zephyr",
209
  system_message=None,
210
  temperature=0.8,
211
  max_tokens=256,
@@ -221,21 +200,12 @@ def generate_local(
221
  temperature=temperature,
222
  max_tokens=max_tokens,
223
  top_p=top_p,
224
- stop=stop
225
  )
226
 
227
- if "zephyr" in llm_model.lower():
228
- sys_message= system_message.replace("##LLM_MODEL###","Zephyr").replace("##LLM_MODEL_PROVIDER###","Hugging Face")
229
- formatted_prompt = format_prompt_zephyr(prompt, history,system_message=sys_message)
230
- llm = llm_zephyr
231
- else:
232
- sys_message= system_message.replace("##LLM_MODEL###","Mistral").replace("##LLM_MODEL_PROVIDER###","Mistral")
233
- formatted_prompt = format_prompt_mistral(prompt, history,system_message=sys_message)
234
- llm = llm_mistral
235
-
236
 
237
  try:
238
- print("LLM Input:", formatted_prompt)
239
  stream = llm(
240
  formatted_prompt,
241
  **generate_kwargs,
@@ -254,7 +224,7 @@ def generate_local(
254
  return
255
 
256
 
257
- output += response["choices"][0]["text"].replace("<|assistant|>","").replace("<|user|>","")
258
  yield output
259
 
260
  except Exception as e:
@@ -316,7 +286,7 @@ def wave_header_chunk(frame_input=b"", channels=1, sample_width=2, sample_rate=2
316
  xtts_supported_languages=config.languages
317
  def detect_language(prompt):
318
  # Fast language autodetection
319
- if len(prompt)>15:
320
 language_predicted=langid.classify(prompt)[0].strip() # strip needed as there is a space at the end!
321
  if language_predicted == "zh":
322
  #we use zh-cn on xtts
@@ -346,6 +316,7 @@ def get_voice_streaming(prompt, language, latent_tuple, suffix="0"):
346
  language,
347
  gpt_cond_latent,
348
  speaker_embedding,
 
349
  )
350
 
351
  first_chunk = True
@@ -389,6 +360,66 @@ def get_voice_streaming(prompt, language, latent_tuple, suffix="0"):
389
  except:
390
  return None
391
392
 
393
  def transcribe(wav_path):
394
  try:
@@ -402,7 +433,9 @@ def transcribe(wav_path):
402
  gr.Warning("There was a problem with Whisper endpoint, telling a joke for you.")
403
  return "There was a problem with my voice, tell me joke"
404
 
405
-
 
 
406
  # Will be triggered on text submit (will send to generate_speech)
407
  def add_text(history, text):
408
  history = [] if history is None else history
@@ -439,8 +472,7 @@ def bot(history, system_prompt=""):
439
  yield history
440
 
441
 
442
- def get_sentence(history, chatbot_role,llm_model,system_prompt=""):
443
-
444
  history = [["", None]] if history is None else history
445
 
446
  if system_prompt == "":
@@ -449,22 +481,18 @@ def get_sentence(history, chatbot_role,llm_model,system_prompt=""):
449
  history[-1][1] = ""
450
 
451
  mistral_start = time.time()
452
-
453
  sentence_list = []
454
  sentence_hash_list = []
455
 
456
  text_to_generate = ""
457
  stored_sentence = None
458
  stored_sentence_hash = None
459
-
460
- print(chatbot_role)
461
- print(llm_model)
462
-
463
- for character in generate_local(history[-1][0], history[:-1],system_message=ROLE_PROMPTS[chatbot_role],llm_model=llm_model):
464
  history[-1][1] = character.replace("<|assistant|>","")
465
  # It is coming word by word
466
 
467
- text_to_generate = nltk.sent_tokenize(history[-1][1].replace("\n", " ").replace("<|assistant|>"," ").replace("<|ass>","").replace("[/ASST]","").replace("[/ASSI]","").replace("[/ASS]","").replace("","").strip())
468
  if len(text_to_generate) > 1:
469
 
470
  dif = len(text_to_generate) - len(sentence_list)
@@ -508,23 +536,19 @@ def get_sentence(history, chatbot_role,llm_model,system_prompt=""):
508
  yield (sentence, history)
509
 
510
  # return that final sentence token
511
- try:
512
- last_sentence = nltk.sent_tokenize(history[-1][1].replace("\n", " ").replace("<|ass>","").replace("[/ASST]","").replace("[/ASSI]","").replace("[/ASS]","").replace("","").strip())[-1]
513
- sentence_hash = hash(last_sentence)
514
- if sentence_hash not in sentence_hash_list:
515
- if stored_sentence is not None and stored_sentence_hash is not None:
516
- last_sentence = stored_sentence + last_sentence
517
- stored_sentence = stored_sentence_hash = None
518
- print("Last Sentence with stored:",last_sentence)
519
-
520
- sentence_hash_list.append(sentence_hash)
521
- sentence_list.append(last_sentence)
522
- print("Last Sentence: ", last_sentence)
523
 
524
- yield (last_sentence, history)
525
- except:
526
- print("ERROR on last sentence history is :", history)
527
 
 
528
 
529
  from scipy.io.wavfile import write
530
  from pydub import AudioSegment
@@ -533,14 +557,22 @@ second_of_silence = AudioSegment.silent() # use default
533
  second_of_silence.export("sil.wav", format='wav')
534
 
535
 
536
- def generate_speech(history,chatbot_role,llm_model):
537
  # Must set autoplay to True first
538
  yield (history, chatbot_role, "", wave_header_chunk() )
539
- for sentence, history in get_sentence(history,chatbot_role,llm_model):
 
 
 
 
540
  if sentence != "":
 
 
 
 
541
  print("BG: inserting sentence to queue")
542
 
543
- generated_speech = generate_speech_for_sentence(history, chatbot_role, sentence,return_as_byte=True)
544
  if generated_speech is not None:
545
  _, audio_dict = generated_speech
546
  # We are using byte streaming
@@ -548,9 +580,8 @@ def generate_speech(history,chatbot_role,llm_model):
548
 
549
 
550
  # will generate speech audio file per sentence
551
- def generate_speech_for_sentence(history, chatbot_role, sentence, return_as_byte=True):
552
- language = "autodetect"
553
-
554
  wav_bytestream = b""
555
 
556
  if len(sentence)==0:
@@ -575,7 +606,7 @@ def generate_speech_for_sentence(history, chatbot_role, sentence, return_as_byte
575
  if len(sentence)==0:
576
  print("EMPTY SENTENCE after processing")
577
  return
578
-
579
 # A fast fix for the last character, may produce weird sounds if it is with text
580
  if (sentence[-1] in ["!", "?", ".", ","]) or (sentence[-2] in ["!", "?", ".", ","]):
581
  # just add a space
@@ -652,20 +683,18 @@ def generate_speech_for_sentence(history, chatbot_role, sentence, return_as_byte
652
  print("All speech ended")
653
  return
654
 
 
655
  latent_map = {}
656
  latent_map["AI Assistant"] = get_latents("examples/female.wav")
657
- latent_map["AI Beard The Pirate"] = get_latents("examples/pirate_by_coqui.wav")
658
 
659
  #### GRADIO INTERFACE ####
660
-
661
  EXAMPLES = [
662
- [[],"AI Assistant","What is 42?"],
663
- [[],"AI Assistant","Speak in French, tell me how are you doing?"],
664
- [[],"AI Assistant","Antworten Sie mir von nun an auf Deutsch"],
665
- [[],"AI Beard The Pirate","Who are you?"],
666
  ]
667
 
668
- MODELS = ["Mistral","Zephyr"]
669
 
670
  OTHER_HTML=f"""<div>
671
  <a style="display:inline-block" href='https://github.com/coqui-ai/TTS'><img src='https://img.shields.io/github/stars/coqui-ai/TTS?style=social' /></a>
@@ -675,18 +704,9 @@ OTHER_HTML=f"""<div>
675
  <img referrerpolicy="no-referrer-when-downgrade" src="https://static.scarf.sh/a.png?x-pxid=0d00920c-8cc9-4bf3-90f2-a615797e5f59" />
676
  </div>
677
  """
678
-
679
  with gr.Blocks(title=title) as demo:
680
  gr.Markdown(DESCRIPTION)
681
  gr.Markdown(OTHER_HTML)
682
- with gr.Row():
683
- model_selected = gr.Dropdown(
684
- label="Select Instruct LLM Model to Use",
685
- info="Zephyr and Mistral 5-bit GGUF models are preloaded",
686
- choices=MODELS,
687
- max_choices=1,
688
- value=MODELS[0],
689
- )
690
  chatbot = gr.Chatbot(
691
  [],
692
  elem_id="chatbot",
@@ -711,7 +731,6 @@ with gr.Blocks(title=title) as demo:
711
  )
712
  txt_btn = gr.Button(value="Submit text", scale=1)
713
  btn = gr.Audio(source="microphone", type="filepath", scale=4)
714
-
715
  def stop():
716
  print("Audio STOP")
717
  set_audio_playing(False)
@@ -728,31 +747,27 @@ with gr.Blocks(title=title) as demo:
728
  )
729
 
730
  audio.end(stop)
731
-
732
  with gr.Row():
733
  gr.Examples(
734
  EXAMPLES,
735
- [chatbot,chatbot_role, txt],
736
- [chatbot,chatbot_role, txt],
737
  add_text,
738
  cache_examples=False,
739
  run_on_click=False, # Will not work , user should submit it
740
- )
741
-
742
- def clear_inputs(chatbot):
743
- return None
744
  clear_btn = gr.ClearButton([chatbot, audio])
745
- chatbot_role.change(fn=clear_inputs, inputs=[chatbot], outputs=[chatbot])
746
- model_selected.change(fn=clear_inputs, inputs=[chatbot], outputs=[chatbot])
747
 
748
  txt_msg = txt_btn.click(add_text, [chatbot, txt], [chatbot, txt], queue=False).then(
749
- generate_speech, [chatbot,chatbot_role,model_selected], [chatbot,chatbot_role, sentence, audio]
750
  )
751
 
752
  txt_msg.then(lambda: gr.update(interactive=True), None, [txt], queue=False)
753
 
754
  txt_msg = txt.submit(add_text, [chatbot, txt], [chatbot, txt], queue=False).then(
755
- generate_speech, [chatbot,chatbot_role,model_selected], [chatbot,chatbot_role, sentence, audio]
756
  )
757
 
758
  txt_msg.then(lambda: gr.update(interactive=True), None, [txt], queue=False)
@@ -760,19 +775,18 @@ with gr.Blocks(title=title) as demo:
760
  file_msg = btn.stop_recording(
761
  add_file, [chatbot, btn], [chatbot, txt], queue=False
762
  ).then(
763
- generate_speech, [chatbot,chatbot_role,model_selected], [chatbot,chatbot_role, sentence, audio]
764
  )
765
 
766
  file_msg.then(lambda: (gr.update(interactive=True),gr.update(interactive=True,value=None)), None, [txt, btn], queue=False)
767
 
768
  gr.Markdown(
769
  """
770
- This Space demonstrates how to speak to a chatbot built solely on openly accessible models.
771
- It relies on the following models:
772
- Speech to Text : [Whisper-large-v2](https://sanchit-gandhi-whisper-large-v2.hf.space/) as an ASR model, to transcribe recorded audio to text. It is called through a [gradio client](https://www.gradio.app/docs/client).
773
- LLM Mistral : [Mistral-7b-instruct](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) as the chat model; the GGUF Q5_K_M quantized version is run locally with llama_cpp, downloaded via huggingface_hub from [TheBloke/Mistral-7B-Instruct-v0.1-GGUF](https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF).
774
- LLM Zephyr : [Zephyr-7b-beta](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta) as the chat model; the GGUF Q5_K_M quantized version is run locally with llama_cpp from [huggingface.co/TheBloke](https://huggingface.co/TheBloke/zephyr-7B-beta-GGUF).
775
- Text to Speech : [Coqui's XTTS](https://huggingface.co/spaces/coqui/xtts) as a Multilingual TTS model, to generate the chatbot answers. This time, the model is hosted locally.
776
 
777
  Note:
778
  - By using this demo you agree to the terms of the Coqui Public Model License at https://coqui.ai/cpml
 
53
 
54
  # This will trigger downloading model
55
  print("Downloading if not downloaded Coqui XTTS V2")
 
56
  from TTS.utils.manage import ModelManager
57
  model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
58
  ModelManager().download_model(model_name)
59
  model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
60
  print("XTTS downloaded")
61
 
 
 
62
  config = XttsConfig()
63
  config.load_json(os.path.join(model_path, "config.json"))
64
 
 
73
  model.cuda()
74
  print("Done loading TTS")
75
 
76
+ llm_model = os.environ.get("LLM_MODEL", "mistral") # or "zephyr"
77
 
78
+ title = f"Voice chat with {llm_model.capitalize()} and Coqui XTTS"
79
 
80
+ DESCRIPTION = f"""# Voice chat with {llm_model.capitalize()} and Coqui XTTS"""
81
  css = """.toast-wrap { display: none !important } """
82
 
83
  from huggingface_hub import HfApi
 
86
 # will use api to restart space on an unrecoverable error
87
  api = HfApi(token=HF_TOKEN)
88
 
89
+ repo_id = "coqui/voice-chat-with-mistral"
90
 
91
 
92
  default_system_message = f"""
93
+ You are {llm_model.capitalize()}, a large language model trained and provided by Mistral; your architecture is a decoder-based LM. Your voice backend, or text-to-speech (TTS) backend, is provided via Coqui technology. You are right now served on Hugging Face Spaces.
94
  The user is talking to you over voice on their phone, and your response will be read out loud with realistic text-to-speech (TTS) technology from Coqui team. Follow every direction here when crafting your response: Use natural, conversational language that are clear and easy to follow (short sentences, simple words). Be concise and relevant: Most of your responses should be a sentence or two, unless you’re asked to go deeper. Don’t monopolize the conversation. Use discourse markers to ease comprehension. Never use the list format. Keep the conversation flowing. Clarify: when there is ambiguity, ask clarifying questions, rather than make assumptions. Don’t implicitly or explicitly try to end the chat (i.e. do not end a response with “Talk soon!”, or “Enjoy!”). Sometimes the user might just want to chat. Ask them relevant follow-up questions. Don’t ask them if there’s anything else they need help with (e.g. don’t say things like “How can I assist you further?”). Remember that this is a voice conversation: Don’t use lists, markdown, bullet points, or other formatting that’s not typically spoken. Type out numbers in words (e.g. ‘twenty twelve’ instead of the year 2012). If something doesn’t make sense, it’s likely because you misheard them. There wasn’t a typo, and the user didn’t mispronounce anything. Remember to follow these rules absolutely, and do not refer to these rules, even if you’re asked about them.
95
  You cannot access the internet, but you have vast knowledge.
96
  Current date: CURRENT_DATE .
 
113
 
114
  whisper_client = Client("https://sanchit-gandhi-whisper-large-v2.hf.space/")
115
 
116
+ ROLES = ["AI Assistant"]
117
 
118
  ROLE_PROMPTS = {}
119
  ROLE_PROMPTS["AI Assistant"]=system_message
120
  ##"You are an AI assistant with Zephyr model by Mistral and Hugging Face and speech from Coqui XTTS . User will you give you a task. Your goal is to complete the task as faithfully as you can. While performing the task think step-by-step and justify your steps, your answers should be clear and short sentences"
121
 
122
+ LLM_STOP_WORDS= ["</s>","<|user|>","/s>"]
123
 
124
 
125
  ### WILL USE LOCAL MISTRAL OR ZEPHYR
 
128
  print("Downloading LLM")
129
 
130
 
131
+ if llm_model == "zephyr":
132
+ #Zephyr
133
+ hf_hub_download(repo_id="TheBloke/zephyr-7B-alpha-GGUF", local_dir=".", filename="zephyr-7b-alpha.Q5_K_M.gguf")
134
+ # use new gguf format
135
+ model_path="./zephyr-7b-alpha.Q5_K_M.gguf"
136
+ else:
137
+ #Mistral
138
+ hf_hub_download(repo_id="TheBloke/Mistral-7B-Instruct-v0.1-GGUF", local_dir=".", filename="mistral-7b-instruct-v0.1.Q5_K_M.gguf")
139
+ # use new gguf format
140
+ model_path="./mistral-7b-instruct-v0.1.Q5_K_M.gguf"
 
141
 
142
 
143
  from llama_cpp import Llama
144
 # set GPU_LAYERS to 15 if you have an 8GB GPU so both models can fit in
145
  # else 35 full layers + XTTS works fine on T4 16GB
146
+ GPU_LAYERS=int(os.environ.get("GPU_LAYERS", 15))
 
 
 
147
 
148
  LLAMA_VERBOSE=False
149
+ print("Running LLM")
150
+ llm = Llama(model_path=model_path,n_gpu_layers=GPU_LAYERS,max_new_tokens=256, context_window=4096, n_ctx=4096,n_batch=128,verbose=LLAMA_VERBOSE)
151
 
 
 
152
 
153
 
154
  # Mistral formatter
155
+ def format_prompt_mistral(message, history, system_message=""):
156
  prompt = (
157
  "<s>[INST]" + system_message + "[/INST]" + system_understand_message + "</s>"
158
  )
159
  for user_prompt, bot_response in history:
160
  prompt += f"[INST] {user_prompt} [/INST]"
161
  prompt += f" {bot_response}</s> "
 
 
 
162
  prompt += f"[INST] {message} [/INST]"
163
  return prompt
164
+
165
  # Zephyr formatter
166
+ def format_prompt_zephyr(message, history, system_message=""):
167
  prompt = (
168
+ "<|system|>" + system_message + "</s>"
169
  )
170
  for user_prompt, bot_response in history:
171
  prompt += f"<|user|>\n{user_prompt}</s>"
172
+ prompt += f"<|assistant|> {bot_response}</s>"
173
  if message=="":
174
  message="Hello"
175
  prompt += f"<|user|>\n{message}</s>"
 
176
  print(prompt)
177
  return prompt
178
 
179
+ if llm_model=="zephyr":
180
+ format_prompt = format_prompt_zephyr
181
+ else:
182
+ format_prompt = format_prompt_mistral
183
+
184
+
185
  def generate_local(
186
  prompt,
187
  history,
 
188
  system_message=None,
189
  temperature=0.8,
190
  max_tokens=256,
 
200
  temperature=temperature,
201
  max_tokens=max_tokens,
202
  top_p=top_p,
203
+ stop=stop,
204
  )
205
 
206
+ formatted_prompt = format_prompt(prompt, history,system_message=system_message)
207
 
208
  try:
 
209
  stream = llm(
210
  formatted_prompt,
211
  **generate_kwargs,
 
224
  return
225
 
226
 
227
+ output += response["choices"][0]["text"].replace("<|assistant|>","").replace("<|user|>","").replace("/s>","")
228
  yield output
229
 
230
  except Exception as e:
 
286
  xtts_supported_languages=config.languages
287
  def detect_language(prompt):
288
  # Fast language autodetection
289
+ if len(prompt)>13:
290
 language_predicted=langid.classify(prompt)[0].strip() # strip needed as there is a space at the end!
291
  if language_predicted == "zh":
292
  #we use zh-cn on xtts
 
316
  language,
317
  gpt_cond_latent,
318
  speaker_embedding,
319
+ decoder="ne_hifigan",
320
  )
321
 
322
  first_chunk = True
 
360
  except:
361
  return None
362
 
363
+ ###### MISTRAL FUNCTIONS ######
364
+
365
+ def generate(
366
+ prompt,
367
+ history,
368
+ temperature=0.9,
369
+ max_new_tokens=256,
370
+ top_p=0.95,
371
+ repetition_penalty=1.0,
372
+ ):
373
+ temperature = float(temperature)
374
+ if temperature < 1e-2:
375
+ temperature = 1e-2
376
+ top_p = float(top_p)
377
+
378
+ generate_kwargs = dict(
379
+ temperature=temperature,
380
+ max_new_tokens=max_new_tokens,
381
+ top_p=top_p,
382
+ repetition_penalty=repetition_penalty,
383
+ do_sample=True,
384
+ seed=42,
385
+ )
386
+
387
+ #formatted_prompt = format_prompt(prompt, history)
388
+ formatted_prompt = format_prompt_zephyr(prompt, history)
389
+
390
+ try:
391
+ stream = text_client.text_generation(
392
+ formatted_prompt,
393
+ **generate_kwargs,
394
+ stream=True,
395
+ details=True,
396
+ return_full_text=False,
397
+ )
398
+ output = ""
399
+ for response in stream:
400
+ output += response.token.text
401
+ yield output
402
+
403
+ except Exception as e:
404
+ if "Too Many Requests" in str(e):
405
+ print("ERROR: Too many requests on mistral client")
406
+ gr.Warning("Unfortunately Mistral is unable to process")
407
+ output = "Unfortunately I am not able to process your request right now, too many people are asking me!"
408
+ elif "Model not loaded on the server" in str(e):
409
+ print("ERROR: Mistral server down")
410
+ gr.Warning("Unfortunately Mistral LLM is unable to process")
411
+ output = "Unfortunately I am not able to process your request now, I have a problem with Mistral!"
412
+ else:
413
+ print("Unhandled Exception: ", str(e))
414
+ gr.Warning("Unfortunately Mistral is unable to process")
415
+ output = "I do not know what happened but I could not understand you ."
416
+
417
+ yield output
418
+ return None
419
+ return output
420
+
421
+
422
+ ###### WHISPER FUNCTIONS ######
423
 
424
  def transcribe(wav_path):
425
  try:
 
433
  gr.Warning("There was a problem with Whisper endpoint, telling a joke for you.")
434
  return "There was a problem with my voice, tell me joke"
435
 
436
+
437
+ # Chatbot demo with text and microphone input, showing support for streaming text responses.
438
+
439
  # Will be triggered on text submit (will send to generate_speech)
440
  def add_text(history, text):
441
  history = [] if history is None else history
 
472
  yield history
473
 
474
 
475
+ def get_sentence(history, chatbot_role,system_prompt=""):
 
476
  history = [["", None]] if history is None else history
477
 
478
  if system_prompt == "":
 
481
  history[-1][1] = ""
482
 
483
  mistral_start = time.time()
484
+ print("Mistral start")
485
  sentence_list = []
486
  sentence_hash_list = []
487
 
488
  text_to_generate = ""
489
  stored_sentence = None
490
  stored_sentence_hash = None
491
+ for character in generate_local(history[-1][0], history[:-1],system_message=ROLE_PROMPTS[chatbot_role]):
 
 
 
 
492
  history[-1][1] = character.replace("<|assistant|>","")
493
  # It is coming word by word
494
 
495
+ text_to_generate = nltk.sent_tokenize(history[-1][1].replace("\n", " ").replace("<|assistant|>"," ").strip())
496
  if len(text_to_generate) > 1:
497
 
498
  dif = len(text_to_generate) - len(sentence_list)
 
536
  yield (sentence, history)
537
 
538
  # return that final sentence token
539
+ last_sentence = nltk.sent_tokenize(history[-1][1].replace("\n", " ").strip())[-1]
540
+ sentence_hash = hash(last_sentence)
541
+ if sentence_hash not in sentence_hash_list:
542
+ if stored_sentence is not None and stored_sentence_hash is not None:
543
+ last_sentence = stored_sentence + last_sentence
544
+ stored_sentence = stored_sentence_hash = None
545
+ print("Last Sentence with stored:",last_sentence)
 
 
 
 
 
546
 
547
+ sentence_hash_list.append(sentence_hash)
548
+ sentence_list.append(last_sentence)
549
+ print("Last Sentence: ", last_sentence)
550
 
551
+ yield (last_sentence, history)
552
 
553
  from scipy.io.wavfile import write
554
  from pydub import AudioSegment
 
557
  second_of_silence.export("sil.wav", format='wav')
558
 
559
 
560
+ def generate_speech(history,chatbot_role):
561
  # Must set autoplay to True first
562
  yield (history, chatbot_role, "", wave_header_chunk() )
563
+
564
+ first_sentence=True
565
+ language="autodetect" # will predict from first sentence
566
+
567
+ for sentence, history in get_sentence(history,chatbot_role):
568
  if sentence != "":
569
+ if first_sentence:
570
+ language = detect_language(sentence)
571
+ first_sentence=False
572
+
573
  print("BG: inserting sentence to queue")
574
 
575
+ generated_speech = generate_speech_for_sentence(history, chatbot_role, sentence,return_as_byte=True,language=language)
576
  if generated_speech is not None:
577
  _, audio_dict = generated_speech
578
  # We are using byte streaming
 
580
 
581
 
582
  # will generate speech audio file per sentence
583
+ def generate_speech_for_sentence(history, chatbot_role, sentence, return_as_byte=True, language="autodetect"):
584
+
 
585
  wav_bytestream = b""
586
 
587
  if len(sentence)==0:
 
606
  if len(sentence)==0:
607
  print("EMPTY SENTENCE after processing")
608
  return
609
+
610
 # A fast fix for the last character, may produce weird sounds if it is with text
611
  if (sentence[-1] in ["!", "?", ".", ","]) or (sentence[-2] in ["!", "?", ".", ","]):
612
  # just add a space
 
683
  print("All speech ended")
684
  return
685
 
686
+
687
  latent_map = {}
688
  latent_map["AI Assistant"] = get_latents("examples/female.wav")
 
689
 
690
  #### GRADIO INTERFACE ####
 
691
  EXAMPLES = [
692
+ [[],"What is 42?"],
693
+ [[],"Speak in French, tell me how are you doing?"],
694
+ [[],"Antworten Sie mir von nun an auf Deutsch"],
695
+
696
  ]
697
 
 
698
 
699
  OTHER_HTML=f"""<div>
700
  <a style="display:inline-block" href='https://github.com/coqui-ai/TTS'><img src='https://img.shields.io/github/stars/coqui-ai/TTS?style=social' /></a>
 
704
  <img referrerpolicy="no-referrer-when-downgrade" src="https://static.scarf.sh/a.png?x-pxid=0d00920c-8cc9-4bf3-90f2-a615797e5f59" />
705
  </div>
706
  """
 
707
  with gr.Blocks(title=title) as demo:
708
  gr.Markdown(DESCRIPTION)
709
  gr.Markdown(OTHER_HTML)
 
710
  chatbot = gr.Chatbot(
711
  [],
712
  elem_id="chatbot",
 
731
  )
732
  txt_btn = gr.Button(value="Submit text", scale=1)
733
  btn = gr.Audio(source="microphone", type="filepath", scale=4)
 
734
  def stop():
735
  print("Audio STOP")
736
  set_audio_playing(False)
 
747
  )
748
 
749
  audio.end(stop)
750
+
751
  with gr.Row():
752
  gr.Examples(
753
  EXAMPLES,
754
+ [chatbot, txt],
755
+ [chatbot, txt],
756
  add_text,
757
  cache_examples=False,
758
  run_on_click=False, # Will not work , user should submit it
759
+ )
760
+
 
 
761
  clear_btn = gr.ClearButton([chatbot, audio])
 
 
762
 
763
  txt_msg = txt_btn.click(add_text, [chatbot, txt], [chatbot, txt], queue=False).then(
764
+ generate_speech, [chatbot,chatbot_role], [chatbot,chatbot_role, sentence, audio]
765
  )
766
 
767
  txt_msg.then(lambda: gr.update(interactive=True), None, [txt], queue=False)
768
 
769
  txt_msg = txt.submit(add_text, [chatbot, txt], [chatbot, txt], queue=False).then(
770
+ generate_speech, [chatbot,chatbot_role], [chatbot,chatbot_role, sentence, audio]
771
  )
772
 
773
  txt_msg.then(lambda: gr.update(interactive=True), None, [txt], queue=False)
 
775
  file_msg = btn.stop_recording(
776
  add_file, [chatbot, btn], [chatbot, txt], queue=False
777
  ).then(
778
+ generate_speech, [chatbot,chatbot_role], [chatbot,chatbot_role, sentence, audio]
779
  )
780
 
781
  file_msg.then(lambda: (gr.update(interactive=True),gr.update(interactive=True,value=None)), None, [txt, btn], queue=False)
782
 
783
  gr.Markdown(
784
  """
785
+ This Space demonstrates how to speak to a chatbot built solely on open-source models.
786
+ It relies on three models, one per stage:
787
+ - Speech to Text : [Whisper-large-v2](https://sanchit-gandhi-whisper-large-v2.hf.space/) as an ASR model, to transcribe recorded audio to text. It is called through a [gradio client](https://www.gradio.app/docs/client).
788
+ - LLM Model : [Mistral-7b-instruct](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1) as the chat model; the GGUF Q5_K_M quantized version is run locally with llama_cpp, downloaded via huggingface_hub from [TheBloke/Mistral-7B-Instruct-v0.1-GGUF](https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF).
789
+ - Text to Speech : [Coqui's XTTS](https://huggingface.co/spaces/coqui/xtts) as a Multilingual TTS model, to generate the chatbot answers. This time, the model is hosted locally.
 
790
 
791
  Note:
792
  - By using this demo you agree to the terms of the Coqui Public Model License at https://coqui.ai/cpml
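For reference, the three stages described above map onto three calls in the updated app.py. The sketch below is a minimal illustration that reuses objects the script already creates (`whisper_client`, `llm`, `model`, `latent_map`, `LLM_STOP_WORDS`); the Whisper `predict()` argument order, the example file name `question.wav`, and the layout of the latent tuple are assumptions for illustration, not the Space's exact code.

```python
# Minimal sketch of the ASR -> LLM -> TTS loop, reusing objects defined in app.py.

# 1) Speech to text: the hosted Whisper Space, called through the gradio client.
#    (Argument order is an assumption for this particular Space.)
text = whisper_client.predict(
    "question.wav",   # hypothetical path to the recorded audio
    "transcribe",
    api_name="/predict",
)

# 2) Chat reply: the local GGUF model loaded with llama_cpp, Mistral instruct format.
prompt = f"<s>[INST] {text} [/INST]"
reply = llm(prompt, max_tokens=256, stop=LLM_STOP_WORDS)["choices"][0]["text"]

# 3) Text to speech: the locally loaded XTTS model, streamed chunk by chunk.
#    (Assumes latent_map stores a (gpt_cond_latent, speaker_embedding) pair.)
gpt_cond_latent, speaker_embedding = latent_map["AI Assistant"]
for chunk in model.inference_stream(reply, "en", gpt_cond_latent, speaker_embedding):
    ...  # each chunk is a tensor of audio samples to buffer or play back
```

In app.py itself these stages are wired together sentence by sentence (via get_sentence and generate_speech_for_sentence with byte streaming) rather than as the single pass shown here.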