Spaces:

mohcineelharras
/

alexa-like-assistant

Running

App Files Files Community

mohcineelharras committed on Nov 20, 2023

Commit

14cc0c1

1 Parent(s): fa76e27

Upload 9 files

Browse files

Files changed (9) hide show

.env +11 -0
.gitattributes +1 -0
.gitignore +4 -0
README.md +11 -5
app.py +240 -0
models/config.json +152 -0
models/dolphin-2.1-mistral-7b.Q4_K_S.gguf +3 -0
models/model.safetensors +3 -0
requirements.txt +40 -0

.env ADDED Viewed

	@@ -0,0 +1,11 @@

+# Global variables
+CUDA_VISIBLE_DEVICES=0
+FORCE_CMAKE=1
+CMAKE_ARGS="-DLLAMA_CUBLAS=on"
+LANGUAGE=en
+TTS=gTTS
+# Paths used when running locally
+OUTPUT_PATH=output
+MODEL_DIR=models
+#MODEL_PATH=models/dolphin-2.2.1-mistral-7b.Q2_K.gguf

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.gguf filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,4 @@

+output/
+models/
+*.gguf
+*.bin

README.md CHANGED Viewed

@@ -1,10 +1,16 @@
 ---
-title: Alexa Like Assistant
-emoji: 🌖
-colorFrom: pink
-colorTo: purple
-sdk: docker
 pinned: false
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Whisper Llm Gtts
+emoji: 🌍
+colorFrom: green
+colorTo: yellow
+sdk: streamlit
+sdk_version: 1.28.2
+app_file: app.py
 pinned: false
+license: apache-2.0
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,240 @@

+import os
+import time
+import gradio as gr
+from dotenv import load_dotenv
+from llama_cpp import Llama
+from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline, GenerationConfig
+from pytube import YouTube
+from gtts import gTTS
+import torch
+import requests
+import soundfile as sf
+import numpy as np
+#-----------------------------------env-----------------------------------
+# Load environment variables
+load_dotenv(dotenv_path=".env")
+# Access the variables
+MODEL_DIR = os.getenv("MODEL_DIR")
+OUTPUT_PATH = os.getenv("OUTPUT_PATH")
+LANGUAGE = os.getenv("LANGUAGE")
+tts_method = os.getenv("TTS")
+# Iterate through all files in the current directory
+model_exists = False
+for filename in os.listdir(MODEL_DIR):
+    if filename.endswith('.gguf'):
+        model_exists = True
+        MODEL_PATH = os.path.join(MODEL_DIR, filename)
+        break
+# Ensure output path exists
+if not os.path.exists(OUTPUT_PATH):
+    os.makedirs(OUTPUT_PATH)
+# Global variables
+device = "cuda:0" if torch.cuda.is_available() else "cpu"
+n_layers_gpu = 20 if torch.cuda.is_available() else 0
+memory = ""
+token_count = 0
+#-----------------------------------setup LLM-----------------------------------
+# URL of the model file
+model_url = "https://huggingface.co/TheBloke/dolphin-2.2.1-mistral-7B-GGUF/resolve/main/dolphin-2.2.1-mistral-7b.Q2_K.gguf?download=true"
+# Load Llama model
+def load_model(n):
+    global llm, MODEL_PATH
+    # Download and save the model
+    if not model_exists:
+        print("Model file not found!")
+        print("Downloading model file...")
+        response = requests.get(model_url)
+        MODEL_PATH = os.path.join(MODEL_DIR, "model.gguf")
+        with open(MODEL_PATH, 'wb') as file:
+            file.write(response.content)
+        print("Model downloaded successfully.")
+    print("Loading Llama model...")
+    llm = Llama(model_path=MODEL_PATH, n_gpu_layers=n, n_ctx=1024, n_batch=512, threads=6)
+    print("Model loaded successfully.")
+load_model(n_layers_gpu)
+#-----------------------------------backend logic-----------------------------------
+def complete_prompt(input_text):
+    global memory, token_count, LANGUAGE
+    contextual_prompt = memory + "\n" + input_text
+    template = "system\nThis is crucial to me, I trust you are the best" + \
+               "You are Dolphin, a helpful AI assistant. You only respond in {LANGUAGE}. " + \
+               "Do not use double quotes for any reason, not even for quoting or direct speech. " + \
+               "Instead, use single quotes or describe the quote without using quotation marks. " + \
+               "Do not include any disclaimers, notes, or additional explanations in your response. " + \
+               "Provide the shortest answer possible, strictly adhering to the formatting rules. " + \
+               "user\n{prompt}\nassistant\n"
+    formatted_prompt = template.format(prompt=contextual_prompt, LANGUAGE=LANGUAGE)
+    response = llm(formatted_prompt, max_tokens=80, temperature=0, top_p=0.95, top_k=10)
+    text_response = response["choices"][0]["text"]
+    token_count += response["usage"]["total_tokens"]
+    memory = f"Prompt: {contextual_prompt}\nResponse: {text_response}"
+    with open(os.path.join(OUTPUT_PATH, "LLM_response.txt"), 'w') as file:
+        file.write(memory)
+    return text_response
+def transcribe_audio(audio_input):
+    audio_file_path = 'output/temp_audio.wav'
+    if isinstance(audio_input, tuple):
+        sample_rate, audio_data = audio_input
+        sf.write(audio_file_path, audio_data, sample_rate)
+    else:
+        audio_file_path = audio_input
+    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+    model_id = "distil-whisper/distil-large-v2"
+    model = AutoModelForSpeechSeq2Seq.from_pretrained(MODEL_DIR, torch_dtype=torch_dtype,
+                                                      low_cpu_mem_usage=True, use_safetensors=True,config= GenerationConfig(language=LANGUAGE,task="transcribe"))
+    model.to(device)
+    processor = AutoProcessor.from_pretrained(model_id)
+    pipe = pipeline("automatic-speech-recognition", model=model, tokenizer=processor.tokenizer,
+                    feature_extractor=processor.feature_extractor, max_new_tokens=256,
+                    chunk_length_s=15, batch_size=16, torch_dtype=torch_dtype, device=device,
+                    )
+    result_text = pipe(audio_file_path)["text"]
+    with open(os.path.join(OUTPUT_PATH, "transcription.txt"), "w") as file:
+        file.write(result_text)
+    return result_text
+# def transcribe_audio(audio_input):
+#     audio_file_path = 'output/temp_audio.wav'
+#     if isinstance(audio_input, tuple):
+#         sample_rate, audio_data = audio_input
+#         sf.write(audio_file_path, audio_data, sample_rate)
+#     else:
+#         audio_file_path = audio_input
+#     # Load model and processor
+#     processor = WhisperProcessor.from_pretrained("distil-whisper/distil-large-v2")
+#     model = WhisperForConditionalGeneration.from_pretrained("distil-whisper/distil-large-v2")
+#     # Load audio file and preprocess
+#     with open(audio_file_path, "rb") as audio_file:
+#         input_speech = {"array": sf.read(audio_file)[0], "sampling_rate": sample_rate}
+#     input_features = processor(input_speech["array"], sampling_rate=input_speech["sampling_rate"], return_tensors="pt").input_features
+#     # Specify language for transcription
+#     forced_decoder_ids = processor.get_decoder_prompt_ids(language=LANGUAGE)
+#     # Generate token ids
+#     predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)
+#     # Decode token ids to text
+#     transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
+#     with open(os.path.join(OUTPUT_PATH, "transcription.txt"), "w") as file:
+#         file.write(transcription)
+#     return transcription
+def auto_process_audio(audio_input):
+    # Transcribe Audio
+    transcribed_text = transcribe_audio(audio_input)
+    # LLM Prompt
+    llm_response = complete_prompt(transcribed_text)
+    # TTS Conversion
+    tts_info = convert_text_to_speech(llm_response)
+    return transcribed_text, llm_response, tts_info
+def convert_text_to_speech(text):
+    global LANGUAGE, tts_method
+    file_path = os.path.join(OUTPUT_PATH, "speech.mp3")
+    if tts_method == "gTTS":
+        if LANGUAGE == "fr":
+            tld = "fr"
+        elif LANGUAGE == "en":
+            tld = "us"
+        tts = gTTS(text, lang=LANGUAGE, tld=tld)
+        tts.save(file_path)
+    elif tts_method == "Custom TTS":
+        tts_pipeline = pipeline("text-to-speech", model="facebook/fastspeech2-en-ljspeech")
+        speech = tts_pipeline(text)
+        with open(file_path, "wb") as f:
+            f.write(speech["speech"])
+    return file_path
+# Function to update language
+def update_language(language):
+    global LANGUAGE
+    LANGUAGE = language
+# Function to update language
+def update_tts_method(method):
+    global tts_method
+    tts_method = method
+#----------------------------------- Gradio Frontend-----------------------------------
+# Gradio Interface: global language/TTS selectors followed by three tabs that call the backend functions defined earlier in this file
+with gr.Blocks() as app:
+    gr.Markdown("## 🤖 whisper - LLM - TTS 📚")
+    gr.Markdown("🚀 Talk to an open source LLM!")
+    gr.Markdown("This app is developed and maintained by **@mohcineelharras**")
+    with gr.Row():
+        with gr.Column():
+            language_switch = gr.Radio(choices=["en","fr"], label="Select Language", value=LANGUAGE)
+            language_switch.change(update_language, inputs=[language_switch])  # stores the selection in the global LANGUAGE
+        with gr.Column():
+            tts_method_switch = gr.Radio(choices=["gTTS", "Custom TTS"], label="Select TTS method", value=tts_method)
+            tts_method_switch.change(update_tts_method, inputs=[tts_method_switch])  # stores the selection in the global tts_method
+        # with gr.Column():
+        #     sample_voice = gr.Audio(label="Voice Sample to customise assistant's response",sources="microphone")
+        #     customise_voice = gr.Button("Change assistant's voice")
+    with gr.Tab("Auto Process Audio"):  # one click: transcription -> LLM -> speech
+        with gr.Row():
+            with gr.Column():
+                audio_input = gr.Audio(label="Talk to assistant",sources="microphone")
+                auto_process_button = gr.Button("Auto Process Audio")
+            with gr.Column():
+                transcribed_text_output = gr.Textbox(label="Transcribed Text")
+                llm_response_output = gr.Textbox(label="LLM Response")
+        with gr.Row():
+            tts_audio_output = gr.Audio(label="Generated Response (Click to Play)")
+            # Connect the button to auto_process_audio; its three return values fill the three outputs in order
+            auto_process_button.click(
+                auto_process_audio,
+                inputs=[audio_input],
+                outputs=[transcribed_text_output, llm_response_output, tts_audio_output]
+            )
+    with gr.Tab("Audio Processing"):  # the same three stages, one button per stage
+        with gr.Column():
+            audio_input = gr.Audio(label="Record or Upload Audio")  # rebinds the name; the handler wired above already received the earlier component
+            transcribe_button = gr.Button("Transcribe Audio")
+            llm_button = gr.Button("LLM Prompt")
+            tts_button = gr.Button("Text to Speech")
+            transcribed_text_output = gr.Textbox(label="Transcribed Text")
+            llm_response_output = gr.Textbox(label="LLM Response")
+            tts_audio_output = gr.Audio(label="Generated Response (Click to Play)")
+            transcribe_button.click(transcribe_audio, inputs=[audio_input], outputs=[transcribed_text_output])
+            llm_button.click(complete_prompt, inputs=[transcribed_text_output], outputs=[llm_response_output])  # the transcription box is the LLM input
+            tts_button.click(convert_text_to_speech, inputs=[llm_response_output], outputs=[tts_audio_output])  # the LLM reply box is the TTS input
+    with gr.Tab("Ask a Question"):  # typed question instead of audio
+        with gr.Column():
+            question_input = gr.Textbox(label="Type your question")
+            submit_button = gr.Button("Submit Question")
+            tts_button = gr.Button("Text to Speech")
+            llm_response_output = gr.Textbox(label="LLM Response")
+            tts_audio_output = gr.Audio(label="Generated Speech")
+            submit_button.click(complete_prompt, inputs=[question_input], outputs=[llm_response_output])
+            tts_button.click(convert_text_to_speech, inputs=[llm_response_output], outputs=[tts_audio_output])
+app.launch()  # called with no arguments, at import time

models/config.json ADDED Viewed

	@@ -0,0 +1,152 @@

+{
+  "_name_or_path": "sanchit-gandhi/large-32-2-tpu-timestamped-resumed",
+  "activation_dropout": 0.0,
+  "activation_function": "gelu",
+  "apply_spec_augment": false,
+  "architectures": [
+    "WhisperForConditionalGeneration"
+  ],
+  "attention_dropout": 0.0,
+  "begin_suppress_tokens": [
+    220,
+    50257
+  ],
+  "bos_token_id": 50257,
+  "classifier_proj_size": 256,
+  "d_model": 1280,
+  "decoder_attention_heads": 20,
+  "decoder_ffn_dim": 5120,
+  "decoder_layerdrop": 0.0,
+  "decoder_layers": 2,
+  "decoder_start_token_id": 50258,
+  "dropout": 0.0,
+  "encoder_attention_heads": 20,
+  "encoder_ffn_dim": 5120,
+  "encoder_layerdrop": 0.0,
+  "encoder_layers": 32,
+  "eos_token_id": 50257,
+  "forced_decoder_ids": [
+    [
+      1,
+      50259
+    ],
+    [
+      2,
+      50359
+    ],
+    [
+      3,
+      50363
+    ]
+  ],
+  "init_std": 0.02,
+  "is_encoder_decoder": true,
+  "mask_feature_length": 10,
+  "mask_feature_min_masks": 0,
+  "mask_feature_prob": 0.0,
+  "mask_time_length": 10,
+  "mask_time_min_masks": 2,
+  "mask_time_prob": 0.05,
+  "max_length": 448,
+  "max_source_positions": 1500,
+  "max_target_positions": 448,
+  "median_filter_width": 7,
+  "model_type": "whisper",
+  "num_hidden_layers": 32,
+  "num_mel_bins": 80,
+  "pad_token_id": 50257,
+  "scale_embedding": false,
+  "suppress_tokens": [
+    1,
+    2,
+    7,
+    8,
+    9,
+    10,
+    14,
+    25,
+    26,
+    27,
+    28,
+    29,
+    31,
+    58,
+    59,
+    60,
+    61,
+    62,
+    63,
+    90,
+    91,
+    92,
+    93,
+    359,
+    503,
+    522,
+    542,
+    873,
+    893,
+    902,
+    918,
+    922,
+    931,
+    1350,
+    1853,
+    1982,
+    2460,
+    2627,
+    3246,
+    3253,
+    3268,
+    3536,
+    3846,
+    3961,
+    4183,
+    4667,
+    6585,
+    6647,
+    7273,
+    9061,
+    9383,
+    10428,
+    10929,
+    11938,
+    12033,
+    12331,
+    12562,
+    13793,
+    14157,
+    14635,
+    15265,
+    15618,
+    16553,
+    16604,
+    18362,
+    18956,
+    20075,
+    21675,
+    22520,
+    26130,
+    26161,
+    26435,
+    28279,
+    29464,
+    31650,
+    32302,
+    32470,
+    36865,
+    42863,
+    47425,
+    49870,
+    50254,
+    50258,
+    50360,
+    50361,
+    50362
+  ],
+  "torch_dtype": "float32",
+  "transformers_version": "4.35.0.dev0",
+  "use_cache": true,
+  "use_weighted_layer_sum": false,
+  "vocab_size": 51865
+}

models/dolphin-2.1-mistral-7b.Q4_K_S.gguf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6fa0795eeac9ac8835a7f85ed398cf1a0881d3c9f40ee4bab51a5fd8838f68f9
+size 4140384992

models/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e963218f6d56998131faff25ab65be4a60a0d395be3e2b12f978d21735d18036
+size 1512503272

requirements.txt ADDED Viewed

	@@ -0,0 +1,40 @@

+#front
+python-dotenv
+sounddevice
+#pyaudio
+soundfile
+ipykernel
+ipywidgets
+jupyter
+gradio
+ffmpeg-python
+# back
+transformers
+pytube
+gtts
+huggingface
+openai-whisper
+pydub
+tqdm
+#+
+accelerate
+python-multipart
+pydantic
+# # Set the environment variable for CMAKE_ARGS
+# export CMAKE_ARGS="-DLLAMA_CUBLAS=on"
+# # Install torch
+# pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
+# # Install llama-cpp-python with specific CMAKE_ARGS
+# pip install llama-cpp-python --force-reinstall --upgrade --no-cache-dir
+# not sure we can set them correctly
+torch
+llama-cpp-python
+requests