Mahadih534 committed
Commit 330d308 · verified · 1 Parent(s): a3847ed

Create app.py

Files changed (1)
  1. app.py +135 -0
app.py ADDED
@@ -0,0 +1,135 @@
+ from huggingface_hub import hf_hub_download
+ import torch
+ import gradio as gr
+ from llama_index.llms import LlamaCPP
+ from llama_index.llms.llama_utils import (
+     messages_to_prompt,
+     completion_to_prompt,
+ )
+
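+ # Note: these import paths follow the pre-0.10 llama-index layout; on newer releases the
+ # LlamaCPP integration is expected to ship as the separate llama-index-llms-llama-cpp
+ # package and be imported from llama_index.llms.llama_cpp instead.
+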
+ MODELS_PATH = "./models"
+
+ # Download the default model (Mistral 7B Instruct v0.2, Q4_K_M GGUF) into ./models.
+ mistral_model_path = hf_hub_download(
+     repo_id="TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
+     filename="mistral-7b-instruct-v0.2.Q4_K_M.gguf",
+     resume_download=True,
+     cache_dir=MODELS_PATH,
+ )
+
+ # Step 3: if you run on a GPU, make sure "n_gpu_layers" is at least 1; you can increase or
+ # decrease it based on your GPU's capacity.
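+ # A minimal sketch of that hint (assuming torch, imported above, can probe the device):
+ # the offload count could be derived at runtime instead of hard-coded, e.g.
+ #   n_gpu_layers = -1 if torch.cuda.is_available() else 0
+ # and then passed as model_kwargs={"n_gpu_layers": n_gpu_layers} below.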
+ llm = LlamaCPP(
+     # you can pass in the URL to a GGUF model to download it automatically
+     # model_url=model_url,
+     # optionally, you can set the path to a pre-downloaded model instead of model_url
+     model_path=mistral_model_path,
+     temperature=0.1,
+     max_new_tokens=256,
+     # llama2 has a context window of 4096 tokens, but we set it lower to leave some headroom
+     context_window=3900,
+     # kwargs to pass to __call__()
+     generate_kwargs={},
+     # kwargs to pass to __init__(); set n_gpu_layers to at least 1 to use the GPU
+     # (-1 offloads every layer)
+     model_kwargs={"n_gpu_layers": -1},
+     # transform inputs into Llama 2 chat format
+     messages_to_prompt=messages_to_prompt,
+     completion_to_prompt=completion_to_prompt,
+     verbose=True,
+ )
+
+
+ def model_initialization(model):
+     # Reassign the module-level llm so predict() picks up the newly selected model.
+     global llm
+     if model != "":
+         gr.Info("Model download and configuration has started, please wait...")
+     repo_id = ""
+     filename = ""
+     if model == "Llama-2-13B-chat":
+         repo_id = "TheBloke/Llama-2-13B-chat-GGUF"
+         filename = "llama-2-13b-chat.Q4_K_M.gguf"
+     elif model == "Mistral-7B-Instruct-v0.2":
+         repo_id = "TheBloke/Mistral-7B-Instruct-v0.2-GGUF"
+         filename = "mistral-7b-instruct-v0.2.Q4_K_M.gguf"
+     elif model == "zephyr-7B-beta":
+         repo_id = "TheBloke/zephyr-7B-beta-GGUF"
+         filename = "zephyr-7b-beta.Q4_K_M.gguf"
+     elif model == "vicuna-7B-v1.5":
+         repo_id = "TheBloke/vicuna-7B-v1.5-GGUF"
+         filename = "vicuna-7b-v1.5.Q4_K_M.gguf"
+     elif model == "Falcon-7B-Instruct":
+         # note: this repo ships GGML/GGCC files; recent llama-cpp-python builds load only
+         # GGUF, so this option may fail to load
+         repo_id = "TheBloke/Falcon-7B-Instruct-GGML"
+         filename = "falcon-7b-instruct.ggccv1.q4_1.bin"
+     elif model == "CodeLlama-7B":
+         repo_id = "TheBloke/CodeLlama-7B-GGUF"
+         filename = "codellama-7b.Q4_K_M.gguf"
+     else:
+         gr.Warning("Please select a model")
+         return "No model selected; the current model is unchanged"
+
+     model_path = hf_hub_download(
+         repo_id=repo_id,
+         filename=filename,
+         resume_download=True,
+         cache_dir=MODELS_PATH,
+     )
+
+     llm = LlamaCPP(
+         model_path=model_path,
+         temperature=0.1,
+         max_new_tokens=256,
+         context_window=3900,
+         generate_kwargs={},
+         # set n_gpu_layers to at least 1 to use the GPU (-1 offloads every layer)
+         model_kwargs={"n_gpu_layers": -1},
+         # transform inputs into Llama 2 chat format
+         messages_to_prompt=messages_to_prompt,
+         completion_to_prompt=completion_to_prompt,
+         verbose=True,
+     )
+     gr.Info("The model has been configured and is ready to chat")
+     return "The model has been configured and is ready to chat; your current model is " + model
+
+ def predict(message, history):
+     # Stream the completion token by token and yield the accumulated answer so the
+     # Gradio chat UI updates incrementally. The deltas already contain their own
+     # whitespace, so they are joined without a separator.
+     answer = []
+     response = llm.stream_complete(message)
+     for bot_response in response:
+         token = bot_response.delta
+         answer.append(token)
+         final_answer = "".join(answer)
+         yield final_answer
+
+ with gr.Blocks() as UI:
+     models = gr.Dropdown(
+         ["CodeLlama-7B", "Llama-2-13B-chat", "Falcon-7B-Instruct", "Mistral-7B-Instruct-v0.2",
+          "zephyr-7B-beta", "vicuna-7B-v1.5"],
+         value="Mistral-7B-Instruct-v0.2",
+         label="Please select a model",
+         info="The default model is Mistral-7B-Instruct-v0.2",
+     )
+     textInfo = gr.Textbox(value="The current model is Mistral-7B-Instruct-v0.2", label="Model Status")
+     # Chatbot interface
+     chatUI = gr.ChatInterface(
+         predict,
+         title="Open Source LLM ChatBot",
+         description="Ask any question",
+         theme="soft",
+         examples=["Hello", "Are you an LLM model?", "How can I finetune a pre-trained LLM model?",
+                   "How can I build a chatbot using a local open-source LLM?"],
+         cache_examples=False,
+         submit_btn="Send Message",
+         retry_btn=None,
+         undo_btn="Delete Previous",
+         clear_btn="Clear",
+     )
+
+     # Re-initialize the backend whenever a different model is picked from the dropdown.
+     models.change(fn=model_initialization, inputs=[models], outputs=[textInfo])
+
+ if __name__ == "__main__":
+     UI.launch(debug=True)  # launch the app