# Importing required libraries
import warnings
warnings.filterwarnings("ignore")
import os
import json
import subprocess
import sys
from llama_cpp import Llama, llama_model_decoder_start_token
from llama_cpp_agent import LlamaCppAgent
from llama_cpp_agent import MessagesFormatterType
from llama_cpp_agent.providers import LlamaCppPythonProvider
from llama_cpp_agent.chat_history import BasicChatHistory
from llama_cpp_agent.chat_history.messages import Roles
from llama_cpp_agent.messages_formatter import MessagesFormatter, PromptMarkers
import gradio as gr
from huggingface_hub import hf_hub_download
from typing import List, Tuple
from logger import logging
from exception import CustomExceptionHandling
# Download gguf model files
huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
os.makedirs("models",exist_ok=True)
hf_hub_download(
repo_id="pszemraj/flan-t5-large-grammar-synthesis",
filename="ggml-model-Q6_K.gguf",
local_dir="./models",
)
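# Optional sanity check (illustrative sketch, not part of the original app): confirm the
# GGUF file actually landed in ./models before respond() tries to load it lazily later.
# Assumes the project's logger module exposes the standard logging API.
_model_path = os.path.join("models", "ggml-model-Q6_K.gguf")
if os.path.isfile(_model_path):
    logging.info(f"GGUF model available at {_model_path} ({os.path.getsize(_model_path)} bytes)")
else:
    logging.error(f"Expected GGUF model not found at {_model_path}")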
# Set the title and description
title = "flan-t5-large-grammar-synthesis Llama.cpp"
description = """
I'm using [fairydreaming/T5-branch](https://github.com/fairydreaming/llama-cpp-python/tree/t5); I'm not sure whether the current llama-cpp-python server supports T5.
[Model-Q6_K-GGUF](https://huggingface.co/pszemraj/flan-t5-large-grammar-synthesis-gguf), [Reference1](https://huggingface.co/spaces/sitammeur/Gemma-llamacpp)
"""
llama = None
import ctypes
import multiprocessing
import llama_cpp
def respond(
    message: str,
    history: List[Tuple[str, str]],
    model: str,
    system_message: str,
    max_tokens: int,
    temperature: float,
    top_p: float,
    top_k: int,
    repeat_penalty: float,
):
    """
    Respond to a message using the flan-t5-large-grammar-synthesis model via Llama.cpp.

    Args:
        - message (str): The message to respond to.
        - history (List[Tuple[str, str]]): The chat history.
        - model (str): The model to use.
        - system_message (str): The system message to use.
        - max_tokens (int): The maximum number of tokens to generate.
        - temperature (float): The temperature of the model.
        - top_p (float): The top-p of the model.
        - top_k (int): The top-k of the model.
        - repeat_penalty (float): The repetition penalty of the model.

    Yields:
        str: The response generated so far.
    """
    if model is None:
        return

    try:
        global llama
        if llama is None:
            # Lazily load the GGUF model once and reuse it across requests.
            model_id = "ggml-model-Q6_K.gguf"
            llama = Llama(
                f"models/{model_id}",
                flash_attn=False,
                n_gpu_layers=0,
                n_ctx=max_tokens,
                n_threads=2,
                n_threads_batch=2,
                verbose=False,
            )

        # T5 is an encoder-decoder model: encode the input first, then decode
        # token by token starting from the decoder start token.
        tokens = llama.tokenize(f"{message}".encode("utf-8"))
        llama.encode(tokens)
        tokens = [llama.decoder_start_token()]

        outputs = ""
        iteration = 1
        for i in range(iteration):
            for token in llama.generate(
                tokens,
                top_k=top_k,
                top_p=top_p,
                temp=temperature,
                repeat_penalty=repeat_penalty,
            ):
                outputs += llama.detokenize([token]).decode()
                yield outputs
                if token == llama.token_eos():
                    break
            # outputs += "\n"
        return outputs
    except Exception as e:
        # Custom exception handling
        raise CustomExceptionHandling(e, sys) from e
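# Minimal local smoke test (illustrative sketch, not part of the original Space):
# drive the respond() generator directly, without Gradio, using the same defaults the
# UI exposes below. The sample sentence and the RUN_SMOKE_TEST guard are assumptions.
if os.getenv("RUN_SMOKE_TEST"):
    corrected = ""
    for partial in respond(
        "He go to school yesterday and buyed three apple.",
        history=[],
        model="ggml-model-Q6_K.gguf",
        system_message="You are a helpful assistant.",
        max_tokens=512,
        temperature=0.4,
        top_p=0.95,
        top_k=40,
        repeat_penalty=1.1,
    ):
        corrected = partial
    logging.info(f"Smoke test output: {corrected}")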
# Create a chat interface
demo = gr.ChatInterface(
    respond,
    # Examples are (presumably intentionally) ungrammatical inputs for the
    # grammar-correction model to rewrite.
    examples=[
        ["What are the capital of France?"],
        ["What real child was raise by wolves?"],
        ["What am gravity?"],
    ],
    additional_inputs_accordion=gr.Accordion(
        label="⚙️ Parameters", open=False, render=False
    ),
    additional_inputs=[
        gr.Dropdown(
            choices=[
                "ggml-model-Q6_K.gguf",
            ],
            value="ggml-model-Q6_K.gguf",
            label="Model",
            info="Select the AI model to use for chat",
            visible=False,
        ),
        gr.Textbox(
            value="You are a helpful assistant.",
            label="System Prompt",
            info="Define the AI assistant's personality and behavior",
            lines=2,
            visible=False,
        ),
        gr.Slider(
            minimum=512,
            maximum=512,
            value=512,
            step=1,
            label="Max Tokens",
            info="Maximum length of response (higher = longer replies)",
            visible=False,
        ),
        gr.Slider(
            minimum=0.1,
            maximum=2.0,
            value=0.4,
            step=0.1,
            label="Temperature",
            info="Creativity level (higher = more creative, lower = more focused)",
        ),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p",
            info="Nucleus sampling threshold",
        ),
        gr.Slider(
            minimum=1,
            maximum=100,
            value=40,
            step=1,
            label="Top-k",
            info="Limit vocabulary choices to top K tokens",
        ),
        gr.Slider(
            minimum=1.0,
            maximum=2.0,
            value=1.1,
            step=0.1,
            label="Repetition Penalty",
            info="Penalize repeated words (higher = less repetition)",
        ),
    ],
    theme="Ocean",
    submit_btn="Send",
    stop_btn="Stop",
    title=title,
    description=description,
    chatbot=gr.Chatbot(scale=1, show_copy_button=True),
    flagging_mode="never",
)
# Launch the chat interface
if __name__ == "__main__":
demo.launch(debug=False)
test()
|