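# Gradio demo: grammar correction with pszemraj/flan-t5-large-grammar-synthesis (GGUF),
# served through the T5 branch of llama-cpp-python.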
# Importing required libraries
import warnings
warnings.filterwarnings("ignore")
import os
import json
import subprocess
import sys
from llama_cpp import Llama, llama_model_decoder_start_token
from llama_cpp_agent import LlamaCppAgent
from llama_cpp_agent import MessagesFormatterType
from llama_cpp_agent.providers import LlamaCppPythonProvider
from llama_cpp_agent.chat_history import BasicChatHistory
from llama_cpp_agent.chat_history.messages import Roles
from llama_cpp_agent.messages_formatter import MessagesFormatter, PromptMarkers
import gradio as gr
from huggingface_hub import hf_hub_download
from typing import List, Tuple
from logger import logging
from exception import CustomExceptionHandling
# Download gguf model files
huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
os.makedirs("models", exist_ok=True)
hf_hub_download(
    repo_id="pszemraj/flan-t5-large-grammar-synthesis",
    filename="ggml-model-Q6_K.gguf",
    local_dir="./models",
    token=huggingface_token,  # only needed if the repo requires authentication
)
# Set the title and description
title = "flan-t5-large-grammar-synthesis Llama.cpp"
description = """
I'm using [fairydreaming/T5-branch](https://github.com/fairydreaming/llama-cpp-python/tree/t5), I'm not sure current llama-cpp-python server support t5
[Model-Q6_K-GGUF](https://huggingface.co/pszemraj/flan-t5-large-grammar-synthesis-gguf), [Reference1](https://huggingface.co/spaces/sitammeur/Gemma-llamacpp)
"""
llama = None
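# respond() streams a corrected rewrite of the user message: the input is run through
# the T5 encoder, then output tokens are sampled from the decoder one at a time.
# This relies on the encoder/decoder API (encode, decoder_start_token) available in
# the fairydreaming T5 branch of llama-cpp-python, not necessarily in mainline releases.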
def respond(
message: str,
history: List[Tuple[str, str]],
model: str,
system_message: str,
max_tokens: int,
temperature: float,
top_p: float,
top_k: int,
repeat_penalty: float,
):
"""
Respond to a message using the Gemma3 model via Llama.cpp.
Args:
- message (str): The message to respond to.
- history (List[Tuple[str, str]]): The chat history.
- model (str): The model to use.
- system_message (str): The system message to use.
- max_tokens (int): The maximum number of tokens to generate.
- temperature (float): The temperature of the model.
- top_p (float): The top-p of the model.
- top_k (int): The top-k of the model.
- repeat_penalty (float): The repetition penalty of the model.
Returns:
str: The response to the message.
"""
    if model is None:
        return
    try:
        global llama
        if llama is None:
            model_id = "ggml-model-Q6_K.gguf"
            # Load the GGUF model lazily on the first request (CPU only).
            llama = Llama(
                f"models/{model_id}",
                flash_attn=False,
                n_gpu_layers=0,
                n_ctx=max_tokens,
                n_threads=2,
                n_threads_batch=2,
                verbose=False,
            )
        # Encoder-decoder generation (uses encode() and decoder_start_token()
        # from the T5 branch).
        tokens = llama.tokenize(f"{message}".encode("utf-8"))
        llama.encode(tokens)
        tokens = [llama.decoder_start_token()]
        outputs = ""
        # Stream decoder tokens one at a time until the end-of-sequence token.
        for token in llama.generate(
            tokens,
            top_k=top_k,
            top_p=top_p,
            temp=temperature,
            repeat_penalty=repeat_penalty,
        ):
            if token == llama.token_eos():
                break
            outputs += llama.detokenize([token]).decode()
            yield outputs
        return outputs
    except Exception as e:
        # Custom exception handling
        raise CustomExceptionHandling(e, sys) from e
# Create a chat interface
demo = gr.ChatInterface(
respond,
examples=[["What are the capital of France?"], ["What real child was raise by wolves?"], ["What am gravity?"]],
additional_inputs_accordion=gr.Accordion(
label="⚙️ Parameters", open=False, render=False
),
additional_inputs=[
gr.Dropdown(
choices=[
"ggml-model-Q6_K.gguf",
],
value="ggml-model-Q6_K.gguf",
label="Model",
info="Select the AI model to use for chat",
visible=False
),
gr.Textbox(
value="You are a helpful assistant.",
label="System Prompt",
info="Define the AI assistant's personality and behavior",
lines=2,visible=False
),
gr.Slider(
minimum=512,
maximum=512,
value=512,
step=1,
label="Max Tokens",
info="Maximum length of response (higher = longer replies)",visible=False
),
gr.Slider(
minimum=0.1,
maximum=2.0,
value=0.4,
step=0.1,
label="Temperature",
info="Creativity level (higher = more creative, lower = more focused)",
),
gr.Slider(
minimum=0.1,
maximum=1.0,
value=0.95,
step=0.05,
label="Top-p",
info="Nucleus sampling threshold",
),
gr.Slider(
minimum=1,
maximum=100,
value=40,
step=1,
label="Top-k",
info="Limit vocabulary choices to top K tokens",
),
gr.Slider(
minimum=1.0,
maximum=2.0,
value=1.1,
step=0.1,
label="Repetition Penalty",
info="Penalize repeated words (higher = less repetition)",
),
],
theme="Ocean",
submit_btn="Send",
stop_btn="Stop",
title=title,
description=description,
chatbot=gr.Chatbot(scale=1, show_copy_button=True),
flagging_mode="never",
)
# Launch the chat interface
if __name__ == "__main__":
demo.launch(debug=False)