# Importing required libraries
import warnings
warnings.filterwarnings("ignore")
import os
import json
import subprocess
import sys
from llama_cpp import Llama, llama_model_decoder_start_token
import gradio as gr
from huggingface_hub import hf_hub_download
from typing import List, Tuple
from logger import logging
from exception import CustomExceptionHandling
# Download the GGUF model file from the Hugging Face Hub
huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
os.makedirs("models", exist_ok=True)
hf_hub_download(
    repo_id="mtsdurica/madlad400-3b-mt-Q8_0-GGUF",
    filename="madlad400-3b-mt-q8_0.gguf",
    local_dir="./models",
    token=huggingface_token,  # optional; may be None for public repos
)
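# The downloaded GGUF file lands in ./models/madlad400-3b-mt-q8_0.gguf and is loaded from there in respond()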
# Set the title and description
title = "madlad400-3b-mt llama.cpp"
description = """
This Space uses the [fairydreaming/T5-branch](https://github.com/fairydreaming/llama-cpp-python/tree/t5) of llama-cpp-python, since it is unclear whether the current mainline llama-cpp-python supports T5-style models.
[Model-Q8_0-GGUF](https://huggingface.co/mtsdurica/madlad400-3b-mt-Q8_0-GGUF), [Reference1](https://huggingface.co/spaces/sitammeur/Gemma-llamacpp), [Reference2](https://qiita.com/mbotsu/items/7dd80bc637ff6c12ef6a)
"""
llama = None
def respond(
    message: str,
    history: List[Tuple[str, str]],
    model: str,
    system_message: str,
    max_tokens: int,
    temperature: float,
    top_p: float,
    top_k: int,
    repeat_penalty: float,
):
"""
Respond to a message using the Gemma3 model via Llama.cpp.
Args:
- message (str): The message to respond to.
- history (List[Tuple[str, str]]): The chat history.
- model (str): The model to use.
- system_message (str): The system message to use.
- max_tokens (int): The maximum number of tokens to generate.
- temperature (float): The temperature of the model.
- top_p (float): The top-p of the model.
- top_k (int): The top-k of the model.
- repeat_penalty (float): The repetition penalty of the model.
Returns:
str: The response to the message.
"""
    if model is None:
        return
    try:
        global llama
        if llama is None:
            # Lazily load the model on the first request (CPU-only, small context)
            llama = Llama(
                "models/madlad400-3b-mt-q8_0.gguf",
                flash_attn=False,
                n_gpu_layers=0,
                n_batch=32,
                n_ctx=512,
                n_threads=2,
                n_threads_batch=2,
                verbose=False,
            )
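        # MADLAD-400 is a T5-style encoder-decoder model: the source text is
        # prefixed with a target-language tag (<2ja> = Japanese), run through
        # the encoder, and the decoder then generates starting from the
        # decoder start token.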
        tokens = llama.tokenize(f"<2ja>{message}".encode("utf-8"))
        llama.encode(tokens)
        tokens = [llama.decoder_start_token()]
        outputs = ""
        for token in llama.generate(
            tokens,
            top_k=top_k,
            top_p=top_p,
            temp=temperature,
            repeat_penalty=repeat_penalty,
        ):
            # Stop before emitting the end-of-sequence token
            if token == llama.token_eos():
                break
            outputs += llama.detokenize([token]).decode()
            yield outputs
        return outputs
    except Exception as e:
        # Wrap and re-raise through the project's custom exception handler
        raise CustomExceptionHandling(e, sys) from e
# Create a chat interface
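# additional_inputs are passed to respond() positionally, in the same order as its parameters after (message, history)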
demo = gr.ChatInterface(
    respond,
    examples=[
        ["What is the capital of France?"],
        ["Tell me something about artificial intelligence."],
        ["What is gravity?"],
    ],
    additional_inputs_accordion=gr.Accordion(
        label="⚙️ Parameters", open=False, render=False
    ),
    additional_inputs=[
        gr.Dropdown(
            choices=[
                "madlad400-3b-mt-q8_0.gguf",
            ],
            value="madlad400-3b-mt-q8_0.gguf",
            label="Model",
            info="Select the AI model to use for chat",
        ),
        gr.Textbox(
            value="You are a helpful assistant.",
            label="System Prompt",
            info="Define the AI assistant's personality and behavior",
            lines=2,
            visible=False,
        ),
        gr.Slider(
            minimum=512,
            maximum=2048,
            value=1024,
            step=1,
            label="Max Tokens",
            info="Maximum length of response (higher = longer replies)",
        ),
        gr.Slider(
            minimum=0.1,
            maximum=2.0,
            value=0.7,
            step=0.1,
            label="Temperature",
            info="Creativity level (higher = more creative, lower = more focused)",
        ),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p",
            info="Nucleus sampling threshold",
        ),
        gr.Slider(
            minimum=1,
            maximum=100,
            value=40,
            step=1,
            label="Top-k",
            info="Limit vocabulary choices to top K tokens",
        ),
        gr.Slider(
            minimum=1.0,
            maximum=2.0,
            value=1.1,
            step=0.1,
            label="Repetition Penalty",
            info="Penalize repeated words (higher = less repetition)",
        ),
    ],
    theme="Ocean",
    submit_btn="Send",
    stop_btn="Stop",
    title=title,
    description=description,
    chatbot=gr.Chatbot(scale=1, show_copy_button=True),
    flagging_mode="never",
)
# Launch the chat interface
if __name__ == "__main__":
    demo.launch(debug=False)