# Importing required libraries
import warnings

warnings.filterwarnings("ignore")

import os
import json
import subprocess
import sys
from llama_cpp import Llama, llama_model_decoder_start_token
import gradio as gr
from huggingface_hub import hf_hub_download
from typing import List, Tuple
from logger import logging
from exception import CustomExceptionHandling
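
# Note (assumption): the encoder/decoder calls used in respond() below
# (llama.encode(), llama.decoder_start_token()) rely on fairydreaming's T5
# branch of llama-cpp-python referenced in the description; a mainline
# release may not expose this encoder-decoder API.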
# Download gguf model files
huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
os.makedirs("models", exist_ok=True)
hf_hub_download(
    repo_id="mtsdurica/madlad400-3b-mt-Q8_0-GGUF",
    filename="madlad400-3b-mt-q8_0.gguf",
    local_dir="./models",
)
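
# Hedged note: the repo above appears to be public, so no token is passed.
# If it were gated or private, the HUGGINGFACE_TOKEN read above could be
# forwarded, e.g.:
#
#     hf_hub_download(
#         repo_id="mtsdurica/madlad400-3b-mt-Q8_0-GGUF",
#         filename="madlad400-3b-mt-q8_0.gguf",
#         local_dir="./models",
#         token=huggingface_token,
#     )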
# Set the title and description
title = "madlad400-3b-mt llama.cpp"
description = """
I'm using [fairydreaming/T5-branch](https://github.com/fairydreaming/llama-cpp-python/tree/t5); I'm not sure whether the current llama-cpp-python supports T5 models.

[Model-Q8_0-GGUF](https://huggingface.co/mtsdurica/madlad400-3b-mt-Q8_0-GGUF), [Reference1](https://huggingface.co/spaces/sitammeur/Gemma-llamacpp), [Reference2](https://qiita.com/mbotsu/items/7dd80bc637ff6c12ef6a)
"""
# Module-level cache so the model is loaded only once
llama = None
def respond(
    message: str,
    history: List[Tuple[str, str]],
    model: str,
    system_message: str,
    max_tokens: int,
    temperature: float,
    top_p: float,
    top_k: int,
    repeat_penalty: float,
):
    """
    Respond to a message using the madlad400-3b-mt translation model via llama.cpp.

    Args:
        - message (str): The message to respond to.
        - history (List[Tuple[str, str]]): The chat history.
        - model (str): The model to use.
        - system_message (str): The system message to use.
        - max_tokens (int): The maximum number of tokens to generate.
        - temperature (float): The temperature of the model.
        - top_p (float): The top-p of the model.
        - top_k (int): The top-k of the model.
        - repeat_penalty (float): The repetition penalty of the model.

    Yields:
        str: The partial response generated so far (streamed to the UI).
    """
    if model is None:
        return
    try:
        global llama
        # Lazily load the model on first call and reuse it afterwards
        if llama is None:
            llama = Llama(
                "models/madlad400-3b-mt-q8_0.gguf",
                flash_attn=False,
                n_gpu_layers=0,
                n_batch=32,
                n_ctx=512,
                n_threads=2,
                n_threads_batch=2,
                verbose=False,
            )

        # madlad400 is an encoder-decoder (T5-style) model: encode the source
        # text prefixed with the <2ja> target-language tag, then decode
        # starting from the decoder start token.
        # Note: system_message and max_tokens come from the UI but are not
        # used by this raw generate() loop.
        tokens = llama.tokenize(f"<2ja>{message}".encode("utf-8"))
        llama.encode(tokens)
        tokens = [llama.decoder_start_token()]

        outputs = ""
        for token in llama.generate(
            tokens,
            top_k=top_k,
            top_p=top_p,
            temp=temperature,
            repeat_penalty=repeat_penalty,
        ):
            if token == llama.token_eos():
                break
            outputs += llama.detokenize([token]).decode()
            yield outputs
    except Exception as e:
        # Custom exception handling
        raise CustomExceptionHandling(e, sys) from e
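
# Minimal usage sketch (assumption: run manually, e.g. from a REPL, outside
# Gradio) showing how the streaming generator above can be consumed directly.
# The argument values simply mirror the ChatInterface defaults defined below.
def _smoke_test(text: str = "Hello, how are you?") -> str:
    """Drain respond() and return the final translated string."""
    final = ""
    for partial in respond(
        text,
        history=[],
        model="madlad400-3b-mt-q8_0.gguf",
        system_message="You are a helpful assistant.",
        max_tokens=1024,
        temperature=0.7,
        top_p=0.95,
        top_k=40,
        repeat_penalty=1.1,
    ):
        final = partial
    return final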
# Create a chat interface
demo = gr.ChatInterface(
    respond,
    examples=[
        ["What is the capital of France?"],
        ["Tell me something about artificial intelligence."],
        ["What is gravity?"],
    ],
    additional_inputs_accordion=gr.Accordion(
        label="⚙️ Parameters", open=False, render=False
    ),
    additional_inputs=[
        gr.Dropdown(
            choices=[
                "madlad400-3b-mt-q8_0.gguf",
            ],
            value="madlad400-3b-mt-q8_0.gguf",
            label="Model",
            info="Select the AI model to use for chat",
        ),
        gr.Textbox(
            value="You are a helpful assistant.",
            label="System Prompt",
            info="Define the AI assistant's personality and behavior",
            lines=2,
            visible=False,
        ),
        gr.Slider(
            minimum=512,
            maximum=2048,
            value=1024,
            step=1,
            label="Max Tokens",
            info="Maximum length of response (higher = longer replies)",
        ),
        gr.Slider(
            minimum=0.1,
            maximum=2.0,
            value=0.7,
            step=0.1,
            label="Temperature",
            info="Creativity level (higher = more creative, lower = more focused)",
        ),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p",
            info="Nucleus sampling threshold",
        ),
        gr.Slider(
            minimum=1,
            maximum=100,
            value=40,
            step=1,
            label="Top-k",
            info="Limit vocabulary choices to top K tokens",
        ),
        gr.Slider(
            minimum=1.0,
            maximum=2.0,
            value=1.1,
            step=0.1,
            label="Repetition Penalty",
            info="Penalize repeated words (higher = less repetition)",
        ),
    ],
    theme="Ocean",
    submit_btn="Send",
    stop_btn="Stop",
    title=title,
    description=description,
    chatbot=gr.Chatbot(scale=1, show_copy_button=True),
    flagging_mode="never",
)
# Launch the chat interface
if __name__ == "__main__":
    demo.launch(debug=False)