# llamacpp-madlad400-3b-mt-2jp
#
# Sleeping
#
# File size: 5,231 Bytes

# Importing required libraries
import warnings
warnings.filterwarnings("ignore")

import os
import json
import subprocess
import sys
from llama_cpp import Llama,llama_model_decoder_start_token
import gradio as gr
from huggingface_hub import hf_hub_download
from typing import List, Tuple
from logger import logging
from exception import CustomExceptionHandling


# Fetch the GGUF weights into ./models at import time.
# NOTE(review): the token is read from the environment here but is not
# forwarded to the download call below.
huggingface_token = os.environ.get("HUGGINGFACE_TOKEN")
os.makedirs("models", exist_ok=True)

hf_hub_download(
    local_dir="./models",
    filename="madlad400-3b-mt-q8_0.gguf",
    repo_id="mtsdurica/madlad400-3b-mt-Q8_0-GGUF",
)

# Title and Markdown description; both are handed to gr.ChatInterface
# further down via its `title=` and `description=` arguments.
title = "madlad400-3b-mt llama.cpp"
description = """
I'm using [fairydreaming/T5-branch](https://github.com/fairydreaming/llama-cpp-python/tree/t5), I'm not sure current llama-cpp-python support t5

[Model-Q8_0-GGUF](https://huggingface.co/mtsdurica/madlad400-3b-mt-Q8_0-GGUF), [Reference1](https://huggingface.co/spaces/sitammeur/Gemma-llamacpp), [Reference2](https://qiita.com/mbotsu/items/7dd80bc637ff6c12ef6a)
"""


# Module-level model handle. It starts as None and is assigned inside
# respond() the first time a request arrives, then reused afterwards.
llama = None


import ctypes
import os
import multiprocessing

import llama_cpp

def respond(
    message: str,
    history: List[Tuple[str, str]],
    model: str,
    system_message: str,
    max_tokens: int,
    temperature: float,
    top_p: float,
    top_k: int,
    repeat_penalty: float,
):
    """
    Translate a message with the madlad400-3b-mt model via llama.cpp.

    The message is prefixed with the ``<2ja>`` target-language tag, passed
    through the encoder, and then decoded token by token. This function is a
    generator: every yielded value is the complete text produced so far, so
    the caller can stream the growing answer.

    Args:
        - message (str): The text to translate.
        - history (List[Tuple[str, str]]): The chat history (not used).
        - model (str): The selected model name; when None nothing is generated.
        - system_message (str): The system message (not used).
        - max_tokens (int): The maximum number of tokens to generate.
          A falsy value (0 / None) disables the limit.
        - temperature (float): The sampling temperature.
        - top_p (float): The top-p (nucleus) sampling threshold.
        - top_k (int): The top-k sampling cutoff.
        - repeat_penalty (float): The repetition penalty.

    Yields:
        str: The accumulated output text after each generated token. At least
        one value is yielded whenever generation runs, even if it is empty.

    Raises:
        CustomExceptionHandling: Wraps any exception raised while loading the
        model or generating text.
    """
    global llama

    if model is None:
        return

    try:
        # Build the model lazily on the first request and reuse it afterwards.
        if llama is None:
            llama = Llama(
                "models/madlad400-3b-mt-q8_0.gguf",
                flash_attn=False,
                n_gpu_layers=0,
                n_batch=32,
                n_ctx=512,
                n_threads=2,
                n_threads_batch=2,
                verbose=False,
            )

        # Encoder pass over the tagged source text.
        prompt_tokens = llama.tokenize(f"<2ja>{message}".encode("utf-8"))
        llama.encode(prompt_tokens)

        eos_token = llama.token_eos()
        limit = int(max_tokens) if max_tokens else 0

        # Raw bytes are accumulated and decoded as a whole: a single token may
        # carry only part of a multi-byte UTF-8 character, so decoding tokens
        # one at a time can fail. errors="ignore" hides an incomplete trailing
        # sequence until the bytes that complete it arrive.
        raw = bytearray()
        outputs = ""
        emitted = False

        stream = llama.generate(
            [llama.decoder_start_token()],
            top_k=top_k,
            top_p=top_p,
            temp=temperature,
            repeat_penalty=repeat_penalty,
        )
        for count, token in enumerate(stream, start=1):
            # Stop before appending so the end-of-sequence marker never
            # leaks into the visible text.
            if token == eos_token:
                break
            raw += llama.detokenize([token])
            outputs = raw.decode("utf-8", errors="ignore")
            yield outputs
            emitted = True
            # Enforce the user-selected generation budget.
            if limit and count >= limit:
                break

        # Keep the "at least one update" behaviour when generation ends
        # immediately (e.g. the very first token is end-of-sequence).
        if not emitted:
            yield outputs
        return outputs
    except Exception as e:
        # Custom exception handling
        raise CustomExceptionHandling(e, sys) from e

            

    
    


# Build the chat UI. The widgets are created first (in the same order they
# are handed to ChatInterface) and then wired into the interface.
_params_accordion = gr.Accordion(label="⚙️ Parameters", open=False, render=False)

# The additional inputs below are listed in the order of respond()'s
# parameters after `history`: model, system_message, max_tokens,
# temperature, top_p, top_k, repeat_penalty.
_model_dropdown = gr.Dropdown(
    choices=["madlad400-3b-mt-q8_0.gguf"],
    value="madlad400-3b-mt-q8_0.gguf",
    label="Model",
    info="Select the AI model to use for chat",
)
_system_prompt_box = gr.Textbox(
    value="You are a helpful assistant.",
    label="System Prompt",
    info="Define the AI assistant's personality and behavior",
    lines=2,
    visible=False,
)
_max_tokens_slider = gr.Slider(
    minimum=512,
    maximum=2048,
    value=1024,
    step=1,
    label="Max Tokens",
    info="Maximum length of response (higher = longer replies)",
)
_temperature_slider = gr.Slider(
    minimum=0.1,
    maximum=2.0,
    value=0.7,
    step=0.1,
    label="Temperature",
    info="Creativity level (higher = more creative, lower = more focused)",
)
_top_p_slider = gr.Slider(
    minimum=0.1,
    maximum=1.0,
    value=0.95,
    step=0.05,
    label="Top-p",
    info="Nucleus sampling threshold",
)
_top_k_slider = gr.Slider(
    minimum=1,
    maximum=100,
    value=40,
    step=1,
    label="Top-k",
    info="Limit vocabulary choices to top K tokens",
)
_repeat_penalty_slider = gr.Slider(
    minimum=1.0,
    maximum=2.0,
    value=1.1,
    step=0.1,
    label="Repetition Penalty",
    info="Penalize repeated words (higher = less repetition)",
)
_chatbot = gr.Chatbot(scale=1, show_copy_button=True)

demo = gr.ChatInterface(
    respond,
    examples=[
        ["What is the capital of France?"],
        ["Tell me something about artificial intelligence."],
        ["What is gravity?"],
    ],
    additional_inputs_accordion=_params_accordion,
    additional_inputs=[
        _model_dropdown,
        _system_prompt_box,
        _max_tokens_slider,
        _temperature_slider,
        _top_p_slider,
        _top_k_slider,
        _repeat_penalty_slider,
    ],
    theme="Ocean",
    submit_btn="Send",
    stop_btn="Stop",
    title=title,
    description=description,
    chatbot=_chatbot,
    flagging_mode="never",
)


# Launch the chat interface when this file is run as a script.
# No `test` callable is defined or imported in this module, so nothing is
# invoked after the server call returns.
if __name__ == "__main__":
    demo.launch(debug=False)