Napoleon

Runtime error

File size: 5,337 Bytes

5416372
a2e6c05
 
2cd2fd0
 
 
 
 
 
d81ed7c
a2e6c05
0887657
d81ed7c
87631e8
 
df67f78
928f478
da37e26
17a0b66
a2e6c05
c91002b
d81ed7c
 
 
edb02ce
d81ed7c
 
 
e976361
 
87631e8
 
 
 
 
 
 
 
 
 
 
 
 
a2e6c05
d81ed7c
a2e6c05
 
e16ac22
 
a2e6c05
 
 
 
 
e976361
 
 
a2e6c05
e976361
a2e6c05
d81ed7c
a2e6c05
d81ed7c
a2e6c05
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d81ed7c
 
 
 
73bd8c6
df67f78
06b791f
eb49280
 
e976361
43f7448
d81ed7c
 
 
 
 
e976361
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d81ed7c
 
b23a519
 
 
4ca13f0
18e5a55
72fd759
18e5a55
aea3d83
3b39700
014d21e
 
a2e6c05
 
f15ce9b
a2e6c05
 
 
 
1414338
bbb4a8b
 
 
 
437f34e
d550b18
 
 
 
d81ed7c
 
 
a2e6c05

import spaces
import json
import subprocess
from llama_cpp import Llama
from llama_cpp_agent import LlamaCppAgent
from llama_cpp_agent import MessagesFormatterType
from llama_cpp_agent.providers import LlamaCppPythonProvider
from llama_cpp_agent.chat_history import BasicChatHistory
from llama_cpp_agent.chat_history.messages import Roles
import gradio as gr
from huggingface_hub import hf_hub_download
from ui import css, PLACEHOLDER

# Inference state shared with `respond` via `global`: the loaded model object
# and the file name it was loaded from. Both stay None until the first request.
llm = llm_model = None

# Fetch every selectable GGUF file into ./models when the module is imported.
# Each entry is (repo_id, filename); the order matches the original call order.
for _repo_id, _filename in (
    ("baconnier/Napoleon_24B_V0.2-Q8_0-GGUF", "napoleon_24b_v0.2-q8_0.gguf"),
    ("baconnier/Napoleon_24B_V0.1-Q8_0-GGUF", "napoleon_24b_v0.1-q8_0.gguf"),
    ("baconnier/Napoleon_24B_V0.0-GGUF", "Napoleon_24B_V0.0.Q8_0.gguf"),
):
    hf_hub_download(repo_id=_repo_id, filename=_filename, local_dir="./models")


@spaces.GPU(duration=60)
def respond(
    message,
    history: list[tuple[str, str]],
    model,
    max_tokens,
    temperature,
    top_p,
    top_k,
    repeat_penalty,
):
    """Stream a chat reply from the selected GGUF model.

    The ``Llama`` instance is cached in the module-level ``llm`` global and
    tagged with the file name in ``llm_model``; it is rebuilt only when no
    model is loaded yet or when ``model`` differs from the cached tag.

    Args:
        message: Latest user message to answer.
        history: Earlier turns; index 0 of each entry is used as the user
            text and index 1 as the assistant text.
        model: File name of the GGUF model, loaded from ``models/{model}``.
        max_tokens: Copied to ``settings.max_tokens``.
        temperature: Copied to ``settings.temperature``.
        top_p: Copied to ``settings.top_p``.
        top_k: Copied to ``settings.top_k``.
        repeat_penalty: Copied to ``settings.repeat_penalty``.

    Yields:
        The reply text accumulated so far; each yielded string extends the
        previous one by the newest streamed chunk.
    """
    global llm
    global llm_model

    if llm is None or llm_model != model:
        # Drop our reference to the previously loaded model *before* building
        # the next one, so the old weights can be reclaimed instead of both
        # models being held at the same time during a switch. Resetting the
        # tag as well keeps the cache consistent if the load below raises.
        llm = None
        llm_model = None
        llm = Llama(
            model_path=f"models/{model}",
            flash_attn=True,
            n_gpu_layers=81,
            n_batch=1024,
            n_ctx=8192,
        )
        llm_model = model
    provider = LlamaCppPythonProvider(llm)

    agent = LlamaCppAgent(
        provider,
        system_prompt="Tu es Napoleon et ne reponds qu'en francais.",
        predefined_messages_formatter_type=MessagesFormatterType.CHATML,
        debug_output=True
    )

    # Start from the provider defaults and override with the UI values.
    settings = provider.get_provider_default_settings()
    settings.temperature = temperature
    settings.top_k = top_k
    settings.top_p = top_p
    settings.max_tokens = max_tokens
    settings.repeat_penalty = repeat_penalty
    settings.stream = True

    # Replay the earlier turns into a fresh chat history, user message first.
    messages = BasicChatHistory()
    for turn in history:
        messages.add_message({
            'role': Roles.user,
            'content': turn[0]
        })
        messages.add_message({
            'role': Roles.assistant,
            'content': turn[1]
        })

    stream = agent.get_chat_response(
        message,
        llm_sampling_settings=settings,
        chat_history=messages,
        returns_streaming_generator=True,
        print_output=False,
    )

    # Yield the running total (not the delta) after every streamed chunk.
    outputs = ""
    for output in stream:
        outputs += output
        yield outputs

# Extra controls for the chat UI. They are listed in the same order as the
# extra parameters of `respond`: model, max_tokens, temperature, top_p, top_k,
# repeat_penalty.
_model_dropdown = gr.Dropdown(
    [
        'napoleon_24b_v0.2-q8_0.gguf',
        'napoleon_24b_v0.1-q8_0.gguf',
        'Napoleon_24B_V0.0.Q8_0.gguf',
    ],
    value="Napoleon_24B_V0.0.Q8_0.gguf",
    label="Model",
)
_max_tokens_slider = gr.Slider(
    label="Max tokens", minimum=1, maximum=8192, value=8192, step=1
)
_temperature_slider = gr.Slider(
    label="Temperature", minimum=0.05, maximum=4.0, value=0.6, step=0.1
)
_top_p_slider = gr.Slider(
    label="Top-p", minimum=0.1, maximum=1.0, value=0.95, step=0.05
)
_top_k_slider = gr.Slider(
    label="Top-k", minimum=0, maximum=100, value=40, step=1
)
_repeat_penalty_slider = gr.Slider(
    label="Repetition penalty", minimum=0.0, maximum=2.0, value=1.1, step=0.1
)

# Soft base theme with the Exo font and custom dark-mode colours.
_base_theme = gr.themes.Soft(
    primary_hue="indigo",
    secondary_hue="blue",
    neutral_hue="gray",
    font=[gr.themes.GoogleFont("Exo"), "ui-sans-serif", "system-ui", "sans-serif"],
)
_theme = _base_theme.set(
    body_background_fill_dark="#0f172a",
    block_background_fill_dark="#0f172a",
    block_border_width="2px",
    block_title_background_fill_dark="#070d1b",
    input_background_fill_dark="#0c1425",
    button_secondary_background_fill_dark="#070d1b",
    border_color_accent_dark="#21293b",
    border_color_primary_dark="#21293b",
    background_fill_secondary_dark="#0f172a",
    color_accent_soft_dark="transparent",
)

# Chat window widget handed to the interface below.
_chatbot = gr.Chatbot(scale=1, placeholder=PLACEHOLDER, show_copy_button=True)

# Sample prompts offered in the UI (one single-message example per row).
_examples = [
    ['Pourquoi les serveurs parisiens sont-ils si "charmants" avec les touristes ?'],
    ['Est-il vrai que les Français font la grève plus souvent qu ils ne travaillent ?'],
]

demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        _model_dropdown,
        _max_tokens_slider,
        _temperature_slider,
        _top_p_slider,
        _top_k_slider,
        _repeat_penalty_slider,
    ],
    theme=_theme,
    css=css,
    title="🇫🇷 Napoléon 🇫🇷",
    retry_btn="Retry",
    undo_btn="Undo",
    clear_btn="Clear",
    submit_btn="Send",
    description="This is Napoleon model, a French 24B LLM fine tune from Mistral AI, merged with Dolphin AI.",
    chatbot=_chatbot,
    examples=_examples,
)

# Start the web app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()