Smart_LLM

Running on Zero

File size: 6,768 Bytes

830eeaa
 
 
 
 
 
 
3738ef6
 
 
51a7d9e
d8a8bf1
51a7d9e
edb9e8a
51a7d9e
 
c00b625
51a7d9e
c44cbfe
3738ef6
 
 
b443e28
3738ef6
 
51a7d9e
 
 
 
3738ef6
 
 
 
 
 
 
51a7d9e
4b74382
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51a7d9e
3738ef6
 
d8a8bf1
 
 
 
 
 
3738ef6
3bc2ef0
3738ef6
03e8281
3738ef6
bccdc56
d8a8bf1
3738ef6
659ca36
 
 
 
85dc104
3738ef6
 
 
 
c44cbfe
 
3738ef6
 
 
 
 
 
 
 
 
 
51a7d9e
3738ef6
 
 
 
51a7d9e
3738ef6
99a7a45
3738ef6
 
030c23d
3738ef6
edb9e8a
3738ef6
 
1c74333
3738ef6
 
659ca36
 
3738ef6
 
030c23d
51a7d9e
3738ef6
 
 
 
 
 
 
 
9a43acc
9eefdf9
3738ef6
 
51a7d9e
3738ef6
51a7d9e
 
 
 
 
 
 
 
3738ef6
c44cbfe
bc05e4d
c44cbfe
 
 
bc05e4d
 
c44cbfe
bc05e4d
c44cbfe
bc05e4d
c44cbfe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3738ef6
 
 
 
51a7d9e
 
 
 
5f09b8a
51a7d9e
 
 
 
3738ef6
c44cbfe
51a7d9e
b443e28
3738ef6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c44cbfe
3738ef6
51a7d9e
 
 
4f82bbf
 
b443e28
c44cbfe
b443e28
51a7d9e
 
 
3738ef6
51a7d9e
 
3738ef6

import os
import subprocess
import sys

# Install flash-attn at start-up, before the model below requests
# attn_implementation="flash_attention_2".
#
# * `env` REPLACES the child's environment rather than extending it, so the
#   extra variable is merged into a copy of os.environ; otherwise PATH, HOME
#   and the rest of the environment would be dropped for the pip process.
# * `sys.executable -m pip` targets the interpreter that is running this
#   script instead of whichever `pip` a shell lookup happens to find.
# * An argument list is used, so no shell is involved.
#
# The call stays best-effort (no check=True): a failed install does not stop
# the script at this point.
subprocess.run(
    [sys.executable, "-m", "pip", "install", "flash-attn", "--no-build-isolation"],
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
)

import os
import time
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextIteratorStreamer
import gradio as gr
from threading import Thread

# Access token read from the environment; None when HF_TOKEN is not set.
HF_TOKEN = os.getenv("HF_TOKEN")

# Repository id of the checkpoint used for both the tokenizer and the model.
MODEL = "Daemontatox/AetherDrake"

# HTML heading rendered at the top of the page.
TITLE = "<h1><center>Sphinx Reasoner</center></h1>"

# HTML given to the chatbot component as its placeholder.
PLACEHOLDER = "\n".join(
    ["", "<center>", "<p>Ask me Anything !!</p>", "</center>", ""]
)


# Custom stylesheet passed to gr.Blocks(css=CSS). It styles the
# "duplicate-button" class used by the DuplicateButton, centers h3 headings,
# and formats paragraphs, <pre> blocks and inline <code> under ".message-wrap".
CSS = """
.duplicate-button {
    margin: auto !important;
    color: white !important;
    background: black !important;
    border-radius: 100vh !important;
}
h3 {
    text-align: center;
}
.message-wrap {
    overflow-x: auto;
}
.message-wrap p {
    margin-bottom: 1em;
}
.message-wrap pre {
    background-color: #f6f8fa;
    border-radius: 3px;
    padding: 16px;
    overflow-x: auto;
}
.message-wrap code {
    background-color: rgba(175,184,193,0.2);
    border-radius: 3px;
    padding: 0.2em 0.4em;
    font-family: monospace;
}
"""
# Intended device selector: "cuda" for GPU usage or "cpu" for CPU usage.
# NOTE(review): this name is not referenced anywhere else in the visible code;
# the model is placed with device_map="auto" and inputs follow model.device.
device = "cuda" # for GPU usage or "cpu" for CPU usage

# bitsandbytes settings: 4-bit loading, NF4 quant type, double quantization,
# compute dtype bfloat16.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

tokenizer = AutoTokenizer.from_pretrained(MODEL)

# generate() is always given a pad_token_id, so fall back to the EOS id when
# the tokenizer does not define a padding token of its own.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

# NOTE(review): torch_dtype is float16 while the quantization compute dtype
# above is bfloat16 - confirm that this mix is intentional.
model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    quantization_config=quantization_config,
    attn_implementation="flash_attention_2",
    device_map="auto",
    torch_dtype=torch.float16,
)
@spaces.GPU()
def stream_chat(
    message: str,
    history: list,
    system_prompt: str,
    temperature: float = 1.0,
    max_new_tokens: int = 8192,
    top_p: float = 1.0,
    top_k: int = 20,
    penalty: float = 1.2,
):
    """Generate a reply to *message* and stream it back incrementally.

    The prompt is built from the system prompt, every previous turn in
    *history* and the new user message, rendered through the tokenizer's chat
    template. Generation runs in a background thread while this generator
    drains the streamer.

    Args:
        message: The new user message.
        history: Previous turns; each item is unpacked as a
            ``(user_text, assistant_text)`` pair.
        system_prompt: Content of the leading ``system`` message.
        temperature: Sampling temperature. ``0`` selects greedy decoding
            (``do_sample=False``).
        max_new_tokens: Upper bound on the number of generated tokens.
        top_p: Nucleus-sampling threshold (forwarded only when sampling).
        top_k: Top-k sampling cutoff (forwarded only when sampling).
        penalty: Value forwarded as ``repetition_penalty``.

    Yields:
        The reply accumulated so far, growing with each streamed chunk.

    Raises:
        Exception: Whatever ``model.generate`` raised in the worker thread is
            re-raised here once the stream has been closed.
    """
    print(f'message: {message}')
    print(f'history: {history}')

    conversation = [{"role": "system", "content": system_prompt}]
    for prompt, answer in history:
        conversation.append({"role": "user", "content": prompt})
        conversation.append({"role": "assistant", "content": answer})
    conversation.append({"role": "user", "content": message})

    input_ids = tokenizer.apply_chat_template(
        conversation, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)

    streamer = TextIteratorStreamer(
        tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True
    )

    do_sample = temperature != 0
    generate_kwargs = dict(
        input_ids=input_ids,
        # UI sliders may deliver floats; these arguments are integer-valued.
        max_new_tokens=int(max_new_tokens),
        do_sample=do_sample,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        repetition_penalty=penalty,
        streamer=streamer,
    )
    if do_sample:
        # Sampling-only knobs are forwarded only when sampling is enabled, so
        # greedy decoding is never handed temperature=0 (or top_p/top_k).
        generate_kwargs.update(
            temperature=temperature,
            top_p=top_p,
            top_k=int(top_k),
        )

    worker_errors = []

    def _generate():
        """Run generation in the worker thread with gradients disabled."""
        try:
            # Grad mode is thread-local: torch.no_grad() must be entered in
            # the thread that actually executes generate().
            with torch.no_grad():
                model.generate(**generate_kwargs)
        except Exception as exc:
            # Hand the failure to the consumer and close the stream so it
            # does not sit waiting for the streamer timeout.
            worker_errors.append(exc)
            streamer.end()

    worker = Thread(target=_generate)
    worker.start()

    buffer = ""
    for new_text in streamer:
        buffer += new_text
        yield buffer

    # The stream is closed at this point, so the worker is finishing up.
    worker.join()
    if worker_errors:
        raise worker_errors[0]

            
# Chat display component; it is created up front and handed to ChatInterface below.
chatbot = gr.Chatbot(height=600, placeholder=PLACEHOLDER)

# Page layout: title, "duplicate space" button, then the chat interface.
# The entries of `additional_inputs` are listed in the same order as the
# parameters of stream_chat that follow (message, history):
# system_prompt, temperature, max_new_tokens, top_p, top_k, penalty.
with gr.Blocks(css=CSS, theme="soft") as demo:
    gr.HTML(TITLE)
    gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button")
    gr.ChatInterface(
        fn=stream_chat,
        chatbot=chatbot,
        fill_height=True,
        additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
        additional_inputs=[
            # -> system_prompt
            gr.Textbox(
                value="""You are an AI expert at providing high-quality answers. Your process involves these steps:

1. Initial Thought: Use the <Thinking> tag to reason step-by-step and generate your best possible response to the following request: [User's Request Here].
Example:
<Thinking> Step 1: Understand the request. Step 2: Analyze potential solutions. Step 3: Choose the optimal response. </Thinking>


2. Self-Critique: Critically evaluate your initial response within <Critique> tags, focusing on:

Accuracy: Is it factually correct and verifiable?

Clarity: Is it easy to understand and free of ambiguity?

Completeness: Does it fully address the user's request?

Improvement: What specific aspects could be better?
Example:
<Critique> Accuracy: Verified. Clarity: Needs simplification. Completeness: Add examples. </Critique>



3. Revision: Based on your critique, use <Revising> tags to refine and improve your response.
Example:
<Revising> Adjusting for clarity and adding an example to improve understanding. </Revising>


4. Final Response: Present your revised answer clearly within <Final> tags.
Example:
<Final> This is the improved response. </Final>


5. Tag Innovation: If necessary, create and define new tags to better structure your reasoning or enhance clarity. Use them consistently.
Example:
<Definition> This tag defines a new term introduced in the response. </Definition>



Ensure every part of your thought process and output is properly enclosed in appropriate tags for clarity and organization.

""",
                label="System Prompt",
                lines=5,
                render=False,
            ),
            # -> temperature (0 selects greedy decoding in stream_chat)
            gr.Slider(
                minimum=0,
                maximum=1,
                step=0.1,
                value=0.5,
                label="Temperature",
                render=False,
            ),
            # -> max_new_tokens
            gr.Slider(
                minimum=128,
                maximum=32000,
                step=1,
                value= 8192,
                label="Max new tokens",
                render=False,
            ),
            # -> top_p
            gr.Slider(
                minimum=0.0,
                maximum=1.0,
                step=0.1,
                value=1.0,
                label="top_p",
                render=False,
            ),
            # -> top_k
            gr.Slider(
                minimum=1,
                maximum=20,
                step=1,
                value=20,
                label="top_k",
                render=False,
            ),
            # -> penalty (forwarded as repetition_penalty)
            gr.Slider(
                minimum=0.0,
                maximum=2.0,
                step=0.1,
                value=1.2,
                label="Repetition penalty",
                render=False,
            ),
        ],
        # Example prompts offered in the UI ("Relativity" spelling corrected).
        examples=[
            ["What is meant by a Singularity? "],
            ["Explain the theory of Relativity"],
            ["Explain your thought process"],
            ["Explain how mamba2 structure LLMs work and how do they differ from transformers? "],
        ],
        cache_examples=False,
    )


# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()