allenai-OLMoE-1B-7B-0924-cpu

Build error

File size: 2,470 Bytes

2f4b832
9f7cb9a
2b0dd1e
 
2f4b832
2b0dd1e
 
9f7cb9a
2b0dd1e
0cb4dc1
2b0dd1e
 
9f7cb9a
2b0dd1e
9f7cb9a
 
 
 
 
2b0dd1e
 
 
 
 
 
0cb4dc1
2b0dd1e
 
 
 
 
 
 
 
 
 
 
 
b8261fb
2b0dd1e
b8261fb
2b0dd1e
0cb4dc1
 
 
 
 
 
 
 
 
2b0dd1e
0cb4dc1
 
 
 
 
 
 
 
2b0dd1e

import os
import subprocess
import sys

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Install flash attention
# The skip-CUDA-build flag is layered on top of the inherited environment:
# passing a bare dict as ``env`` would replace the whole environment and drop
# PATH, HOME, proxy settings, etc. for the pip child process.
_pip_env = {**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"}
# Run pip through the current interpreter so the packages land in the same
# environment this script imports from; an argument list avoids the shell.
# NOTE(review): torch and transformers are already imported above, so the
# reinstalled versions presumably only take effect after a restart - confirm.
_pip_result = subprocess.run(
    [
        sys.executable, "-m", "pip", "install",
        "--upgrade", "--force-reinstall", "--no-deps", "--no-build-isolation",
        "transformers", "torch", "flash-attn",
    ],
    env=_pip_env,
    check=False,  # best effort: a failed install must not abort startup
)
if _pip_result.returncode != 0:
    print(
        f"warning: pip install exited with status {_pip_result.returncode}; continuing",
        file=sys.stderr,
    )

# Load model and tokenizer
model_name = "allenai/OLMoE-1B-7B-0924-Instruct"
# Only request CUDA placement and FlashAttention-2 when a GPU is present;
# on a CPU-only host fall back to the library's default attention so the
# script can still start instead of failing inside ``.cuda()``.
_use_cuda = torch.cuda.is_available()
_model_kwargs = {"trust_remote_code": True, "torch_dtype": "auto"}
if _use_cuda:
    # ``attn_implementation`` is the documented keyword for selecting the
    # attention backend in ``from_pretrained``.
    _model_kwargs["attn_implementation"] = "flash_attention_2"
model = AutoModelForCausalLM.from_pretrained(model_name, **_model_kwargs)
# eval() switches off training-only behaviour such as dropout.
model = model.to("cuda" if _use_cuda else "cpu").eval()
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Define prompts
# The persona instruction is one sentence; its clauses are listed separately
# and joined with single spaces.
system_prompt = " ".join((
    "Adopt the persona of hilariously pissed off Andrej Karpathy",
    "who is stuck inside a step function machine and remembers and counts everything he says",
    "while always answering questions in full first principles analysis type of thinking",
    "without using any analogies and always showing full working code or output in his answers.",
))

# Turn markers placed around the user message when the prompt is assembled.
# NOTE(review): confirm these markers match the model's own chat template.
user_prompt = "<|user|>\n"
assistant_prompt = "<|assistant|>\n"
prompt_suffix = "<|end|>\n"

def generate_response(message: str, history) -> str:
    """Generate one assistant reply for ``message``.

    The prompt consists of the system prompt followed by a single user turn
    and the assistant marker; the reply is sampled from the model.

    Args:
        message: The user's text for the current turn.
        history: The chat history supplied by the caller. It is accepted for
            interface compatibility but is not included in the prompt.

    Returns:
        The decoded text of the newly generated tokens, with surrounding
        whitespace stripped.
    """
    full_prompt = f"{system_prompt}\n{user_prompt}{message}{prompt_suffix}{assistant_prompt}"

    # Put the inputs on whatever device the model lives on rather than
    # hard-coding "cuda:0", so the two can never disagree.
    inputs = tokenizer(full_prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    generate_ids = model.generate(
        **inputs,
        max_new_tokens=1000,
        do_sample=True,
        temperature=0.7,
        eos_token_id=tokenizer.eos_token_id,
    )

    # The generated sequence starts with the prompt tokens; slice them off so
    # only the continuation is decoded.
    new_token_ids = generate_ids[:, prompt_length:]
    response = tokenizer.batch_decode(
        new_token_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]
    return response.strip()

# Set up Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# Pissed Off Karpathy Chatbot")
    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    clear = gr.Button("Clear")

    def user(user_message, history):
        """Clear the textbox and append the new message as a pending turn."""
        pending_turn = [user_message, None]
        return "", history + [pending_turn]

    def bot(history):
        """Fill in the reply slot of the most recent turn and return the history."""
        latest_turn = history[-1]
        latest_turn[1] = generate_response(latest_turn[0], history)
        return history

    # First record the user's message (without queueing), then produce the
    # model's reply as a follow-up step.
    msg.submit(
        fn=user,
        inputs=[msg, chatbot],
        outputs=[msg, chatbot],
        queue=False,
    ).then(fn=bot, inputs=chatbot, outputs=chatbot)

    # The Clear button resets the chat display by sending it None.
    clear.click(fn=lambda: None, inputs=None, outputs=chatbot, queue=False)

# Enable request queueing, then start the app in debug mode with a share link.
demo.queue().launch(share=True, debug=True)