File size: 3,262 Bytes
2f4b832
e659cfe
3d92619
9f7cb9a
2b0dd1e
b3ca2da
aab0c47
9ca55ad
 
2f4b832
3d92619
9f7cb9a
3802faf
 
0ff1cd2
3d92619
9ca55ad
 
 
 
 
 
0ff1cd2
3802faf
 
 
 
e659cfe
9f7cb9a
 
 
 
 
e659cfe
0ff1cd2
3802faf
 
 
3d92619
 
0ff1cd2
3d92619
0cb4dc1
3802faf
 
3d92619
9ca55ad
3802faf
0ff1cd2
9ca55ad
3802faf
3d92619
9ca55ad
0ff1cd2
 
 
3d92619
0ff1cd2
 
 
 
b8261fb
0ff1cd2
9ca55ad
0ff1cd2
9ca55ad
0ff1cd2
 
 
0cb4dc1
 
 
 
 
0ff1cd2
0cb4dc1
0ff1cd2
0cb4dc1
 
 
 
0ff1cd2
0cb4dc1
 
 
3802faf
 
3d92619
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import gradio as gr
import spaces
from transformers import OlmoeForCausalLM, AutoTokenizer
import torch
import subprocess
import sys

# Force upgrade transformers to the latest version
subprocess.check_call([sys.executable, "-m", "pip", "install", "--upgrade", "transformers"])

model_name = "allenai/OLMoE-1B-7B-0924-Instruct"

# Wrap model loading in a try-except block to handle potential errors
try:
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    model = OlmoeForCausalLM.from_pretrained(
        model_name, 
        trust_remote_code=True, 
        torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
        low_cpu_mem_usage=True,
        device_map="auto"
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)
except Exception as e:
    print(f"Error loading model: {e}")
    model = None
    tokenizer = None

system_prompt = ("Adopt the persona of hilariously pissed off Andrej Karpathy "
                 "who is stuck inside a step function machine and remembers and counts everything he says "
                 "while always answering questions in full first principles analysis type of thinking "
                 "without using any analogies and always showing full working code or output in his answers.")

@spaces.GPU
def generate_response(message, history, temperature, max_new_tokens):
    if model is None or tokenizer is None:
        return "Model or tokenizer not loaded properly. Please check the logs."
    
    messages = [{"role": "system", "content": system_prompt},
                {"role": "user", "content": message}]
    
    inputs = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to(DEVICE)
    
    with torch.no_grad():
        generate_ids = model.generate(
            inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=temperature,
            eos_token_id=tokenizer.eos_token_id,
        )
    response = tokenizer.decode(generate_ids[0, inputs.shape[1]:], skip_special_tokens=True)
    return response.strip()

css = """
  #output {
    height: 900px; 
    overflow: auto; 
    border: 1px solid #ccc; 
  }
"""

with gr.Blocks(css=css) as demo:
    gr.Markdown("# Nisten's Karpathy Chatbot with OSS OLMoE")
    chatbot = gr.Chatbot(elem_id="output")
    msg = gr.Textbox(label="Your message")
    with gr.Row():
        temperature = gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.1, label="Temperature")
        max_new_tokens = gr.Slider(minimum=50, maximum=4000, value=1000, step=50, label="Max New Tokens")
    clear = gr.Button("Clear")

    def user(user_message, history):
        return "", history + [[user_message, None]]

    def bot(history, temp, max_tokens):
        user_message = history[-1][0]
        bot_message = generate_response(user_message, history, temp, max_tokens)
        history[-1][1] = bot_message
        return history

    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
        bot, [chatbot, temperature, max_new_tokens], chatbot
    )
    clear.click(lambda: None, None, chatbot, queue=False)

if __name__ == "__main__":
    demo.queue(api_open=False)
    demo.launch(debug=True, show_api=True, share=True)