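"""Gradio chat demo that streams responses from AXCXEPT/phi-4-deepseek-R1K-RL-EZO.

Intended for a Hugging Face Space with ZeroGPU: the model is (re)loaded inside the
@spaces.GPU-decorated handler, and tokens are streamed to the UI through a
TextIteratorStreamer fed by a background generation thread.
"""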
import spaces
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from transformers import TextIteratorStreamer
from threading import Thread

import gradio as gr

text_generator = None
is_hugging_face = True  # True when running on a Hugging Face ZeroGPU Space
model_id = "AXCXEPT/phi-4-deepseek-R1K-RL-EZO"
#model_id = "AXCXEPT/phi-4-open-R1-Distill-EZOv1"

huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
#huggingface_token = None
device = "cuda"  # also passed as device_map; torch.device("cuda" if torch.cuda.is_available() else "cpu") would work off-GPU
dtype = torch.bfloat16
#dtype = torch.float16

if not huggingface_token:
    print("HUGGINGFACE_TOKEN is not set; add it as a Space secret if the model requires authentication")
    #raise ValueError("HUGGINGFACE_TOKEN environment variable is not set")

tokenizer = AutoTokenizer.from_pretrained(model_id, token=huggingface_token)

print(model_id,device,dtype)
histories = []
#model = None



if not is_hugging_face:
    # Local / persistent-GPU mode: load the model and pipeline once at startup.
    model = AutoModelForCausalLM.from_pretrained(
        model_id, token=huggingface_token, torch_dtype=dtype, device_map=device
    )
    text_generator = pipeline(
        "text-generation", model=model, tokenizer=tokenizer, torch_dtype=dtype, device_map=device
    )  # pipeline objects have no .to(device); placement is handled by device_map

    if next(model.parameters()).is_cuda:
        print("The model is on a GPU")
    else:
        print("The model is on a CPU")

    #print(f"text_generator.device='{text_generator.device}'")
    if str(text_generator.device).startswith("cuda"):
        print("The pipeline is using a GPU")
    else:
        print("The pipeline is using a CPU")

print("initialized")


def generate_text(messages):
    if is_hugging_face:  # ZeroGPU: the model must be (re)loaded inside each GPU-decorated call
        model = AutoModelForCausalLM.from_pretrained(
            model_id, token=huggingface_token, torch_dtype=dtype, device_map=device
        )
        model.to(device)
    else:
        model = text_generator.model  # reuse the model loaded at startup
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    # Run generation in a background thread; the streamer yields decoded text as it is produced.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)
    generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=200)
    thread = Thread(target=model.generate, kwargs=generation_kwargs)

    generated_output = ""
    thread.start()
    for new_text in streamer:
        generated_output += new_text
        yield generated_output
#generate_text.zerogpu = True

@spaces.GPU(duration=60)  # request a ZeroGPU slot for up to 60 seconds per call
def call_generate_text(message, history):
    #history.append({"role": "user", "content": message})
    #print(message)
    #print(history)

    messages = history + [{"role": "user", "content": message}]
    try:
        for text in generate_text(messages):
            yield text
    except RuntimeError as e:
        print(f"An unexpected error occurred: {e}")
        yield ""
demo = gr.ChatInterface(call_generate_text,type="messages")

if __name__ == "__main__":
    demo.queue()
    demo.launch()