import os
from threading import Thread

import spaces
import torch
import gradio as gr
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TextIteratorStreamer,
    pipeline,
)

text_generator = None
is_hugging_face = True  # True when running on a Hugging Face Space (ZeroGPU)

#model_id = "AXCXEPT/phi-4-deepseek-R1K-RL-EZO"
model_id = "AXCXEPT/phi-4-open-R1-Distill-EZOv1"

huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
device = "cuda"  # must be a concrete device: input tensors are moved here with .to(device)
#dtype = torch.bfloat16
dtype = torch.float16

if not huggingface_token:
    print("No HUGGINGFACE_TOKEN found; set it as a Space secret if the model requires authentication.")
    #raise ValueError("HUGGINGFACE_TOKEN environment variable is not set")

tokenizer = AutoTokenizer.from_pretrained(model_id, token=huggingface_token)
print(model_id, device, dtype)

if not is_hugging_face:
    # Outside ZeroGPU the model can be loaded once at startup.
    model = AutoModelForCausalLM.from_pretrained(
        model_id, token=huggingface_token, torch_dtype=dtype, device_map=device
    )
    # A pipeline takes device_map at construction and has no .to(device).
    # Streaming is handled by TextIteratorStreamer in generate_text(),
    # not by a pipeline flag, so the invalid stream=True kwarg is dropped.
    text_generator = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        torch_dtype=dtype,
        device_map=device,
    )

    if next(model.parameters()).is_cuda:
        print("The model is on a GPU")
    else:
        print("The model is on a CPU")

    # text_generator.device may be e.g. "cuda:0", so match the prefix.
    if str(text_generator.device).startswith("cuda"):
        print("The pipeline is using a GPU")
    else:
        print("The pipeline is using a CPU")

print("initialized")


def generate_text(messages):
    if is_hugging_face:
        # ZeroGPU detaches the GPU between calls, so the model must be
        # (re)loaded inside every GPU-decorated call. device_map already
        # places the weights, so no extra .to(device) is needed.
        model = AutoModelForCausalLM.from_pretrained(
            model_id, token=huggingface_token, torch_dtype=dtype, device_map=device
        )

    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=200)

    # generate() blocks, so run it in a background thread and consume
    # the streamer on this one, yielding the accumulated text so far.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    generated_output = ""
    for new_text in streamer:
        generated_output += new_text
        yield generated_output
    thread.join()


generate_text.zerogpu = True


@spaces.GPU(duration=60)
def call_generate_text(message, history):
    messages = history + [{"role": "user", "content": message}]
    try:
        for text in generate_text(messages):
            yield text
    except RuntimeError as e:
        print(f"An unexpected error occurred: {e}")
        yield ""


demo = gr.ChatInterface(call_generate_text, type="messages")

if __name__ == "__main__":
    demo.launch()