Akjava's picture
Update app.py
0fb257c verified
raw
history blame
2.97 kB
import spaces
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from transformers import TextIteratorStreamer
from threading import Thread
import gradio as gr
# ---------------------------------------------------------------------------
# Runtime configuration
# ---------------------------------------------------------------------------

# Text-generation pipeline handle.  Stays None unless the model is loaded
# eagerly at import time (the ``not is_hugging_face`` path further down).
text_generator = None

# When True the model is (re)loaded inside ``generate_text`` on every call,
# which the original author notes is required for ZeroGPU.
is_hugging_face = True

# Checkpoint to serve.  Alternative: "AXCXEPT/phi-4-deepseek-R1K-RL-EZO".
model_id = "AXCXEPT/phi-4-open-R1-Distill-EZOv1"

# Optional Hub access token taken from the environment.  The value used to be
# overwritten with None right after being read, which made the secret
# impossible to use; an unset or empty variable still yields None.
huggingface_token = os.getenv("HUGGINGFACE_TOKEN") or None

# Target device for both the model and the tokenized inputs.
# Alternative: "auto".
device = "cuda"

# Weight precision.  Alternative: torch.bfloat16.
dtype = torch.float16
# A missing token is not fatal: log a hint and continue, passing the falsy
# token straight through to the tokenizer loader below.
if not huggingface_token:
    print("no HUGGINGFACE_TOKEN if you need set secret ")

# The tokenizer is loaded once at import time and shared by every request.
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    token=huggingface_token,
)

# Echo the effective configuration to the log.
print(model_id, device, dtype)
# Not referenced anywhere in the visible code; kept so the module-level name
# continues to exist.
histories = []

if not is_hugging_face:
    # Eager path: load the model once at import time instead of per request.
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        token=huggingface_token,
        torch_dtype=dtype,
        device_map=device,
    )

    # A pipeline has no ``.to(device)``, so placement is requested through
    # ``device_map`` instead.
    # NOTE(review): ``stream`` does not look like a documented ``pipeline()``
    # argument - confirm it is accepted before relying on this path.
    text_generator = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        torch_dtype=dtype,
        device_map=device,
        stream=True,
    )

    if next(model.parameters()).is_cuda:
        print("The model is on a GPU")
    else:
        print("The model is on a CPU")

    # Compare the device *type* rather than its string form: an indexed
    # device such as "cuda:0" never equals the literal "cuda" and used to be
    # reported as a CPU.
    if text_generator.device.type == "cuda":
        print("The pipeline is using a GPU")
    else:
        print("The pipeline is using a CPU")

print("initialized")
def generate_text(messages, max_new_tokens=200):
    """Stream a chat completion for *messages*, yielding the text so far.

    Args:
        messages: Conversation passed to ``tokenizer.apply_chat_template``
            (presumably a list of ``{"role": ..., "content": ...}`` dicts, as
            built by ``call_generate_text``).
        max_new_tokens: Upper bound on the number of generated tokens.
            Defaults to 200, the value that was previously hard-coded.

    Yields:
        The accumulated generated text after each chunk emitted by the
        streamer, i.e. every yielded string extends the previous one.
    """
    if is_hugging_face:
        # Need to initialize on every call for ZeroGPU.
        active_model = AutoModelForCausalLM.from_pretrained(
            model_id,
            token=huggingface_token,
            torch_dtype=dtype,
            device_map=device,
        )
    else:
        # Reuse the model loaded at import time.  A distinct local name is
        # required: assigning to ``model`` anywhere in this function would
        # make it local and raise UnboundLocalError on this branch.
        active_model = model
    active_model.to(device)

    # Render the conversation to a prompt string, then tokenize it onto the
    # same device as the model.
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    encoded = tokenizer(prompt, return_tensors="pt").to(device)

    # ``generate`` runs in a worker thread and pushes decoded text into the
    # streamer, which this generator consumes.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)
    generation_kwargs = dict(
        encoded, streamer=streamer, max_new_tokens=max_new_tokens
    )
    worker = Thread(target=active_model.generate, kwargs=generation_kwargs)
    worker.start()

    generated_output = ""
    for new_text in streamer:
        generated_output += new_text
        yield generated_output

    # The streamer is exhausted, so generation is finishing; reap the thread.
    worker.join()


# Marker attribute set on the function object.
# NOTE(review): presumably read by the ZeroGPU integration - confirm.
generate_text.zerogpu = True
@spaces.GPU(duration=60)
def call_generate_text(message, history):
    """Chat handler: stream the model's reply to *message*.

    Args:
        message: The new user message.
        history: Earlier conversation turns; the new user turn is appended to
            a copy of it, so the caller's list is not modified.

    Yields:
        Each partial reply produced by ``generate_text``.  If a
        ``RuntimeError`` escapes generation, the error is printed and a
        single empty string is yielded instead.
    """
    user_turn = {"role": "user", "content": message}
    messages = history + [user_turn]

    try:
        for partial_reply in generate_text(messages):
            yield partial_reply
    except RuntimeError as err:
        print(f"An unexpected error occurred: {err}")
        yield ""
# Chat UI wired to the streaming handler, using the "messages" history format.
demo = gr.ChatInterface(fn=call_generate_text, type="messages")

# Only start the server when executed as a script.
if __name__ == "__main__":
    demo.launch()