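# Gradio ChatInterface demo for google/gemma-2-9b-it on a Hugging Face ZeroGPU Space.
# The model is re-loaded inside the @spaces.GPU-decorated function on every request,
# since ZeroGPU only attaches a GPU for the duration of that call.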
import spaces
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, pipeline
import gradio as gr
text_generator = None
is_hugging_face = True
model_id = "google/gemma-2-9b-it"
huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
device = "auto" # torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = "cuda"
dtype = torch.bfloat16
if not huggingface_token:
    print("HUGGINGFACE_TOKEN is not set; add it as a Space secret if the model requires authentication")
    # raise ValueError("HUGGINGFACE_TOKEN environment variable is not set")
tokenizer = AutoTokenizer.from_pretrained(model_id, token=huggingface_token)
print(model_id, device, dtype)
histories = []
#model = None
if not is_hugging_face:
    model = AutoModelForCausalLM.from_pretrained(
        model_id, token=huggingface_token, torch_dtype=dtype, device_map=device
    )
    # pipeline objects have no .to(device); placement is handled by device_map
    text_generator = pipeline(
        "text-generation", model=model, tokenizer=tokenizer,
        torch_dtype=dtype, device_map=device,
    )

    if next(model.parameters()).is_cuda:
        print("The model is on a GPU")
    else:
        print("The model is on a CPU")

    # print(f"text_generator.device='{text_generator.device}'")
    # the device string is "cuda:0" on GPU, so match the prefix rather than the exact string
    if str(text_generator.device).startswith("cuda"):
        print("The pipeline is using a GPU")
    else:
        print("The pipeline is using a CPU")
print("initialized")
@spaces.GPU(duration=30)
def generate_text(messages):
    global text_generator  # assign the module-level pipeline, not a function-local shadow
    if is_hugging_face:  # ZeroGPU requires re-initializing the model on every call
        model = AutoModelForCausalLM.from_pretrained(
            model_id, token=huggingface_token, torch_dtype=dtype, device_map=device
        )
        streamer = TextStreamer(tokenizer, skip_prompt=True)
        # pipeline has no .to(device); TextStreamer echoes tokens to stdout as they are generated
        text_generator = pipeline(
            "text-generation", model=model, tokenizer=tokenizer,
            torch_dtype=dtype, device_map=device, streamer=streamer,
        )

    result = text_generator(messages, max_new_tokens=256, do_sample=True, temperature=0.7)
    print(result)
    # with chat-format input the pipeline returns the whole conversation;
    # the last message holds the newly generated assistant reply
    generated_output = result[0]["generated_text"][-1]["content"]
    yield generated_output
def call_generate_text(message, history):
    # history.append({"role": "user", "content": message})
    print(message)
    print(history)

    messages = history + [{"role": "user", "content": message}]
    try:
        for text in generate_text(messages):
            yield text
    except RuntimeError as e:
        print(f"An unexpected error occurred: {e}")
        yield ""
demo = gr.ChatInterface(call_generate_text, type="messages")

if __name__ == "__main__":
    demo.launch(share=True)
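# Note for local runs (outside Spaces): per the `spaces` package docs the @spaces.GPU
# decorator is expected to be a no-op off-platform; set HUGGINGFACE_TOKEN in the
# environment if the gated Gemma weights require authentication.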