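# Gradio ChatInterface demo for google/gemma-2-2b-it, written for a Hugging Face ZeroGPU Space.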
import spaces
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from transformers import TextStreamer
import gradio as gr

text_generator = None
is_hugging_face = True  # running on a Hugging Face ZeroGPU Space (model is loaded per request)
# model_id = "google/gemma-2-9b-it"  # larger variant, needs more GPU memory
model_id = "google/gemma-2-2b-it"
huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
device = "cuda"  # or "auto" / torch.device("cuda" if torch.cuda.is_available() else "cpu")
dtype = torch.bfloat16

if not huggingface_token:
    print("HUGGINGFACE_TOKEN is not set; add it as a Space secret if the model requires authentication")
    # raise ValueError("HUGGINGFACE_TOKEN environment variable is not set")

tokenizer = AutoTokenizer.from_pretrained(model_id, token=huggingface_token)

print(model_id, device, dtype)
histories = []



if not is_hugging_face:
    # Not on ZeroGPU: load the model once at startup and keep it resident.
    model = AutoModelForCausalLM.from_pretrained(
        model_id, token=huggingface_token, torch_dtype=dtype, device_map=device
    )
    # The pipeline handles device placement itself, so no .to(device) is needed.
    text_generator = pipeline(
        "text-generation", model=model, tokenizer=tokenizer,
        torch_dtype=dtype, device_map=device,
    )

    if next(model.parameters()).is_cuda:
        print("The model is on a GPU")
    else:
        print("The model is on a CPU")

    if str(text_generator.device).startswith("cuda"):
        print("The pipeline is using a GPU")
    else:
        print("The pipeline is using a CPU")

print("initialized")

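# On a ZeroGPU Space, @spaces.GPU attaches a GPU only while the decorated
# function runs (here for up to 60 seconds per call).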
@spaces.GPU(duration=60)
def generate_text(messages):
    if is_hugging_face:  # the model must be reloaded on every call under ZeroGPU
        model = AutoModelForCausalLM.from_pretrained(
            model_id, token=huggingface_token, torch_dtype=dtype, device_map=device
        )
    streamer = TextStreamer(tokenizer, skip_prompt=True)
    text_generator = pipeline(
        "text-generation", model=model, tokenizer=tokenizer,
        torch_dtype=dtype, device_map=device, streamer=streamer,
    )
    result = text_generator(messages, max_new_tokens=256, do_sample=True, temperature=0.7)
    print(f"result={result}")
    # With chat-style input the pipeline returns the whole conversation;
    # the assistant reply is the last message in "generated_text".
    generated_output = result[0]["generated_text"][-1]["content"]
    yield generated_output


def call_generate_text(message, history):
    print(message)
    print(history)

    messages = history + [{"role": "user", "content": message}]
    try:
        for text in generate_text(messages):
            yield text
    except RuntimeError as e:
        print(f"An unexpected error occurred: {e}")
        yield ""

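# type="messages" makes Gradio pass the history as a list of
# {"role": ..., "content": ...} dicts, matching the chat format the pipeline expects.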
demo = gr.ChatInterface(call_generate_text, type="messages")

if __name__ == "__main__":
    demo.launch(share=True)