import spaces
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import gradio as gr
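
# Gradio chat app intended for a Hugging Face ZeroGPU Space: the model is
# loaded inside the @spaces.GPU-decorated generate_text() so a GPU is attached
# only while generating, and gr.ChatInterface(type="messages") passes the chat
# history as a list of {"role": ..., "content": ...} dicts.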

text_generator = None
is_hugging_face = True  # True when running on a Hugging Face ZeroGPU Space

model_id = "google/gemma-2-9b-it"
#model_id = "microsoft/Phi-3-mini-128k-instruct"

huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
device = "cuda"  # ZeroGPU exposes CUDA only inside @spaces.GPU functions
dtype = torch.bfloat16
tokenizer = None

def init():
    global text_generator, tokenizer
    if not huggingface_token:
        print("HUGGINGFACE_TOKEN is not set; add it as a secret if the model requires authentication")
        #raise ValueError("HUGGINGFACE_TOKEN environment variable is not set")

    tokenizer = AutoTokenizer.from_pretrained(model_id, token=huggingface_token)

    print(model_id, device, dtype)

    if not is_hugging_face:
        model = AutoModelForCausalLM.from_pretrained(
            model_id, token=huggingface_token, torch_dtype=dtype, device_map=device
        )
        # The pipeline has no .to(device); placement is handled via device_map.
        text_generator = pipeline("text-generation", model=model, tokenizer=tokenizer, torch_dtype=dtype, device_map=device)

        if next(model.parameters()).is_cuda:
            print("The model is on a GPU")
        else:
            print("The model is on a CPU")

        if str(text_generator.device).startswith("cuda"):
            print("The pipeline is using a GPU")
        else:
            print("The pipeline is using a CPU")

    print("initialized")

@spaces.GPU(duration=120)
def generate_text(messages):
    global text_generator
    if is_hugging_face:  # ZeroGPU: the model must be loaded inside the GPU-decorated function on every call
        model = AutoModelForCausalLM.from_pretrained(
            model_id, token=huggingface_token, torch_dtype=dtype, device_map=device
        )
        text_generator = pipeline("text-generation", model=model, tokenizer=tokenizer, torch_dtype=dtype, device_map=device)
    result = text_generator(messages, max_new_tokens=256, do_sample=True, temperature=0.7)

    generated_output = result[0]["generated_text"]
    if isinstance(generated_output, list):
        # Chat-style input returns the whole conversation; take the last assistant turn.
        for message in reversed(generated_output):
            if message.get("role") == "assistant":
                return message.get("content", "No content found.")
        return "No assistant response found."
    else:
        return "Unexpected output format."



def call_generate_text(message, history):
    print(message)
    print(history)

    messages = history + [{"role": "user", "content": message}]
    try:
        return generate_text(messages)
    except RuntimeError as e:
        print(f"An unexpected error occurred: {e}")
    return ""

demo = gr.ChatInterface(call_generate_text, type="messages")

if __name__ == "__main__":
    init()
    demo.launch(share=True)