from transformers import AutoTokenizer, AutoModelForCausalLM, GPTQConfig, pipeline
from flask import Flask, request, jsonify

from threading import Thread
from typing import Iterator

import spaces
import torch
import os

app = Flask(__name__)

print("Hello welcome to Sema AI", flush=True)  # Flush to ensure immediate output

@app.route("/")
def hello():
    return "hello 🤗, Welcome to Sema AI Chat Service."
    
# Get Hugging Face credentials from environment variables
email = os.getenv('HF_EMAIL')
password = os.getenv('HF_PASS')
GEMMA_TOKEN = os.getenv("GEMMA_TOKEN")
#print(f"email is {email} and password is {password}", flush=True)

if not (email and password and GEMMA_TOKEN):
    print("Missing HF_EMAIL, HF_PASS, or GEMMA_TOKEN environment variables", flush=True)

"""
MAX_MAX_NEW_TOKENS = 2048
DEFAULT_MAX_NEW_TOKENS = 1024
MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))

model_id = "google/gemma-2-2b-it"
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

tokenizer = GemmaTokenizerFast.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
model.config.sliding_window = 4096
model.eval()
"""


model_id = "google/gemma-2-2b-it"
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# AutoTokenizer.from_pretrained expects a model id (not a model instance) and has no `device` argument
tokenizer = AutoTokenizer.from_pretrained(model_id, token=GEMMA_TOKEN)

quantization_config = GPTQConfig(
    bits=4,
    group_size=128,
    dataset="c4",  # calibration datasets used in the GPTQ paper: 'wikitext2', 'c4', 'c4-new', 'ptb', 'ptb-new'
    desc_act=False,
    tokenizer=tokenizer,
    batch_size=1,
)
quantized = False
if quantized:
    # Quantize to 4-bit GPTQ at load time, calibrating on the dataset configured above
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        token=GEMMA_TOKEN,
        quantization_config=quantization_config,
        device_map=device,
    )
else:
    # Load the full model in float16
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        token=GEMMA_TOKEN,
        torch_dtype=torch.float16,
        device_map=device,
    )


app_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer
)
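
# Illustrative local smoke test (kept as a comment so it is not executed at startup;
# the example message is hypothetical): Gemma's chat template expects messages shaped
# like [{"role": "user", "content": ...}], the same format the /generate_text route
# below builds from each incoming prompt.
#
#   example_messages = [{"role": "user", "content": "Say hello in Swahili."}]
#   example_prompt = app_pipeline.tokenizer.apply_chat_template(
#       example_messages, tokenize=False, add_generation_prompt=True)
#   print(app_pipeline(example_prompt, max_new_tokens=32)[0]["generated_text"])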

@app.route("/generate_text", methods=["POST"])
def generate_text():
    data = request.json
    prompt = data.get("prompt", "")
    max_new_tokens = data.get("max_new_tokens", 1000)
    do_sample = data.get("do_sample", True)
    temperature = data.get("temperature", 0.1)
    top_k = data.get("top_k", 50)
    top_p = data.get("top_p", 0.95)

    # apply_chat_template expects a list of chat messages, not a raw string
    messages = [{"role": "user", "content": prompt}]
    tokenized_prompt = app_pipeline.tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True)
    outputs = app_pipeline(
        tokenized_prompt,
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p
    )

    return jsonify({"response": outputs[0]["generated_text"][len(tokenized_prompt):]})


if __name__ == "__main__":
    app.run(debug=False, port=8888)

    

"""    
# Flask route to handle incoming chat requests
@app.route('/chat', methods=['POST'])
def chat():
    # Get JSON data from the POST request
    data = request.json
    prompt = data.get('prompt')
    email = data.get('email')
    password = data.get('password')

    print(f"email 2 is {email} and password 2 is {password} and The user wants to Know: {prompt}", flush=True)
    
    if not password:
        return jsonify({"error": "Missing password"}), 400
    elif not prompt:
        return jsonify({"error": "Missing prompt"}), 400
    elif not email:
        return jsonify({"error": "Missing email"}), 400

    # Generate the response
    response = generate_response(prompt, email, password)
    
    # Return the response as JSON
    return jsonify({"response": response})

# Function for generating LLM response
def generate_response(prompt_input, email, passwd):
    # Hugging Face Login
    sign = Login(email, passwd)
    cookies = sign.login()
    # Create ChatBot                        
    chatbot = hugchat.ChatBot(cookies=cookies.get_dict())

    # Simple dialogue structure
    string_dialogue = "You are a helpful assistant."
    string_dialogue += f"\n\nUser: {prompt_input}\n\nAssistant: "

    # Generate and return the response
    return chatbot.chat(string_dialogue)

if __name__ == '__main__':
    app.run(debug=True)
"""