from transformers import AutoTokenizer, AutoModelForCausalLM, GPTQConfig, pipeline
from flask import Flask, request, jsonify
from threading import Thread
from typing import Iterator
import spaces
import torch
import os
app = Flask(__name__)
print("Hello welcome to Sema AI", flush=True) # Flush to ensure immediate output
@app.route("/")
def hello():
    return "Hello 🤗, welcome to Sema AI Chat Service."
# Get Hugging Face credentials from environment variables
email = os.getenv('HF_EMAIL')
password = os.getenv('HF_PASS')
GEMMA_TOKEN = os.getenv("GEMMA_TOKEN")
#print(f"email is {email} and password is {password}", flush=True)
# A bare tuple is always truthy, so the check must use `and` to catch missing values
if not (email and password and GEMMA_TOKEN):
    print("Missing credentials: HF_EMAIL, HF_PASS, or GEMMA_TOKEN is not set", flush=True)
"""
MAX_MAX_NEW_TOKENS = 2048
DEFAULT_MAX_NEW_TOKENS = 1024
MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
model_id = "google/gemma-2-2b-it"
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
tokenizer = GemmaTokenizerFast.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
model.config.sliding_window = 4096
model.eval()
"""
# model_id and device were only defined inside the commented-out block above,
# so define them here for the live code path. Tokenizers are device-agnostic,
# so no device argument is passed.
model_id = "google/gemma-2-2b-it"
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(model_id, token=GEMMA_TOKEN)
quantization_config = GPTQConfig(
    bits=4,
    group_size=128,
    dataset="c4",  # calibration datasets from the GPTQ paper: 'wikitext2', 'c4', 'c4-new', 'ptb', 'ptb-new'
    desc_act=False,
    tokenizer=tokenizer,
    batch_size=1,
)
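# Note (a sketch, not part of the original flow): passing quantization_config to
# from_pretrained below triggers GPTQ calibration on the "c4" dataset at load
# time, which is slow. A common pattern is to quantize once, save, and reload
# the pre-quantized weights; the directory name here is hypothetical:
#
#   model.save_pretrained("gemma-2-2b-it-gptq-4bit")
#   tokenizer.save_pretrained("gemma-2-2b-it-gptq-4bit")
#   model = AutoModelForCausalLM.from_pretrained("gemma-2-2b-it-gptq-4bit", device_map=device)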
quantized = False
if quantized:
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        token=GEMMA_TOKEN,
        quantization_config=quantization_config,
        device_map=device,
    )
else:
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        token=GEMMA_TOKEN,
        torch_dtype=torch.float16,
        device_map=device,
    )
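# Assumption worth flagging: Gemma 2 checkpoints are published in bfloat16, and
# float16 can be numerically less stable for this model family; if outputs
# degrade, torch_dtype=torch.bfloat16 may be the safer choice on GPUs that
# support it.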
app_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)
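# Optional smoke test (a minimal sketch, assuming the model loaded above):
# uncomment to verify the pipeline end to end before serving HTTP requests.
#
#   _probe = app_pipeline("Hello, who are you?", max_new_tokens=20)
#   print(_probe[0]["generated_text"], flush=True)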
@app.route("/generate_text", methods=["POST"])
def generate_Text():
data = request.json
prompt = data.get("prompt", "")
max_new_tokens = data.get("max_new_tokens", 1000)
do_sample = data.get("do_sample", True)
temperature = data.get("temperature", 0.1)
top_k = data.get("top_k", 50)
top_p = data.get("top_p", 0.95)
tokenized_prompt = app_pipeline.tokenizer.apply_chat_template(
prompt, tokenize=False, add_generation_prompt=True)
outputs = app_pipeline(
tokenized_prompt,
max_new_tokens=max_new_tokens,
do_sample=do_sample,
temperature=temperature,
top_k=top_k,
top_p=top_p
)
return jsonify({"response": outputs[0]["generated_text"][len(tokenized_prompt):]})
if __name__ == "__main__":
app.run(debug=False, port=8888)
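# Example client call (a sketch; assumes the server is running locally on the
# port configured above):
#
#   import requests
#   r = requests.post("http://localhost:8888/generate_text",
#                     json={"prompt": "Explain GPTQ quantization in one sentence.",
#                           "max_new_tokens": 128})
#   print(r.json()["response"])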
"""
# Flask route to handle incoming chat requests
@app.route('/chat', methods=['POST'])
def chat():
# Get JSON data from the POST request
data = request.json
prompt = data.get('prompt')
email = data.get('email')
password = data.get('password')
print(f"email 2 is {email} and password 2 is {password} and The user wants to Know: {prompt}", flush=True)
if not (password):
return jsonify({"error": "Missing password"}), 400
elif not (prompt):
return jsonify({"error": "Missing prompt"}), 400
elif not (email):
return jsonify({"error": "Missing email"}), 400
else:
return jsonify({"error": "Missing prompt, email, or password"}), 400
# Generate the response
response = generate_response(prompt, email, password)
# Return the response as JSON
return jsonify({"response": response})
# Function for generating LLM response
def generate_response(prompt_input, email, passwd):
# Hugging Face Login
sign = Login(email, passwd)
cookies = sign.login()
# Create ChatBot
chatbot = hugchat.ChatBot(cookies=cookies.get_dict())
# Simple dialogue structure
string_dialogue = "You are a helpful assistant."
string_dialogue += f"\n\nUser: {prompt_input}\n\nAssistant: "
# Generate and return the response
return chatbot.chat(string_dialogue)
if __name__ == '__main__':
app.run(debug=True)
""" |