llamaSMS / main.py
from transformers import AutoTokenizer, AutoModelForCausalLM, GPTQConfig, pipeline
from flask import Flask, request, jsonify
from threading import Thread
from typing import Iterator
import spaces
import torch
import os
app = Flask(__name__)
print("Hello welcome to Sema AI", flush=True) # Flush to ensure immediate output
@app.route("/")
def hello():
return "hello 🤗, Welcome to Sema AI Chat Service."
# Get Hugging Face credentials from environment variables
email = os.getenv('HF_EMAIL')
password = os.getenv('HF_PASS')
GEMMA_TOKEN = os.getenv("GEMMA_TOKEN")
#print(f"email is {email} and password is {password}", flush=True)
if not (email and password and GEMMA_TOKEN):
    print("Missing credentials: HF_EMAIL, HF_PASS, or GEMMA_TOKEN", flush=True)
"""
MAX_MAX_NEW_TOKENS = 2048
DEFAULT_MAX_NEW_TOKENS = 1024
MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
model_id = "google/gemma-2-2b-it"
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
tokenizer = GemmaTokenizerFast.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
model.config.sliding_window = 4096
model.eval()
"""
model_id = "google/gemma-2-2b-it"
device = "cuda:0" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained(model_id, token=GEMMA_TOKEN)
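# Optional 4-bit GPTQ quantization settings; they are only applied when `quantized` is set to True below.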
quantization_config = GPTQConfig(
    bits=4,
    group_size=128,
    dataset="c4",  # the original datasets used in the GPTQ paper: ['wikitext2', 'c4', 'c4-new', 'ptb', 'ptb-new']
    desc_act=False,
    tokenizer=tokenizer,
    batch_size=1,
)
quantized = False
if quantized:
    model = AutoModelForCausalLM.from_pretrained(
        pretrained_model_name_or_path=model_id,
        token=GEMMA_TOKEN,
        quantization_config=quantization_config,
        device_map=device,
    )
else:
    model = AutoModelForCausalLM.from_pretrained(
        pretrained_model_name_or_path=model_id,
        token=GEMMA_TOKEN,
        torch_dtype=torch.float16,
        device_map=device,
    )
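# Wrap the loaded Gemma model and its tokenizer in a text-generation pipeline used by the /generate_text route below.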
app_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)
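# POST /generate_text expects a JSON body such as:
#   {"prompt": "...", "max_new_tokens": 1000, "do_sample": true,
#    "temperature": 0.1, "top_k": 50, "top_p": 0.95}
# Only "prompt" is strictly needed; the other fields fall back to the defaults read below.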
@app.route("/generate_text", methods=["POST"])
def generate_Text():
data = request.json
prompt = data.get("prompt", "")
max_new_tokens = data.get("max_new_tokens", 1000)
do_sample = data.get("do_sample", True)
temperature = data.get("temperature", 0.1)
top_k = data.get("top_k", 50)
top_p = data.get("top_p", 0.95)
tokenized_prompt = app_pipeline.tokenizer.apply_chat_template(
prompt, tokenize=False, add_generation_prompt=True)
outputs = app_pipeline(
tokenized_prompt,
max_new_tokens=max_new_tokens,
do_sample=do_sample,
temperature=temperature,
top_k=top_k,
top_p=top_p
)
return jsonify({"response": outputs[0]["generated_text"][len(tokenized_prompt):]})
if __name__ == "__main__":
    app.run(debug=False, port=8888)
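# Example client call (a minimal sketch, assuming the service is reachable at
# http://localhost:8888 as configured above; the `requests` package is an extra
# dependency not imported by this app, and the prompt text is illustrative):
#
#   import requests
#
#   payload = {"prompt": "What can Sema AI do?", "max_new_tokens": 200}
#   resp = requests.post("http://localhost:8888/generate_text", json=payload)
#   print(resp.json()["response"])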
"""
# Flask route to handle incoming chat requests
@app.route('/chat', methods=['POST'])
def chat():
    # Get JSON data from the POST request
    data = request.json
    prompt = data.get('prompt')
    email = data.get('email')
    password = data.get('password')
    print(f"email 2 is {email} and password 2 is {password}; the user wants to know: {prompt}", flush=True)
    if not password:
        return jsonify({"error": "Missing password"}), 400
    elif not prompt:
        return jsonify({"error": "Missing prompt"}), 400
    elif not email:
        return jsonify({"error": "Missing email"}), 400
    # Generate the response
    response = generate_response(prompt, email, password)
    # Return the response as JSON
    return jsonify({"response": response})
# Function for generating the LLM response (requires the `hugchat` package:
# from hugchat import hugchat; from hugchat.login import Login)
def generate_response(prompt_input, email, passwd):
    # Hugging Face login
    sign = Login(email, passwd)
    cookies = sign.login()
    # Create the ChatBot from the login cookies
    chatbot = hugchat.ChatBot(cookies=cookies.get_dict())
    # Simple dialogue structure
    string_dialogue = "You are a helpful assistant."
    string_dialogue += f"\n\nUser: {prompt_input}\n\nAssistant: "
    # Generate and return the response
    return chatbot.chat(string_dialogue)
if __name__ == '__main__':
    app.run(debug=True)
"""