from transformers import AutoTokenizer, AutoModelForCausalLM, GPTQConfig, pipeline
from flask import Flask, request, jsonify

from threading import Thread
from typing import Iterator

import spaces
import torch
import os

app = Flask(__name__)

print("Hello welcome to Sema AI", flush=True)  # Flush to ensure immediate output

@app.route("/")
def hello():
    return "hello 🤗, Welcome to Sema AI Chat Service."
    
# Get Hugging Face credentials from environment variables
email = os.getenv('HF_EMAIL')
password = os.getenv('HF_PASS')
GEMMA_TOKEN = os.getenv("GEMMA_TOKEN")
#print(f"email is {email} and password is {password}", flush=True)

if not (email and password and GEMMA_TOKEN):
    print("Missing HF_EMAIL, HF_PASS, or GEMMA_TOKEN environment variables", flush=True)

"""
MAX_MAX_NEW_TOKENS = 2048
DEFAULT_MAX_NEW_TOKENS = 1024
MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))

model_id = "google/gemma-2-2b-it"
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

tokenizer = GemmaTokenizerFast.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
model.config.sliding_window = 4096
model.eval()
"""


model_id = "google/gemma-2-2b-it"
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# AutoTokenizer.from_pretrained expects a model id (not a model instance) and has no `device` argument
tokenizer = AutoTokenizer.from_pretrained(model_id, token=GEMMA_TOKEN)

quantization_config = GPTQConfig(
    bits=4,
    group_size=128,
    dataset="c4",  # calibration datasets used in the GPTQ paper: 'wikitext2', 'c4', 'c4-new', 'ptb', 'ptb-new'
    desc_act=False,
    tokenizer=tokenizer,
    batch_size=1,
)
quantized = False
if quantized:
    # Quantize to 4-bit GPTQ at load time, calibrating on the dataset configured above
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        token=GEMMA_TOKEN,
        quantization_config=quantization_config,
        device_map=device,
    )
else:
    # Load the full model in float16
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        token=GEMMA_TOKEN,
        torch_dtype=torch.float16,
        device_map=device,
    )


app_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer
)
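
# Illustrative local smoke test (kept as a comment so it is not executed at startup;
# the example message is hypothetical): Gemma's chat template expects messages shaped
# like [{"role": "user", "content": ...}], the same format the /generate_text route
# below builds from each incoming prompt.
#
#   example_messages = [{"role": "user", "content": "Say hello in Swahili."}]
#   example_prompt = app_pipeline.tokenizer.apply_chat_template(
#       example_messages, tokenize=False, add_generation_prompt=True)
#   print(app_pipeline(example_prompt, max_new_tokens=32)[0]["generated_text"])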

@app.route("/generate_text", methods=["POST"])
def generate_text():
    data = request.json
    prompt = data.get("prompt", "")
    max_new_tokens = data.get("max_new_tokens", 1000)
    do_sample = data.get("do_sample", True)
    temperature = data.get("temperature", 0.1)
    top_k = data.get("top_k", 50)
    top_p = data.get("top_p", 0.95)

    # apply_chat_template expects a list of chat messages, not a raw string
    messages = [{"role": "user", "content": prompt}]
    tokenized_prompt = app_pipeline.tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True)
    outputs = app_pipeline(
        tokenized_prompt,
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p
    )

    return jsonify({"response": outputs[0]["generated_text"][len(tokenized_prompt):]})


if __name__ == "__main__":
    app.run(debug=False, port=8888)

    

"""    
# Flask route to handle incoming chat requests
@app.route('/chat', methods=['POST'])
def chat():
    # Get JSON data from the POST request
    data = request.json
    prompt = data.get('prompt')
    email = data.get('email')
    password = data.get('password')

    print(f"email 2 is {email} and password 2 is {password} and The user wants to Know: {prompt}", flush=True)
    
    if not password:
        return jsonify({"error": "Missing password"}), 400
    elif not prompt:
        return jsonify({"error": "Missing prompt"}), 400
    elif not email:
        return jsonify({"error": "Missing email"}), 400

    # Generate the response
    response = generate_response(prompt, email, password)
    
    # Return the response as JSON
    return jsonify({"response": response})

# Function for generating LLM response
def generate_response(prompt_input, email, passwd):
    # Hugging Face Login
    sign = Login(email, passwd)
    cookies = sign.login()
    # Create ChatBot                        
    chatbot = hugchat.ChatBot(cookies=cookies.get_dict())

    # Simple dialogue structure
    string_dialogue = "You are a helpful assistant."
    string_dialogue += f"\n\nUser: {prompt_input}\n\nAssistant: "

    # Generate and return the response
    return chatbot.chat(string_dialogue)

if __name__ == '__main__':
    app.run(debug=True)
"""