from transformers import AutoTokenizer, AutoModelForCausalLM, GPTQConfig, pipeline
from flask import Flask, request, jsonify
from threading import Thread
from typing import Iterator
import spaces
import torch
import os

app = Flask(__name__)

print("Hello welcome to Sema AI", flush=True)  # Flush to ensure immediate output


@app.route("/")
def hello():
    return "hello 🤗, Welcome to Sema AI Chat Service."


# Get Hugging Face credentials from environment variables
email = os.getenv("HF_EMAIL")
password = os.getenv("HF_PASS")
GEMMA_TOKEN = os.getenv("GEMMA_TOKEN")
# print(f"email is {email} and password is {password}", flush=True)

if not (email and password and GEMMA_TOKEN):
    print("Missing dependencies: set HF_EMAIL, HF_PASS and GEMMA_TOKEN", flush=True)

"""
MAX_MAX_NEW_TOKENS = 2048
DEFAULT_MAX_NEW_TOKENS = 1024
MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))

model_id = "google/gemma-2-2b-it"
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
tokenizer = GemmaTokenizerFast.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
model.config.sliding_window = 4096
model.eval()
"""

model_id = "google/gemma-2-2b-it"
device = "cuda:0" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(model_id, token=GEMMA_TOKEN)

quantization_config = GPTQConfig(
    bits=4,
    group_size=128,
    dataset="c4",  # calibration datasets used in the GPTQ paper: 'wikitext2', 'c4', 'c4-new', 'ptb', 'ptb-new'
    desc_act=False,
    tokenizer=tokenizer,
    batch_size=1,
)

quantized = False
if quantized:
    # Quantize the model on the fly with 4-bit GPTQ
    model = AutoModelForCausalLM.from_pretrained(
        pretrained_model_name_or_path=model_id,
        token=GEMMA_TOKEN,
        quantization_config=quantization_config,
        device_map=device,
    )
else:
    # Load the model in half precision without quantization
    model = AutoModelForCausalLM.from_pretrained(
        pretrained_model_name_or_path=model_id,
        token=GEMMA_TOKEN,
        torch_dtype=torch.float16,
        device_map=device,
    )

app_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)


@app.route("/generate_text", methods=["POST"])
def generate_text():
    data = request.json
    prompt = data.get("prompt", "")
    max_new_tokens = data.get("max_new_tokens", 1000)
    do_sample = data.get("do_sample", True)
    temperature = data.get("temperature", 0.1)
    top_k = data.get("top_k", 50)
    top_p = data.get("top_p", 0.95)

    # Wrap the raw prompt in a single-turn chat and apply the model's chat template
    messages = [{"role": "user", "content": prompt}]
    tokenized_prompt = app_pipeline.tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    outputs = app_pipeline(
        tokenized_prompt,
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
    )
    # Strip the prompt so only the newly generated text is returned
    return jsonify({"response": outputs[0]["generated_text"][len(tokenized_prompt):]})


if __name__ == "__main__":
    app.run(debug=False, port=8888)

"""
# Legacy hugchat-based implementation, kept for reference.
# Requires: from hugchat import hugchat; from hugchat.login import Login

# Flask route to handle incoming chat requests
@app.route('/chat', methods=['POST'])
def chat():
    # Get JSON data from the POST request
    data = request.json
    prompt = data.get('prompt')
    email = data.get('email')
    password = data.get('password')

    print(f"email 2 is {email} and password 2 is {password} and the user wants to know: {prompt}", flush=True)

    if not password:
        return jsonify({"error": "Missing password"}), 400
    elif not prompt:
        return jsonify({"error": "Missing prompt"}), 400
    elif not email:
        return jsonify({"error": "Missing email"}), 400

    # Generate the response
    response = generate_response(prompt, email, password)

    # Return the response as JSON
    return jsonify({"response": response})


# Function for generating LLM response
def generate_response(prompt_input, email, passwd):
    # Hugging Face Login
    sign = Login(email, passwd)
    cookies = sign.login()

    # Create ChatBot
    chatbot = hugchat.ChatBot(cookies=cookies.get_dict())

    # Simple dialogue structure
    string_dialogue = "You are a helpful assistant."
    string_dialogue += f"\n\nUser: {prompt_input}\n\nAssistant: "

    # Generate and return the response
    return chatbot.chat(string_dialogue)


if __name__ == '__main__':
    app.run(debug=True)
"""
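# ---------------------------------------------------------------------------
# Example client call (a minimal sketch, not part of the service itself).
# Assumes the Flask app above is running locally on port 8888; the JSON
# fields mirror the defaults read in generate_text(). Uncomment to try it.
# ---------------------------------------------------------------------------
# import requests
#
# resp = requests.post(
#     "http://127.0.0.1:8888/generate_text",
#     json={
#         "prompt": "Give a one-sentence introduction to Sema AI.",
#         "max_new_tokens": 256,
#         "temperature": 0.7,
#     },
#     timeout=300,
# )
# resp.raise_for_status()
# print(resp.json()["response"])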