Spaces:

jikoni
/

llamaSMS

Sleeping

App Files Files Community

Tri4 commited on Aug 11, 2024

Commit

9d3365a

verified ·

1 Parent(s): 6ab5056

Update main.py

Browse files

Files changed (1) hide show

main.py +72 -4

main.py CHANGED Viewed

@@ -12,19 +12,27 @@ app = Flask(__name__)
 print("Hello welcome to Sema AI", flush=True)  # Flush to ensure immediate output
 # Get Hugging Face credentials from environment variables
 email = os.getenv('HF_EMAIL')
 password = os.getenv('HF_PASS')
 GEMMA_TOKEN = os.getenv("GEMMA_TOKEN")
 #print(f"email is {email} and password is {password}", flush=True)
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-model_id = "google/gemma-2-2b-it"
 tokenizer = GemmaTokenizerFast.from_pretrained(model_id)
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
@@ -33,11 +41,70 @@ model = AutoModelForCausalLM.from_pretrained(
 )
 model.config.sliding_window = 4096
 model.eval()
-@app.route("/")
-def hello():
-    return "hello 🤗, Welcome to Sema AI Chat Service."
 # Flask route to handle incoming chat requests
 @app.route('/chat', methods=['POST'])
 def chat():
@@ -81,3 +148,4 @@ def generate_response(prompt_input, email, passwd):
 if __name__ == '__main__':
     app.run(debug=True)

 print("Hello welcome to Sema AI", flush=True)  # Flush to ensure immediate output
+@app.route("/")
+def hello():
+    return "hello 🤗, Welcome to Sema AI Chat Service."
 # Get Hugging Face credentials from environment variables
 email = os.getenv('HF_EMAIL')
 password = os.getenv('HF_PASS')
 GEMMA_TOKEN = os.getenv("GEMMA_TOKEN")
 #print(f"email is {email} and password is {password}", flush=True)
+if not (email, password,GEMMA_TOKEN):
+    print("no dependacies", flush=True)
+"""
 MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
+model_id = "google/gemma-2-2b-it"
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 tokenizer = GemmaTokenizerFast.from_pretrained(model_id)
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
 )
 model.config.sliding_window = 4096
 model.eval()
+"""
+tokenizer = AutoTokenizer.from_pretrained(model, token=GEMMA_TOKEN, device=device)
+quantization_config = GPTQConfig(
+     bits=4,
+     group_size=128,
+     dataset="c4", # the original datasets used in GPTQ paper [‘wikitext2’,‘c4’,‘c4-new’,‘ptb’,‘ptb-new’]
+     desc_act=False,
+     tokenizer=tokenizer,
+     batch_size=1,
+)
+quantized=False
+if quantized:
+    model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path="google/gemma-2-2b-it",
+                                                 token=GEMMA_TOKEN,
+                                                 quantization_config=quantization_config,
+                                                 device_map=device
+                                                 )
+else:
+    model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path="google/gemma-2-2b-it",
+                                                token=GEMMA_TOKEN,
+                                                torch_dtype=torch.float16,
+                                                device_map=device
+                                                )
+app_pipeline = pipeline(
+    "text-generation",
+    model=model,
+    tokenizer=tokenizer
+)
+@app.route("/generate_text", methods=["POST"])
+def generate_Text():
+    data = request.json
+    prompt = data.get("prompt", "")
+    max_new_tokens = data.get("max_new_tokens", 1000)
+    do_sample = data.get("do_sample", True)
+    temperature = data.get("temperature", 0.1)
+    top_k = data.get("top_k", 50)
+    top_p = data.get("top_p", 0.95)
+    tokenized_prompt = app_pipeline.tokenizer.apply_chat_template(
+        prompt, tokenize=False, add_generation_prompt=True)
+    outputs = app_pipeline(
+        tokenized_prompt,
+        max_new_tokens=max_new_tokens,
+        do_sample=do_sample,
+        temperature=temperature,
+        top_k=top_k,
+        top_p=top_p
+    )
+    return jsonify({"response": outputs[0]["generated_text"][len(tokenized_prompt):]})
+if __name__ == "__main__":
+    app.run(debug=False, port=8888)
+"""
 # Flask route to handle incoming chat requests
 @app.route('/chat', methods=['POST'])
 def chat():
 if __name__ == '__main__':
     app.run(debug=True)
+"""