"""
#git+https://github.com/huggingface/transformers
transformers==4.43.1
huggingface_hub
bitsandbytes
accelerate
langchain
torch
flask 
gunicorn
twilio
baseten 
spaces
"""




from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from flask import Flask, request, jsonify
import torch
import os

app = Flask(__name__)

print("Hello welcome to Sema AI", flush=True)  # Flush to ensure immediate output

@app.route("/")
def hello():
    return "hello 🤗, Welcome to Sema AI Chat Service."

# Get the Hugging Face access token from the environment
HF_TOKEN = os.getenv("HF_TOKEN")

if not HF_TOKEN:
    print("Warning: HF_TOKEN is not set; downloading gated models such as Gemma will fail.", flush=True)

model_id = "google/gemma-2-2b-it"
device = "cuda:0" if torch.cuda.is_available() else "cpu"  # informational only; device_map="auto" below handles placement

# Load tokenizer and model with authentication token
tokenizer = AutoTokenizer.from_pretrained(model_id, token=HF_TOKEN)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.float16,
    token=HF_TOKEN
)

app_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer
)
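
# Optional smoke test (a sketch; uncomment to confirm the model generates
# before serving traffic):
# _probe = app_pipeline("Hello, Sema AI!", max_new_tokens=20)
# print(_probe[0]["generated_text"], flush=True)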

@app.route("/generate_text", methods=["POST"])
def generate_text():
    # Parse the JSON body, falling back to an empty dict if none was sent
    data = request.get_json(silent=True) or {}
    prompt = data.get("prompt", "")
    max_new_tokens = data.get("max_new_tokens", 1000)
    do_sample = data.get("do_sample", True)
    temperature = data.get("temperature", 0.1)
    top_k = data.get("top_k", 50)
    top_p = data.get("top_p", 0.95)
    print(f"Prompt: {prompt}", flush=True)

    try:
        outputs = app_pipeline(
            prompt,
            max_new_tokens=max_new_tokens,
            do_sample=do_sample,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p
        )
        # generated_text contains the prompt followed by the model's completion
        response_text = outputs[0]["generated_text"]
    except Exception as e:
        return jsonify({"error": str(e)}), 500

    return jsonify({"response": response_text})

if __name__ == "__main__":
    # Development server only; run behind gunicorn in production (see dependency list above)
    app.run(debug=False, port=8888)
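

# Example client call (an illustrative sketch, assuming the server is reachable
# at http://localhost:8888; the `requests` package is an extra dependency not
# listed above):
#
#   import requests
#   resp = requests.post(
#       "http://localhost:8888/generate_text",
#       json={"prompt": "Hello, who are you?", "max_new_tokens": 128},
#       timeout=300,
#   )
#   print(resp.json()["response"])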