# from https://huggingface.co/spaces/iiced/mixtral-46.7b-fastapi/blob/main/main.py
# example of use:
# curl -X POST \
#   -H "Content-Type: application/json" \
#   -d '{
#         "prompt": "What is the capital of France?",
#         "history": [],
#         "system_prompt": "You are a very powerful AI assistant."
#       }' \
#   https://phk0-bai.hf.space/generate/
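# For reference, a minimal Python client equivalent to the curl call above
# (sketch only; the URL and payload simply mirror the example and assume the
# Space is deployed at that address):
#
#   import requests
#   resp = requests.post(
#       "https://phk0-bai.hf.space/generate/",
#       json={
#           "prompt": "What is the capital of France?",
#           "history": [],
#           "system_prompt": "You are a very powerful AI assistant.",
#       },
#   )
#   print(resp.json()["response"])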

from fastapi import FastAPI
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer
import sys
import uvicorn
import torch
# torch.mps.empty_cache()
# torch.set_num_threads(1)

from transformers.utils import logging
logging.set_verbosity_info()
logger = logging.get_logger()  # optional: get a logger instance if you want to customize
logger.info("Hugging Face Transformers download started.")


app = FastAPI()

class Item(BaseModel):
    prompt: str
    history: list
    system_prompt: str
    temperature: float = 0.0
    max_new_tokens: int = 900
    top_p: float = 0.15
    repetition_penalty: float = 1.0

def format_prompt(system, message, history):
    # Build a chat message list: system prompt, then alternating user/assistant turns.
    # Note: appending dicts (not using `+=` with a dict, which would only add its keys).
    prompt = [{"role": "system", "content": system}]
    for user_prompt, bot_response in history:
        prompt.append({"role": "user", "content": user_prompt})
        prompt.append({"role": "assistant", "content": bot_response})
    prompt.append({"role": "user", "content": message})
    return prompt
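
# For illustration, the message structure format_prompt returns
# (the values below are made up):
#   format_prompt("You are helpful.", "Hi", [("Hello", "Hi there!")])
#   -> [{"role": "system", "content": "You are helpful."},
#       {"role": "user", "content": "Hello"},
#       {"role": "assistant", "content": "Hi there!"},
#       {"role": "user", "content": "Hi"}]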

def setup():
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # if torch.backends.mps.is_available():
    #     device = torch.device("mps")
    #     x = torch.ones(1, device=device)
    #     print (x)
    # else:
    #     device="cpu"
    #     print ("MPS device not found.")
    
    # device = "auto"
    # device=torch.device("cpu")
    
    model_path = "ibm-granite/granite-34b-code-instruct-8k"
    print("Loading tokenizer for model: " + model_path, file=sys.stderr)
    tokenizer = AutoTokenizer.from_pretrained(model_path, cache_dir="/.cache/huggingface")
    
    print("Loading Model for causal LM for model: " + model_path, file=sys.stderr)
    # drop device_map if running on CPU
    model = AutoModelForCausalLM.from_pretrained(model_path, device_map=device, cache_dir="/.cache/huggingface")
    model.eval()
    
    return model, tokenizer, device

def generate(item: Item, model, tokenizer, device):
    # device = "cuda" if torch.cuda.is_available() else "cpu"

    # model_path = "ibm-granite/granite-34b-code-instruct-8k"
    
    # print("Loading tokenizer for model: " + model_path, file=sys.stderr)
    # tokenizer = AutoTokenizer.from_pretrained(model_path, cache_dir="/code/huggingface/transformers")
    # # drop device_map if running on CPU
    
    # print("Loading Model for causal LM for model: " + model_path, file=sys.stderr)
    # model = AutoModelForCausalLM.from_pretrained(model_path, device_map=device)
    # model.eval()
    
    print("Adapting the input into a template...", file=sys.stderr)
    # change input text as desired
    chat = format_prompt(item.system_prompt, item.prompt, item.history)
    chat = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
    
    print("Tokenizing text", file=sys.stderr)
    # tokenize the text
    input_tokens = tokenizer(chat, return_tensors="pt")
    
    print("Transferring tokens to device: " + device, file=sys.stderr)
    # transfer tokenized inputs to the device
    for i in input_tokens:
        input_tokens[i] = input_tokens[i].to(device)
        
    print("Generating output tokens", file=sys.stderr)
    # generate output tokens (the other sampling fields on Item, such as
    # temperature/top_p/repetition_penalty, are not forwarded here)
    output = model.generate(**input_tokens, max_new_tokens=item.max_new_tokens)
    
    print("Decoding output tokens", file=sys.stderr)
    output_text = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
    return output_text

model, tokenizer, device = setup()

@app.post("/generate/")
async def generate_text(item: Item):
    # return {"response": generate(item)}
    return {"response": generate(item, model, tokenizer, device)}

@app.get("/")
async def generate_text_root():
    # Plain GET health/info endpoint; no request body required.
    return {"response": "try entry point: /generate/"}