# from https://huggingface.co/spaces/iiced/mixtral-46.7b-fastapi/blob/main/main.py
# example of use:
# curl -X POST \
# -H "Content-Type: application/json" \
# -d '{
# "prompt": "What is the capital of France?",
# "history": [],
# "system_prompt": "You are a very powerful AI assistant."
# }' \
# https://phk0-bai.hf.space/generate/
import sys

import torch
import uvicorn
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.utils import logging

# torch.mps.empty_cache()
# torch.set_num_threads(1)

logging.set_verbosity_info()
logger = logging.get_logger() # optional: get a logger instance if you want to customize
logger.info("Hugging Face Transformers download started.")
app = FastAPI()
class Item(BaseModel):
    prompt: str
    history: list
    system_prompt: str
    temperature: float = 0.0
    max_new_tokens: int = 900
    top_p: float = 0.15
    repetition_penalty: float = 1.0
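# The JSON request body maps directly onto the fields above; anything left out
# falls back to its default, e.g. sending {"prompt": "...", "history": [],
# "system_prompt": "...", "max_new_tokens": 256} caps the response length.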
def format_prompt(system, message, history):
    # Build the chat message list expected by tokenizer.apply_chat_template.
    # Note: append() is required here; `prompt += {...}` would extend the list
    # with the dict's keys rather than the dict itself.
    prompt = [{"role": "system", "content": system}]
    for user_prompt, bot_response in history:
        prompt.append({"role": "user", "content": user_prompt})
        prompt.append({"role": "assistant", "content": bot_response})
    prompt.append({"role": "user", "content": message})
    return prompt
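# For instance, format_prompt("You are helpful.", "What now?", [("Hi", "Hello!")])
# returns:
#   [{"role": "system", "content": "You are helpful."},
#    {"role": "user", "content": "Hi"},
#    {"role": "assistant", "content": "Hello!"},
#    {"role": "user", "content": "What now?"}]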
def setup():
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # if torch.backends.mps.is_available():
    #     device = torch.device("mps")
    #     x = torch.ones(1, device=device)
    #     print(x)
    # else:
    #     device = "cpu"
    #     print("MPS device not found.")
    # device = "auto"
    # device = torch.device("cpu")
    model_path = "ibm-granite/granite-34b-code-instruct-8k"
    print("Loading tokenizer for model: " + model_path, file=sys.stderr)
    tokenizer = AutoTokenizer.from_pretrained(model_path, cache_dir="/.cache/huggingface")
    print("Loading causal LM model: " + model_path, file=sys.stderr)
    # drop device_map if running on CPU
    model = AutoModelForCausalLM.from_pretrained(model_path, device_map=device, cache_dir="/.cache/huggingface")
    model.eval()
    return model, tokenizer, device
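# Note: device_map=device pins the whole model to a single device. With the
# accelerate package installed, device_map="auto" would instead shard the
# weights across available GPUs and CPU memory (an option, not what this
# Space does).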
def generate(item: Item, model, tokenizer, device):
# device = "cuda" if torch.cuda.is_available() else "cpu"
# model_path = "ibm-granite/granite-34b-code-instruct-8k"
# print("Loading tokenizer for model: " + model_path, file=sys.stderr)
# tokenizer = AutoTokenizer.from_pretrained(model_path, cache_dir="/code/huggingface/transformers")
# # drop device_map if running on CPU
# print("Loading Model for causal LM for model: " + model_path, file=sys.stderr)
# model = AutoModelForCausalLM.from_pretrained(model_path, device_map=device)
# model.eval()
print("Adapting the input into a template...", file=sys.stderr)
# change input text as desired
chat = format_prompt(item.system_prompt, item.prompt, item.history)
chat = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
print("Tokenizing text", file=sys.stderr)
# tokenize the text
input_tokens = tokenizer(chat, return_tensors="pt")
print("Transferring tokens to device: " + device, file=sys.stderr)
# transfer tokenized inputs to the device
for i in input_tokens:
input_tokens[i] = input_tokens[i].to(device)
print("Generating output tokens", file=sys.stderr)
# generate output tokens
output = model.generate(**input_tokens, max_new_tokens=900)
print("Decoding output tokens", file=sys.stderr)
output_text = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
return output_text
# load the model once at import time so every request reuses the same instance
model, tokenizer, device = setup()
@app.post("/generate/")
async def generate_text(item: Item):
# return {"response": generate(item)}
return {"response": generate(item, model, tokenizer, device)}
@app.get("/")
async def generate_text_root(item: Item):
return {"response": "try entry point: /generate/"}