# RSPO / app.py
import os

from fastapi import FastAPI
from pydantic import BaseModel
from transformers import LlamaTokenizer
from vllm import LLM, SamplingParams
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ['CUDA_VISIBLE_DEVICES'] = "2"
MODEL_NAME = "RegularizedSelfPlay/sppo_forward1reverse5-0.1-PromptABC-Mistral-7B-Instruct-SPPO-Iter3" # Example: "meta-llama/Llama-2-7b-chat-hf"
HF_TOKEN = os.getenv("HF_API_TOKEN")
# Load the base Mistral chat tokenizer (used only for prompt templating) and the vLLM engine for MODEL_NAME
tokenizer = LlamaTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2", token=HF_TOKEN)
llm = LLM(
    model=MODEL_NAME,
    # revision="1296dc8fd9b21e6424c9c305c06db9ae60c03ace",
    # tokenizer_revision="1296dc8fd9b21e6424c9c305c06db9ae60c03ace",
    tensor_parallel_size=1,
)
tokenizer.pad_token = tokenizer.eos_token
sampling_params = SamplingParams(
    temperature=0.7,
    top_p=0.9,
    seed=2024,
    max_tokens=2048,  # lower this (e.g. 64) for a quick smoke test
)
def generate_response(prompt):
    # Format the prompt with the Mistral-Instruct chat template and end with the
    # generation prompt so the model continues as the assistant.
    # For Mistral-7B-Instruct-v0.2 this renders roughly "<s>[INST] {prompt} [/INST]".
    inputs = tokenizer.apply_chat_template(
        [{"role": "user", "content": prompt}],
        tokenize=False,
        add_generation_prompt=True,
    )
    # vLLM handles tokenization and batching internally; take the first (and only)
    # completion for the single prompt.
    response = llm.generate(
        inputs,
        sampling_params,
    )[0].outputs[0].text
    return response
app = FastAPI()


class PromptRequest(BaseModel):
    prompt: str


@app.post("/generate")
def generate_text(request: PromptRequest):
    response = generate_response(request.prompt)
    return {"response": response}
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)
# print(generate_response('hi I like u'))
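# A minimal sketch of how a client might call the endpoint once the server is
# running; the port, route, and payload shape come from the code above, and the
# example prompt is arbitrary:
#
#   curl -X POST http://localhost:8000/generate \
#        -H "Content-Type: application/json" \
#        -d '{"prompt": "Explain self-play fine-tuning in one paragraph."}'
#
# Expected response shape: {"response": "<generated text>"}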