import os

from fastapi import FastAPI
from pydantic import BaseModel
from transformers import LlamaTokenizer
from vllm import LLM, SamplingParams

# Pin the process to a single GPU before the vLLM engine is created.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

MODEL_NAME = "RegularizedSelfPlay/sppo_forward1reverse5-0.1-PromptABC-Mistral-7B-Instruct-SPPO-Iter3"  # Example: "meta-llama/Llama-2-7b-chat-hf"
HF_TOKEN = os.getenv("HF_API_TOKEN")

# Load the tokenizer (used only to build chat-formatted prompts) and the vLLM engine.
tokenizer = LlamaTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2", token=HF_TOKEN)
llm = LLM(
    model=MODEL_NAME,
    # revision="1296dc8fd9b21e6424c9c305c06db9ae60c03ace",
    # tokenizer_revision="1296dc8fd9b21e6424c9c305c06db9ae60c03ace",
    tensor_parallel_size=1,
)
tokenizer.pad_token = tokenizer.eos_token

sampling_params = SamplingParams(
    temperature=0.7,
    top_p=0.9,
    seed=2024,
    max_tokens=2048,  # use a smaller value (e.g. 64) for quick smoke tests
)


def generate_response(prompt):
    # Build the chat-formatted prompt: append a dummy assistant turn ("None"),
    # render the template as a string, and keep everything before the dummy
    # content so the prompt ends exactly where the assistant reply should begin.
    inputs = tokenizer.apply_chat_template(
        [
            {"role": "user", "content": prompt},
            {"role": "assistant", "content": "None"},
        ],
        tokenize=False,
        add_generation_prompt=True,
    ).split("None")[0]
    response = llm.generate(
        inputs,
        sampling_params,
    )[0].outputs[0].text
    return response


app = FastAPI()


class PromptRequest(BaseModel):
    prompt: str


@app.post("/generate")
def generate_text(request: PromptRequest):
    response = generate_response(request.prompt)
    return {"response": response}


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)
    # print(generate_response('hi I like u'))
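
# --- Example request (sketch) ---
# A minimal sketch of how a client might call the /generate endpoint once the
# server is running; the host and port match the uvicorn.run() call above, and
# the prompt text is only a placeholder.
#
#   import requests
#
#   resp = requests.post(
#       "http://localhost:8000/generate",
#       json={"prompt": "Explain tensor parallelism in one paragraph."},
#   )
#   print(resp.json()["response"])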