from typing import Optional

from fastapi import FastAPI
from pydantic import BaseModel
from peft import PeftModel
from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig

app = FastAPI()

# Load the base LLaMA-7B weights in 8-bit (requires bitsandbytes) and apply
# the Alpaca-LoRA adapter on top.
tokenizer = LlamaTokenizer.from_pretrained("decapoda-research/llama-7b-hf")
model = LlamaForCausalLM.from_pretrained(
    "decapoda-research/llama-7b-hf",
    load_in_8bit=True,
    device_map="auto",
)
model = PeftModel.from_pretrained(model, "tloen/alpaca-lora-7b")


class InputPrompt(BaseModel):
    instruction: str
    input: Optional[str] = None


class OutputResponse(BaseModel):
    response: str


def generate_prompt(instruction: str, input: Optional[str] = None) -> str:
    """Wrap the request in the Alpaca instruction-following prompt template."""
    if input:
        return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Input:
{input}

### Response:"""
    return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Response:"""


@app.post("/evaluate")
def evaluate(input_prompt: InputPrompt) -> OutputResponse:
    generation_config = GenerationConfig(
        temperature=0.9,
        top_p=0.75,
        num_beams=1,
        do_sample=True,
    )
    prompt = generate_prompt(input_prompt.instruction, input_prompt.input)
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].cuda()
    generation_output = model.generate(
        input_ids=input_ids,
        generation_config=generation_config,
        return_dict_in_generate=True,
        output_scores=True,
        max_new_tokens=256,
    )
    # With num_beams=1 and a single prompt, exactly one sequence is returned.
    output = tokenizer.decode(generation_output.sequences[0])
    # The generated text echoes the prompt; everything after the
    # "### Response:" marker is the model's answer.
    return OutputResponse(response=output.split("### Response:")[1].strip())


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)
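# Example request (a sketch, assuming the server is running locally on the
# port configured above; the JSON fields match the InputPrompt model):
#
#   curl -X POST http://localhost:7860/evaluate \
#        -H "Content-Type: application/json" \
#        -d '{"instruction": "Tell me about alpacas.", "input": null}'
#
# The response is a JSON object with a single "response" field containing
# the generated text.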