from fastapi import FastAPI
from pydantic import BaseModel
from llama_cpp import Llama
import uvicorn
import prompt_style
import os
from huggingface_hub import hf_hub_download

# Download the GGUF weights from the Hugging Face Hub (requires HF_TOKEN in the
# environment) and load them fully onto the GPU (n_gpu_layers=-1) with a 4096-token context window.
model_id = "failspy/Meta-Llama-3-8B-Instruct-abliterated-v3-GGUF"
model_path = hf_hub_download(
    repo_id=model_id,
    filename="Meta-Llama-3-8B-Instruct-abliterated-v3_q6.gguf",
    token=os.environ['HF_TOKEN'],
)
model = Llama(model_path=model_path, n_gpu_layers=-1, n_ctx=4096, verbose=False)


class Item(BaseModel):
    prompt: str
    history: list
    system_prompt: str
    temperature: float = 0.6
    max_new_tokens: int = 1024
    top_p: float = 0.95
    repetition_penalty: float = 1.0
    seed: int = 42
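
# Example request body for /generate/ (a sketch; the prompt/system_prompt values
# are illustrative, and `history` holds [user, assistant] pairs from earlier turns):
#
# {
#     "prompt": "What is the capital of France?",
#     "history": [["Hi", "Hello! How can I help you?"]],
#     "system_prompt": "You are a helpful assistant.",
#     "temperature": 0.6,
#     "max_new_tokens": 1024,
#     "top_p": 0.95,
#     "repetition_penalty": 1.0,
#     "seed": 42
# }
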
app = FastAPI()


def format_prompt(item: Item):
    # Build the chat messages list: the system prompt comes from the local
    # prompt_style module (item.system_prompt is accepted but not used here),
    # followed by prior [user, assistant] turns and the new user prompt.
    messages = [
        {"role": "system", "content": prompt_style.data},
    ]
    for it in item.history:
        messages.append({"role": "user", "content": it[0]})
        messages.append({"role": "assistant", "content": it[1]})
    messages.append({"role": "user", "content": item.prompt})
    return messages


def generate(item: Item):
    # Forward the request's sampling parameters to llama.cpp's chat completion API.
    formatted_prompt = format_prompt(item)
    output = model.create_chat_completion(
        messages=formatted_prompt, seed=item.seed,
        temperature=item.temperature, top_p=item.top_p,
        repeat_penalty=item.repetition_penalty, max_tokens=item.max_new_tokens,
    )
    return output['choices'][0]['message']['content']


@app.post("/generate/")
async def generate_text(item: Item):
    ans = generate(item)
    return {"response": ans}


@app.get("/")
def read_root():
    # Simple landing/health-check route.
    return {"Hello": "World"}