from fastapi import FastAPI
from pydantic import BaseModel
from llama_cpp import Llama
import uvicorn
import prompt_style
import os
from huggingface_hub import hf_hub_download
model_id = "failspy/Meta-Llama-3-8B-Instruct-abliterated-v3-GGUF"
# Download the GGUF weights from the Hub and load them with llama.cpp; generate() below needs `model`.
model_path = hf_hub_download(repo_id=model_id, filename="Meta-Llama-3-8B-Instruct-abliterated-v3_q6.gguf", token=os.environ['HF_TOKEN'])
model = Llama(model_path=model_path, n_gpu_layers=-1, n_ctx=4096, verbose=False)
class Item(BaseModel):
    prompt: str
    history: list
    system_prompt: str
    temperature: float = 0.6
    max_new_tokens: int = 1024
    top_p: float = 0.95
    repetition_penalty: float = 1.0
    seed: int = 42
app = FastAPI()
def format_prompt(item: Item):
    # The system message comes from prompt_style.data; item.system_prompt is not used here.
    messages = [
        {"role": "system", "content": prompt_style.data},
    ]
    for it in item.history:
        messages.append({"role": "user", "content": it[0]})
        messages.append({"role": "assistant", "content": it[1]})
    messages.append({"role": "user", "content": item.prompt})
    return messages
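# For illustration only: with history = [["Hi", "Hello!"]] and prompt = "How are you?",
# format_prompt returns (system content abbreviated here):
#   [{"role": "system", "content": prompt_style.data},
#    {"role": "user", "content": "Hi"},
#    {"role": "assistant", "content": "Hello!"},
#    {"role": "user", "content": "How are you?"}]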
def generate(item: Item):
    formatted_prompt = format_prompt(item)
    # Note: top_p and repetition_penalty are accepted by Item but not forwarded to the model here.
    output = model.create_chat_completion(messages=formatted_prompt, seed=item.seed,
                                          temperature=item.temperature,
                                          max_tokens=item.max_new_tokens)
    out = output['choices'][0]['message']['content']
    return out
@app.post("/generate/")
async def generate_text(item: Item):
    ans = generate(item)
    return {"response": ans}
@app.get("/")
def read_root():
    # Returns the HF_TOKEN environment variable; this exposes the token to anyone who calls the endpoint.
    return {"Hello": os.environ['HF_TOKEN']}