from fastapi import FastAPI
from pydantic import BaseModel
from llama_cpp import Llama
import uvicorn
import prompt_style
import os
from huggingface_hub import hf_hub_download


model_id = "failspy/Meta-Llama-3-8B-Instruct-abliterated-v3-GGUF"
# model_path = hf_hub_download(repo_id=model_id, filename="Meta-Llama-3-8B-Instruct-abliterated-v3_q6.gguf", token=os.environ['HF_TOKEN'])
# model = Llama(model_path=model_path, n_gpu_layers=-1, n_ctx=4096, verbose=False)

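# Request schema for /generate/. `history` is expected to be a list of
# [user, assistant] message pairs; the sampling parameters carry defaults
# and can be overridden per request.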
class Item(BaseModel):
    prompt: str
    history: list
    system_prompt: str
    temperature: float = 0.6
    max_new_tokens: int = 1024
    top_p: float = 0.95
    repetition_penalty: float = 1.0
    seed: int = 42
    
app = FastAPI()

def format_prompt(item: Item):
    # Build the chat-completion message list: the server-side system prompt
    # first (prompt_style.data; note the client-supplied item.system_prompt
    # is not used here), then prior [user, assistant] turns, then the new
    # user prompt.
    messages = [
        {"role": "system", "content": prompt_style.data},
    ]
    for user_turn, assistant_turn in item.history:
        messages.append({"role": "user", "content": user_turn})
        messages.append({"role": "assistant", "content": assistant_turn})
    messages.append({"role": "user", "content": item.prompt})
    return messages

def generate(item: Item):
    formatted_prompt = format_prompt(item)
    # Forward all sampling knobs from the request; without top_p and
    # repeat_penalty the corresponding Item fields would be silently ignored.
    output = model.create_chat_completion(messages=formatted_prompt,
                                          seed=item.seed,
                                          temperature=item.temperature,
                                          top_p=item.top_p,
                                          repeat_penalty=item.repetition_penalty,
                                          max_tokens=item.max_new_tokens)
    return output['choices'][0]['message']['content']

@app.post("/generate/")
async def generate_text(item: Item):
    ans = generate(item)
    return {"response": ans}


@app.get("/")
def read_root():
    # Simple health-check route. Do not return os.environ['HF_TOKEN'] here:
    # that would leak the secret to anyone who can reach the server.
    return {"Hello": "World"}