kamran-r123 committed
Commit f79168b · verified · 1 Parent(s): 8bc2cfb

Update main.py

Files changed (1):
  main.py  +15 -26
main.py CHANGED
@@ -1,19 +1,20 @@
 from fastapi import FastAPI
 from pydantic import BaseModel
-from huggingface_hub import InferenceClient
+from llama_cpp import Llama
 import uvicorn
 import prompt_style
 import os
 
-
-model_id = "failspy/Meta-Llama-3-8B-Instruct-abliterated-v3"
-client = InferenceClient(token=os.getenv('HF_TOKEN'), model=model_id)
+model_id = "failspy/Meta-Llama-3-8B-Instruct-abliterated-v3-GGUF"
+model = Llama.from_pretrained(repo_id=model_id, filename="*-v3_q6.gguf", n_gpu_layers=-1, n_ctx=4096, verbose=False)
+
+# model_id = "failspy/Meta-Llama-3-8B-Instruct-abliterated-v3"
+# client = InferenceClient(token=os.getenv('HF_TOKEN'), model=model_id)
 
 class Item(BaseModel):
     prompt: str
     history: list
     system_prompt: str
-    token:str
     temperature: float = 0.6
     max_new_tokens: int = 1024
     top_p: float = 0.95

@@ -26,33 +27,21 @@ def format_prompt(item: Item):
     messages = [
         {"role": "system", "content": prompt_style.data},
     ]
-    for it in item.history:
-        messages.append[{"role" : "user", "content": it[0]}]
-        messages.append[{"role" : "assistant", "content": it[1]}]
+    for it in item.history:
+        messages.append({"role" : "user", "content": it[0]})
+        messages.append({"role" : "assistant", "content": it[1]})
+    messages.append({"role" : "user", "content": item.prompt})
     return messages
 
 def generate(item: Item):
-    temperature = float(item.temperature)
-    if temperature < 1e-2:
-        temperature = 1e-2
-    top_p = float(item.top_p)
-
-    generate_kwargs = dict(
-        temperature=temperature,
-        max_new_tokens=item.max_new_tokens,
-        top_p=top_p,
-        repetition_penalty=item.repetition_penalty,
-        do_sample=True,
-        seed=item.seed,
-    )
-
     formatted_prompt = format_prompt(item)
-    stream = client.text_generation(formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False)
-    output = ""
+    output = model.create_chat_completion(messages=formatted_prompt, seed=item.seed,
+                                          temperature=item.temperature,
+                                          max_tokens=item.max_new_tokens)
+
 
-    for response in stream:
-        output += response.token.text
-    return output
+    out = output['choices'][0]['message']['content']
+    return out
 
 @app.post("/generate/")
 async def generate_text(item: Item):
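
A quick way to exercise the new llama_cpp-backed /generate/ route once the Space is running. This is a sketch only, not part of the commit: the base URL, the seed value, and the exact response shape of generate_text are assumptions, since they are not visible in this diff.

# Hypothetical smoke test for the llama_cpp-backed /generate/ endpoint.
# Assumes the FastAPI app from main.py is reachable at http://127.0.0.1:7860
# (host/port are not shown in this diff) and that Item's remaining fields
# (e.g. seed) have defaults or accept the values below.
import requests

payload = {
    "prompt": "Tell me a fun fact about llamas.",
    "history": [],            # list of [user_message, assistant_reply] pairs, per format_prompt()
    "system_prompt": "",      # the server injects prompt_style.data as the system message
    "temperature": 0.6,
    "max_new_tokens": 256,
    "top_p": 0.95,
    "seed": 42,               # generate() forwards item.seed to create_chat_completion
}

resp = requests.post("http://127.0.0.1:7860/generate/", json=payload, timeout=300)
resp.raise_for_status()
print(resp.json())            # exact response shape depends on generate_text(), which is truncated in this diff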