kamran-r123 committed on
Commit 4091744 · verified · 1 Parent(s): 4cbaa02

Update main.py

Files changed (1)
  1. main.py +11 -29
main.py CHANGED
@@ -3,11 +3,16 @@ from pydantic import BaseModel
 import uvicorn
 import prompt_style
 import os
-from huggingface_hub import InferenceClient
+from huggingface_hub import hf_hub_download
 
 
 model_id = "failspy/Meta-Llama-3-8B-Instruct-abliterated-v3"
-client = InferenceClient(model_id, token=os.environ['HF_TOKEN'])
+filename = "Meta-Llama-3-8B-Instruct-abliterated-v3_q6.gguf"
+# model_path = hf_hub_download(repo_id=model_id, filename="Meta-Llama-3-8B-Instruct-abliterated-v3_q6.gguf", token=os.environ['HF_TOKEN'])
+# model = Llama(model_path=model_path, n_gpu_layers=-1, n_ctx=4096, verbose=False)
+
+model = Llama.from_pretrained(repo_id=model_id, filename=filename, n_gpu_layers=-1,
+                              n_ctx=4096, verbose=False, attn_implementation="flash_attention_2")
 
 class Item(BaseModel):
     prompt: str
@@ -32,35 +37,12 @@ def format_prompt(item: Item):
     return messages
 
 def generate(item: Item):
-    temperature = float(item.temperature)
-    if temperature < 1e-2:
-        temperature = 1e-2
-    top_p = float(item.top_p)
-
-    generate_kwargs = dict(
-        temperature=temperature,
-        max_new_tokens=item.max_new_tokens,
-        top_p=top_p,
-        repetition_penalty=item.repetition_penalty,
-        do_sample=True,
-        seed=item.seed,
-    )
-
     formatted_prompt = format_prompt(item)
-    stream = client.text_generation(formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False)
-    output = ""
-
-    for response in stream:
-        output += response.token.text
-    return output
-
-    # output = model.create_chat_completion(messages=formatted_prompt, seed=item.seed,
-    #                                       temperature=item.temperature,
-    #                                       max_tokens=item.max_new_tokens)
-
+    output = model.create_chat_completion(messages=formatted_prompt, seed=item.seed,
+                                          temperature=item.temperature, max_tokens=item.max_new_tokens)
 
-    # out = output['choices'][0]['message']['content']
-    # return out
+    out = output['choices'][0]['message']['content']
+    return out
 
 @app.post("/generate/")
 async def generate_text(item: Item):
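
For context, this commit swaps the hosted huggingface_hub InferenceClient streaming call for a local llama-cpp-python model loaded from a quantized GGUF file. A minimal standalone sketch of that pattern follows; the repo id and filename come from the diff, while the `from llama_cpp import Llama` import (not visible in these hunks), the example messages, and the sampling values are illustrative assumptions, not part of the repo.

from llama_cpp import Llama  # assumed import; not shown in the visible hunks of main.py

# Download the quantized GGUF weights from the Hub (cached locally) and load them.
model = Llama.from_pretrained(
    repo_id="failspy/Meta-Llama-3-8B-Instruct-abliterated-v3",
    filename="Meta-Llama-3-8B-Instruct-abliterated-v3_q6.gguf",
    n_gpu_layers=-1,  # offload all layers to the GPU when one is available
    n_ctx=4096,
    verbose=False,
)

# Chat-style prompt in the format create_chat_completion expects;
# in the app itself this list is built by format_prompt(item) via prompt_style.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Summarize what a GGUF file is in one sentence."},
]

output = model.create_chat_completion(
    messages=messages,
    temperature=0.7,  # illustrative sampling values
    max_tokens=128,
)
print(output["choices"][0]["message"]["content"])

Because Llama.from_pretrained fetches the file through the Hugging Face cache, subsequent startups reuse the already-downloaded GGUF instead of re-downloading it.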