ajsbsd committed on
Commit 2df6b47 · verified · 1 Parent(s): 4e65a82

Update app.py

Files changed (1)
  1. app.py +5 -10
app.py CHANGED
@@ -1,5 +1,5 @@
 import gradio as gr
-import torchdo_s
+import torch
 import os
 import time
 
@@ -26,7 +26,7 @@ MAX_NEW_TOKENS = 256
 TEMPERATURE = 0.7
 TOP_K = 50
 TOP_P = 0.95
-DO_SAMPLE = True
+DO_SAMPLE = True # This parameter is primarily for Hugging Face transformers.Model.generate()
 
 # Global model and tokenizer
 model = None
@@ -102,15 +102,13 @@ def predict_chat(message: str, history: list):
     prompt_input = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
 
     try:
-        # The do_sample parameter should be passed directly, not as part of the prompt string
-        # Also, 'stream=True' is crucial for token-by-token output in Gradio
+        # Removed do_sample as it's not accepted by ctransformers.LLM.__call__()
         for token in model(
             prompt_input,
             max_new_tokens=MAX_NEW_TOKENS,
             temperature=TEMPERATURE,
             top_k=TOP_K,
             top_p=TOP_P,
-            #do_sample=DO_SAMPLE, # Corrected parameter passing
             repetition_penalty=1.1,
             stop=["User:", "\nUser", "\n#", "\n##", "<|endoftext|>"],
             stream=True
@@ -127,7 +125,6 @@ def predict_chat(message: str, history: list):
             temperature=TEMPERATURE,
             top_k=TOP_K,
             top_p=TOP_P,
-            #do_sample=DO_SAMPLE, # Corrected parameter passing
             repetition_penalty=1.1,
             stop=["User:", "\nUser", "\n#", "\n##", "<|endoftext|>"]
         )
@@ -145,15 +142,13 @@ def predict_chat(message: str, history: list):
        # in the same way ctransformers does directly. For true streaming with HF models,
        # you'd often need a custom generation loop or a specific streaming API.
        # For this example, we'll generate the full response and then yield it.
-       # If true token-by-token streaming is critical for the HF model,
-       # you might need to adjust this part or use a different model.
        outputs = model.generate(
            inputs,
            max_length=inputs.shape[-1] + MAX_NEW_TOKENS,
            temperature=TEMPERATURE,
            top_k=TOP_K,
            top_p=TOP_P,
-           #do_sample=DO_SAMPLE, # Uncommented for use
+           do_sample=DO_SAMPLE, # Uncommented for use
            pad_token_id=tokenizer.pad_token_id
        )
        generated_text = tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True).strip()
@@ -199,4 +194,4 @@ if __name__ == "__main__":
 
     demo.chatbot.value = initial_messages_for_value
 
-    demo.launch()
+    demo.launch()
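
For context on the change itself: do_sample is a transformers generate() argument, while the GGUF path in this Space calls a ctransformers model object directly, and that object's __call__() does not accept do_sample; sampling there is driven by temperature/top_k/top_p alone. Below is a minimal sketch of the two call styles, using placeholder model names rather than this Space's actual configuration.

# Sketch only: repo/file names below are placeholders, not taken from this commit.
from ctransformers import AutoModelForCausalLM as CTAutoModel
from transformers import AutoModelForCausalLM, AutoTokenizer

# ctransformers: the loaded LLM is called directly; its __call__ has no do_sample
# flag, so the sampling behaviour comes from temperature/top_k/top_p alone.
gguf_model = CTAutoModel.from_pretrained(
    "some-org/some-model-GGUF",      # placeholder repo
    model_file="model.Q4_K_M.gguf",  # placeholder file
    model_type="llama",
)
for piece in gguf_model(
    "User: Hello\nAssistant:",
    max_new_tokens=256,
    temperature=0.7,
    top_k=50,
    top_p=0.95,
    repetition_penalty=1.1,
    stream=True,                     # yields text pieces one by one
):
    print(piece, end="", flush=True)

# transformers: generate() does take do_sample; without it the sampling knobs
# are ignored and decoding falls back to greedy search.
tokenizer = AutoTokenizer.from_pretrained("some-org/some-model")  # placeholder
hf_model = AutoModelForCausalLM.from_pretrained("some-org/some-model")
inputs = tokenizer("Hello", return_tensors="pt").input_ids
outputs = hf_model.generate(
    inputs,
    max_length=inputs.shape[-1] + 256,
    do_sample=True,                  # the flag this commit uncomments on the HF path
    temperature=0.7,
    top_k=50,
    top_p=0.95,
    pad_token_id=tokenizer.pad_token_id,
)
print(tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True))

That asymmetry is why the commit removes the parameter from the ctransformers call but uncomments do_sample=DO_SAMPLE in the transformers generate() call.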
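The comment kept in the last code hunk notes that true token-by-token streaming with a Hugging Face model needs a custom generation loop or a dedicated streaming API. One common option, shown here only as a hedged sketch with placeholder model names (this Space does not use it), is transformers.TextIteratorStreamer.

# Sketch only: model names are placeholders; the wiring is illustrative, not the Space's code.
from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

tokenizer = AutoTokenizer.from_pretrained("some-org/some-model")  # placeholder
model = AutoModelForCausalLM.from_pretrained("some-org/some-model")

inputs = tokenizer("Hello", return_tensors="pt")
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# generate() blocks, so it runs in a background thread while the main loop drains the streamer.
thread = Thread(
    target=model.generate,
    kwargs=dict(
        **inputs,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
        streamer=streamer,
    ),
)
thread.start()

partial = ""
for new_text in streamer:  # yields decoded text chunks as they are generated
    partial += new_text
    # in a Gradio generator you would `yield partial` here
thread.join()
print(partial)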