Update app.py
app.py
CHANGED
@@ -1,5 +1,5 @@
 import gradio as gr
-import
+import torch
 import os
 import time
 
@@ -26,7 +26,7 @@ MAX_NEW_TOKENS = 256
 TEMPERATURE = 0.7
 TOP_K = 50
 TOP_P = 0.95
-DO_SAMPLE = True
+DO_SAMPLE = True  # This parameter is primarily for Hugging Face transformers.Model.generate()
 
 # Global model and tokenizer
 model = None
@@ -102,15 +102,13 @@ def predict_chat(message: str, history: list):
     prompt_input = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
 
     try:
-        #
-        # Also, 'stream=True' is crucial for token-by-token output in Gradio
+        # Removed do_sample as it's not accepted by ctransformers.LLM.__call__()
         for token in model(
             prompt_input,
             max_new_tokens=MAX_NEW_TOKENS,
             temperature=TEMPERATURE,
             top_k=TOP_K,
             top_p=TOP_P,
-            #do_sample=DO_SAMPLE, # Corrected parameter passing
             repetition_penalty=1.1,
             stop=["User:", "\nUser", "\n#", "\n##", "<|endoftext|>"],
             stream=True
@@ -127,7 +125,6 @@ def predict_chat(message: str, history: list):
             temperature=TEMPERATURE,
             top_k=TOP_K,
             top_p=TOP_P,
-            #do_sample=DO_SAMPLE, # Corrected parameter passing
             repetition_penalty=1.1,
             stop=["User:", "\nUser", "\n#", "\n##", "<|endoftext|>"]
         )
@@ -145,15 +142,13 @@ def predict_chat(message: str, history: list):
         # in the same way ctransformers does directly. For true streaming with HF models,
         # you'd often need a custom generation loop or a specific streaming API.
         # For this example, we'll generate the full response and then yield it.
-        # If true token-by-token streaming is critical for the HF model,
-        # you might need to adjust this part or use a different model.
         outputs = model.generate(
             inputs,
             max_length=inputs.shape[-1] + MAX_NEW_TOKENS,
             temperature=TEMPERATURE,
             top_k=TOP_K,
             top_p=TOP_P,
-
+            do_sample=DO_SAMPLE,  # Uncommented for use
            pad_token_id=tokenizer.pad_token_id
         )
         generated_text = tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True).strip()
@@ -199,4 +194,4 @@ if __name__ == "__main__":
 
     demo.chatbot.value = initial_messages_for_value
 
-    demo.launch()
+    demo.launch()
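For context on the ctransformers side of this change: do_sample is a transformers generate() argument, while ctransformers' LLM.__call__() already samples according to temperature/top_k/top_p and streams via stream=True, which is why the flag is dropped from that call. Below is a minimal sketch of the corrected streaming path; the GGUF repository and file names are illustrative assumptions, not taken from this Space.

# Sketch of the corrected ctransformers streaming call (parameter values mirror app.py).
# The repo/file names are placeholders, not the ones this Space actually loads.
from ctransformers import AutoModelForCausalLM

llm = AutoModelForCausalLM.from_pretrained(
    "TheBloke/Llama-2-7B-Chat-GGUF",           # assumption: any GGUF chat model
    model_file="llama-2-7b-chat.Q4_K_M.gguf",  # assumption
    model_type="llama",
)

def stream_reply(prompt_input: str):
    # ctransformers samples internally; there is no do_sample switch to pass.
    for token in llm(
        prompt_input,
        max_new_tokens=256,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
        repetition_penalty=1.1,
        stop=["User:", "\nUser", "\n#", "\n##", "<|endoftext|>"],
        stream=True,
    ):
        yield token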
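The comment in the Hugging Face fallback path notes that transformers does not stream out of model.generate() the way ctransformers does, so this commit generates the full response and yields it once. If token-by-token output is wanted there, one option (an assumption on my part, not something this commit does) is transformers' TextIteratorStreamer driven from a background thread:

# Sketch of token-by-token streaming with transformers via TextIteratorStreamer.
# model, tokenizer, and the generation constants are assumed to be the app's globals.
from threading import Thread
from transformers import TextIteratorStreamer

def stream_reply_hf(prompt_input: str):
    inputs = tokenizer(prompt_input, return_tensors="pt").input_ids.to(model.device)
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        input_ids=inputs,
        max_new_tokens=MAX_NEW_TOKENS,
        do_sample=DO_SAMPLE,        # honored here, unlike in the ctransformers path
        temperature=TEMPERATURE,
        top_k=TOP_K,
        top_p=TOP_P,
        pad_token_id=tokenizer.pad_token_id,
        streamer=streamer,
    )
    # generate() blocks, so it runs in a thread while the streamer yields text pieces.
    Thread(target=model.generate, kwargs=generation_kwargs).start()
    partial = ""
    for new_text in streamer:
        partial += new_text
        yield partial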